aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/.gitignore7
-rw-r--r--Documentation/00-INDEX2
-rw-r--r--Documentation/ABI/obsolete/sysfs-bus-usb31
-rw-r--r--Documentation/ABI/obsolete/sysfs-class-rfkill29
-rw-r--r--Documentation/ABI/stable/sysfs-class-rfkill67
-rw-r--r--Documentation/ABI/testing/sysfs-bus-pci40
-rw-r--r--Documentation/ABI/testing/sysfs-bus-usb28
-rw-r--r--Documentation/ABI/testing/sysfs-class-power20
-rw-r--r--Documentation/ABI/testing/sysfs-devices-memory2
-rw-r--r--Documentation/ABI/testing/sysfs-devices-node7
-rw-r--r--Documentation/ABI/testing/sysfs-devices-platform-_UDC_-gadget9
-rw-r--r--Documentation/ABI/testing/sysfs-driver-hid-picolcd43
-rw-r--r--Documentation/ABI/testing/sysfs-driver-hid-prodikeys29
-rw-r--r--Documentation/ABI/testing/sysfs-driver-hid-roccat-kone111
-rw-r--r--Documentation/ABI/testing/sysfs-firmware-sfi15
-rw-r--r--Documentation/ABI/testing/sysfs-wacom10
-rw-r--r--Documentation/Changes2
-rw-r--r--Documentation/DMA-API-HOWTO.txt81
-rw-r--r--Documentation/DocBook/kgdb.tmpl692
-rw-r--r--Documentation/DocBook/libata.tmpl20
-rw-r--r--Documentation/DocBook/media-entities.tmpl11
-rw-r--r--Documentation/DocBook/mtdnand.tmpl2
-rw-r--r--Documentation/DocBook/sh.tmpl10
-rw-r--r--Documentation/DocBook/v4l/compat.xml126
-rw-r--r--Documentation/DocBook/v4l/controls.xml36
-rw-r--r--Documentation/DocBook/v4l/dev-event.xml31
-rw-r--r--Documentation/DocBook/v4l/io.xml18
-rw-r--r--Documentation/DocBook/v4l/pixfmt.xml12
-rw-r--r--Documentation/DocBook/v4l/v4l2.xml3
-rw-r--r--Documentation/DocBook/v4l/videodev2.h.xml10
-rw-r--r--Documentation/DocBook/v4l/vidioc-dqevent.xml131
-rw-r--r--Documentation/DocBook/v4l/vidioc-enuminput.xml2
-rw-r--r--Documentation/DocBook/v4l/vidioc-qbuf.xml14
-rw-r--r--Documentation/DocBook/v4l/vidioc-queryctrl.xml2
-rw-r--r--Documentation/DocBook/v4l/vidioc-reqbufs.xml2
-rw-r--r--Documentation/DocBook/v4l/vidioc-subscribe-event.xml133
-rw-r--r--Documentation/DocBook/writing-an-alsa-driver.tmpl27
-rw-r--r--Documentation/DocBook/writing_usb_driver.tmpl2
-rw-r--r--Documentation/PCI/pci-error-recovery.txt4
-rw-r--r--Documentation/PCI/pcieaer-howto.txt29
-rw-r--r--Documentation/RCU/stallwarn.txt94
-rw-r--r--Documentation/RCU/torture.txt10
-rw-r--r--Documentation/RCU/trace.txt35
-rw-r--r--Documentation/Smack.txt2
-rw-r--r--Documentation/SubmitChecklist12
-rw-r--r--Documentation/SubmittingDrivers5
-rw-r--r--Documentation/acpi/apei/einj.txt59
-rw-r--r--Documentation/arm/00-INDEX2
-rw-r--r--Documentation/arm/SA1100/ADSBitsy2
-rw-r--r--Documentation/arm/SPEAr/overview.txt60
-rw-r--r--Documentation/arm/Samsung-S3C24XX/GPIO.txt81
-rw-r--r--Documentation/arm/Samsung-S3C24XX/Overview.txt15
-rw-r--r--Documentation/arm/Samsung/GPIO.txt42
-rw-r--r--Documentation/arm/Samsung/Overview.txt33
-rw-r--r--Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen2
-rw-r--r--Documentation/atomic_ops.txt2
-rw-r--r--Documentation/blackfin/bfin-gpio-notes.txt2
-rw-r--r--Documentation/cachetlb.txt6
-rw-r--r--Documentation/cgroups/blkio-controller.txt151
-rw-r--r--Documentation/cgroups/cgroups.txt4
-rw-r--r--Documentation/cgroups/cpusets.txt38
-rw-r--r--Documentation/cgroups/memcg_test.txt2
-rw-r--r--Documentation/cgroups/memory.txt328
-rw-r--r--Documentation/connector/connector.txt2
-rw-r--r--Documentation/credentials.txt14
-rw-r--r--Documentation/development-process/2.Process29
-rw-r--r--Documentation/development-process/7.AdvancedTopics2
-rw-r--r--Documentation/devices.txt2
-rw-r--r--Documentation/dvb/ci.txt2
-rw-r--r--Documentation/dvb/contributors.txt2
-rw-r--r--Documentation/feature-removal-schedule.txt121
-rw-r--r--Documentation/filesystems/Locking9
-rw-r--r--Documentation/filesystems/autofs4-mount-control.txt2
-rw-r--r--Documentation/filesystems/ceph.txt2
-rw-r--r--Documentation/filesystems/dlmfs.txt2
-rw-r--r--Documentation/filesystems/ext3.txt15
-rw-r--r--Documentation/filesystems/fiemap.txt12
-rw-r--r--Documentation/filesystems/fuse.txt4
-rw-r--r--Documentation/filesystems/gfs2.txt12
-rw-r--r--Documentation/filesystems/hpfs.txt2
-rw-r--r--Documentation/filesystems/logfs.txt8
-rw-r--r--Documentation/filesystems/nfs/nfs41-server.txt2
-rw-r--r--Documentation/filesystems/nfs/rpc-cache.txt2
-rw-r--r--Documentation/filesystems/nilfs2.txt4
-rw-r--r--Documentation/filesystems/ocfs2.txt7
-rw-r--r--Documentation/filesystems/proc.txt13
-rw-r--r--Documentation/filesystems/smbfs.txt2
-rw-r--r--Documentation/filesystems/squashfs.txt32
-rw-r--r--Documentation/filesystems/sysfs-tagging.txt42
-rw-r--r--Documentation/filesystems/tmpfs.txt10
-rw-r--r--Documentation/filesystems/vfs.txt11
-rw-r--r--Documentation/filesystems/xfs-delayed-logging-design.txt816
-rw-r--r--Documentation/hwmon/dme173751
-rw-r--r--Documentation/hwmon/lm637
-rw-r--r--Documentation/hwmon/lm852
-rw-r--r--Documentation/hwmon/ltc42454
-rw-r--r--Documentation/hwmon/sysfs-interface13
-rw-r--r--Documentation/hwmon/tmp10226
-rw-r--r--Documentation/i2c/busses/i2c-i8018
-rw-r--r--Documentation/input/joystick.txt2
-rw-r--r--Documentation/intel_txt.txt18
-rw-r--r--Documentation/kbuild/kbuild.txt6
-rw-r--r--Documentation/kbuild/kconfig-language.txt2
-rw-r--r--Documentation/kbuild/kconfig.txt2
-rw-r--r--Documentation/kernel-docs.txt10
-rw-r--r--Documentation/kernel-parameters.txt79
-rw-r--r--Documentation/kprobes.txt12
-rw-r--r--Documentation/kvm/api.txt208
-rw-r--r--Documentation/kvm/cpuid.txt42
-rw-r--r--Documentation/kvm/mmu.txt304
-rw-r--r--Documentation/laptops/laptop-mode.txt2
-rw-r--r--Documentation/laptops/thinkpad-acpi.txt66
-rw-r--r--Documentation/lguest/lguest.c2
-rw-r--r--Documentation/md.txt2
-rw-r--r--Documentation/mutex-design.txt4
-rw-r--r--Documentation/netlabel/lsm_interface.txt2
-rw-r--r--Documentation/networking/caif/Linux-CAIF.txt212
-rw-r--r--Documentation/networking/caif/README109
-rw-r--r--Documentation/networking/ifenslave.c2
-rw-r--r--Documentation/networking/ip-sysctl.txt31
-rw-r--r--Documentation/networking/l2tp.txt247
-rw-r--r--Documentation/networking/packet_mmap.txt4
-rw-r--r--Documentation/networking/x25-iface.txt16
-rw-r--r--Documentation/oops-tracing.txt4
-rw-r--r--Documentation/padata.txt107
-rw-r--r--Documentation/pcmcia/driver-changes.txt13
-rw-r--r--Documentation/power/devices.txt847
-rw-r--r--Documentation/power/pci.txt1258
-rw-r--r--Documentation/power/pm_qos_interface.txt48
-rw-r--r--Documentation/power/regulator/consumer.txt10
-rw-r--r--Documentation/power/regulator/machine.txt2
-rw-r--r--Documentation/power/regulator/overview.txt6
-rw-r--r--Documentation/power/userland-swsusp.txt4
-rw-r--r--Documentation/powerpc/booting-without-of.txt2
-rw-r--r--Documentation/powerpc/dts-bindings/4xx/reboot.txt18
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt22
-rw-r--r--Documentation/powerpc/dts-bindings/xilinx.txt2
-rw-r--r--Documentation/powerpc/phyp-assisted-dump.txt2
-rw-r--r--Documentation/rbtree.txt58
-rw-r--r--Documentation/rfkill.txt44
-rw-r--r--Documentation/rt-mutex-design.txt2
-rw-r--r--Documentation/scheduler/sched-design-CFS.txt54
-rw-r--r--Documentation/scheduler/sched-rt-group.txt20
-rw-r--r--Documentation/scsi/ChangeLog.lpfc4
-rw-r--r--Documentation/scsi/FlashPoint.txt2
-rw-r--r--Documentation/scsi/dtc3x80.txt2
-rw-r--r--Documentation/scsi/ncr53c8xx.txt2
-rw-r--r--Documentation/scsi/osst.txt2
-rw-r--r--Documentation/scsi/scsi_fc_transport.txt4
-rw-r--r--Documentation/scsi/sym53c8xx_2.txt2
-rw-r--r--Documentation/sound/alsa/ALSA-Configuration.txt31
-rw-r--r--Documentation/sound/alsa/HD-Audio.txt4
-rw-r--r--Documentation/sound/alsa/soc/dapm.txt4
-rw-r--r--Documentation/sound/alsa/soc/machine.txt2
-rw-r--r--Documentation/sound/alsa/soc/overview.txt2
-rw-r--r--Documentation/sparse.txt4
-rw-r--r--Documentation/spi/ep93xx_spi95
-rw-r--r--Documentation/spi/spidev_fdx.c4
-rw-r--r--Documentation/sysctl/net.txt10
-rw-r--r--Documentation/sysctl/vm.txt25
-rw-r--r--Documentation/sysfs-rules.txt2
-rw-r--r--Documentation/sysrq.txt14
-rw-r--r--Documentation/timers/Makefile2
-rw-r--r--Documentation/timers/hpet_example.c2
-rw-r--r--Documentation/trace/events.txt11
-rw-r--r--Documentation/trace/ftrace.txt50
-rw-r--r--Documentation/trace/kprobetrace.txt4
-rw-r--r--Documentation/usb/WUSB-Design-overview.txt2
-rw-r--r--Documentation/usb/bulk-streams.txt78
-rw-r--r--Documentation/usb/dma.txt22
-rw-r--r--Documentation/usb/gadget_hid.txt445
-rw-r--r--Documentation/usb/power-management.txt19
-rw-r--r--Documentation/usb/usb-serial.txt29
-rw-r--r--Documentation/video4linux/CARDLIST.bttv2
-rw-r--r--Documentation/video4linux/CARDLIST.cx881
-rw-r--r--Documentation/video4linux/CARDLIST.em28xx4
-rw-r--r--Documentation/video4linux/CARDLIST.saa71343
-rw-r--r--Documentation/video4linux/extract_xc3028.pl817
-rw-r--r--Documentation/video4linux/gspca.txt5
-rw-r--r--Documentation/video4linux/sh_mobile_ceu_camera.txt80
-rw-r--r--Documentation/video4linux/v4l2-framework.txt143
-rw-r--r--Documentation/vm/map_hugetlb.c2
-rw-r--r--Documentation/vm/numa186
-rw-r--r--Documentation/vm/numa_memory_policy.txt4
-rw-r--r--Documentation/w1/w1.generic2
-rw-r--r--Documentation/watchdog/00-INDEX5
-rw-r--r--Documentation/watchdog/watchdog-parameters.txt390
-rw-r--r--Documentation/watchdog/wdt.txt15
188 files changed, 8845 insertions, 1731 deletions
diff --git a/Documentation/.gitignore b/Documentation/.gitignore
new file mode 100644
index 000000000000..bcd907b4141f
--- /dev/null
+++ b/Documentation/.gitignore
@@ -0,0 +1,7 @@
1filesystems/dnotify_test
2laptops/dslm
3timers/hpet_example
4vm/hugepage-mmap
5vm/hugepage-shm
6vm/map_hugetlb
7
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 06b982affe76..dd10b51b4e65 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -250,6 +250,8 @@ numastat.txt
250 - info on how to read Numa policy hit/miss statistics in sysfs. 250 - info on how to read Numa policy hit/miss statistics in sysfs.
251oops-tracing.txt 251oops-tracing.txt
252 - how to decode those nasty internal kernel error dump messages. 252 - how to decode those nasty internal kernel error dump messages.
253padata.txt
254 - An introduction to the "padata" parallel execution API
253parisc/ 255parisc/
254 - directory with info on using Linux on PA-RISC architecture. 256 - directory with info on using Linux on PA-RISC architecture.
255parport.txt 257parport.txt
diff --git a/Documentation/ABI/obsolete/sysfs-bus-usb b/Documentation/ABI/obsolete/sysfs-bus-usb
new file mode 100644
index 000000000000..bd096d33fbc7
--- /dev/null
+++ b/Documentation/ABI/obsolete/sysfs-bus-usb
@@ -0,0 +1,31 @@
1What: /sys/bus/usb/devices/.../power/level
2Date: March 2007
3KernelVersion: 2.6.21
4Contact: Alan Stern <stern@rowland.harvard.edu>
5Description:
6 Each USB device directory will contain a file named
7 power/level. This file holds a power-level setting for
8 the device, either "on" or "auto".
9
10 "on" means that the device is not allowed to autosuspend,
11 although normal suspends for system sleep will still
12 be honored. "auto" means the device will autosuspend
13 and autoresume in the usual manner, according to the
14 capabilities of its driver.
15
16 During normal use, devices should be left in the "auto"
17 level. The "on" level is meant for administrative uses.
18 If you want to suspend a device immediately but leave it
19 free to wake up in response to I/O requests, you should
20 write "0" to power/autosuspend.
21
22 Device not capable of proper suspend and resume should be
23 left in the "on" level. Although the USB spec requires
24 devices to support suspend/resume, many of them do not.
25 In fact so many don't that by default, the USB core
26 initializes all non-hub devices in the "on" level. Some
27 drivers may change this setting when they are bound.
28
29 This file is deprecated and will be removed after 2010.
30 Use the power/control file instead; it does exactly the
31 same thing.
diff --git a/Documentation/ABI/obsolete/sysfs-class-rfkill b/Documentation/ABI/obsolete/sysfs-class-rfkill
new file mode 100644
index 000000000000..4201d5b05515
--- /dev/null
+++ b/Documentation/ABI/obsolete/sysfs-class-rfkill
@@ -0,0 +1,29 @@
1rfkill - radio frequency (RF) connector kill switch support
2
3For details to this subsystem look at Documentation/rfkill.txt.
4
5What: /sys/class/rfkill/rfkill[0-9]+/state
6Date: 09-Jul-2007
7KernelVersion v2.6.22
8Contact: linux-wireless@vger.kernel.org
9Description: Current state of the transmitter.
10 This file is deprecated and sheduled to be removed in 2014,
11 because its not possible to express the 'soft and hard block'
12 state of the rfkill driver.
13Values: A numeric value.
14 0: RFKILL_STATE_SOFT_BLOCKED
15 transmitter is turned off by software
16 1: RFKILL_STATE_UNBLOCKED
17 transmitter is (potentially) active
18 2: RFKILL_STATE_HARD_BLOCKED
19 transmitter is forced off by something outside of
20 the driver's control.
21
22What: /sys/class/rfkill/rfkill[0-9]+/claim
23Date: 09-Jul-2007
24KernelVersion v2.6.22
25Contact: linux-wireless@vger.kernel.org
26Description: This file is deprecated because there no longer is a way to
27 claim just control over a single rfkill instance.
28 This file is scheduled to be removed in 2012.
29Values: 0: Kernel handles events
diff --git a/Documentation/ABI/stable/sysfs-class-rfkill b/Documentation/ABI/stable/sysfs-class-rfkill
new file mode 100644
index 000000000000..097f522c33bb
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-class-rfkill
@@ -0,0 +1,67 @@
1rfkill - radio frequency (RF) connector kill switch support
2
3For details to this subsystem look at Documentation/rfkill.txt.
4
5For the deprecated /sys/class/rfkill/*/state and
6/sys/class/rfkill/*/claim knobs of this interface look in
7Documentation/ABI/obsolete/sysfs-class-rfkill.
8
9What: /sys/class/rfkill
10Date: 09-Jul-2007
11KernelVersion: v2.6.22
12Contact: linux-wireless@vger.kernel.org,
13Description: The rfkill class subsystem folder.
14 Each registered rfkill driver is represented by an rfkillX
15 subfolder (X being an integer > 0).
16
17
18What: /sys/class/rfkill/rfkill[0-9]+/name
19Date: 09-Jul-2007
20KernelVersion v2.6.22
21Contact: linux-wireless@vger.kernel.org
22Description: Name assigned by driver to this key (interface or driver name).
23Values: arbitrary string.
24
25
26What: /sys/class/rfkill/rfkill[0-9]+/type
27Date: 09-Jul-2007
28KernelVersion v2.6.22
29Contact: linux-wireless@vger.kernel.org
30Description: Driver type string ("wlan", "bluetooth", etc).
31Values: See include/linux/rfkill.h.
32
33
34What: /sys/class/rfkill/rfkill[0-9]+/persistent
35Date: 09-Jul-2007
36KernelVersion v2.6.22
37Contact: linux-wireless@vger.kernel.org
38Description: Whether the soft blocked state is initialised from non-volatile
39 storage at startup.
40Values: A numeric value.
41 0: false
42 1: true
43
44
45What: /sys/class/rfkill/rfkill[0-9]+/hard
46Date: 12-March-2010
47KernelVersion v2.6.34
48Contact: linux-wireless@vger.kernel.org
49Description: Current hardblock state. This file is read only.
50Values: A numeric value.
51 0: inactive
52 The transmitter is (potentially) active.
53 1: active
54 The transmitter is forced off by something outside of
55 the driver's control.
56
57
58What: /sys/class/rfkill/rfkill[0-9]+/soft
59Date: 12-March-2010
60KernelVersion v2.6.34
61Contact: linux-wireless@vger.kernel.org
62Description: Current softblock state. This file is read and write.
63Values: A numeric value.
64 0: inactive
65 The transmitter is (potentially) active.
66 1: active
67 The transmitter is turned off by software.
diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 25be3250f7d6..428676cfa61e 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -133,6 +133,46 @@ Description:
133 The symbolic link points to the PCI device sysfs entry of the 133 The symbolic link points to the PCI device sysfs entry of the
134 Physical Function this device associates with. 134 Physical Function this device associates with.
135 135
136
137What: /sys/bus/pci/slots/...
138Date: April 2005 (possibly older)
139KernelVersion: 2.6.12 (possibly older)
140Contact: linux-pci@vger.kernel.org
141Description:
142 When the appropriate driver is loaded, it will create a
143 directory per claimed physical PCI slot in
144 /sys/bus/pci/slots/. The names of these directories are
145 specific to the driver, which in turn, are specific to the
146 platform, but in general, should match the label on the
147 machine's physical chassis.
148
149 The drivers that can create slot directories include the
150 PCI hotplug drivers, and as of 2.6.27, the pci_slot driver.
151
152 The slot directories contain, at a minimum, a file named
153 'address' which contains the PCI bus:device:function tuple.
154 Other files may appear as well, but are specific to the
155 driver.
156
157What: /sys/bus/pci/slots/.../function[0-7]
158Date: March 2010
159KernelVersion: 2.6.35
160Contact: linux-pci@vger.kernel.org
161Description:
162 If PCI slot directories (as described above) are created,
163 and the physical slot is actually populated with a device,
164 symbolic links in the slot directory pointing to the
165 device's PCI functions are created as well.
166
167What: /sys/bus/pci/devices/.../slot
168Date: March 2010
169KernelVersion: 2.6.35
170Contact: linux-pci@vger.kernel.org
171Description:
172 If PCI slot directories (as described above) are created,
173 a symbolic link pointing to the slot directory will be
174 created as well.
175
136What: /sys/bus/pci/slots/.../module 176What: /sys/bus/pci/slots/.../module
137Date: June 2009 177Date: June 2009
138Contact: linux-pci@vger.kernel.org 178Contact: linux-pci@vger.kernel.org
diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb
index bcebb9eaedce..294aa864a60a 100644
--- a/Documentation/ABI/testing/sysfs-bus-usb
+++ b/Documentation/ABI/testing/sysfs-bus-usb
@@ -14,34 +14,6 @@ Description:
14 The autosuspend delay for newly-created devices is set to 14 The autosuspend delay for newly-created devices is set to
15 the value of the usbcore.autosuspend module parameter. 15 the value of the usbcore.autosuspend module parameter.
16 16
17What: /sys/bus/usb/devices/.../power/level
18Date: March 2007
19KernelVersion: 2.6.21
20Contact: Alan Stern <stern@rowland.harvard.edu>
21Description:
22 Each USB device directory will contain a file named
23 power/level. This file holds a power-level setting for
24 the device, either "on" or "auto".
25
26 "on" means that the device is not allowed to autosuspend,
27 although normal suspends for system sleep will still
28 be honored. "auto" means the device will autosuspend
29 and autoresume in the usual manner, according to the
30 capabilities of its driver.
31
32 During normal use, devices should be left in the "auto"
33 level. The "on" level is meant for administrative uses.
34 If you want to suspend a device immediately but leave it
35 free to wake up in response to I/O requests, you should
36 write "0" to power/autosuspend.
37
38 Device not capable of proper suspend and resume should be
39 left in the "on" level. Although the USB spec requires
40 devices to support suspend/resume, many of them do not.
41 In fact so many don't that by default, the USB core
42 initializes all non-hub devices in the "on" level. Some
43 drivers may change this setting when they are bound.
44
45What: /sys/bus/usb/devices/.../power/persist 17What: /sys/bus/usb/devices/.../power/persist
46Date: May 2007 18Date: May 2007
47KernelVersion: 2.6.23 19KernelVersion: 2.6.23
diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power
new file mode 100644
index 000000000000..78c7baca3587
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-power
@@ -0,0 +1,20 @@
1What: /sys/class/power/ds2760-battery.*/charge_now
2Date: May 2010
3KernelVersion: 2.6.35
4Contact: Daniel Mack <daniel@caiaq.de>
5Description:
6 This file is writeable and can be used to set the current
7 coloumb counter value inside the battery monitor chip. This
8 is needed for unavoidable corrections of aging batteries.
9 A userspace daemon can monitor the battery charging logic
10 and once the counter drops out of considerable bounds, take
11 appropriate action.
12
13What: /sys/class/power/ds2760-battery.*/charge_full
14Date: May 2010
15KernelVersion: 2.6.35
16Contact: Daniel Mack <daniel@caiaq.de>
17Description:
18 This file is writeable and can be used to set the assumed
19 battery 'full level'. As batteries age, this value has to be
20 amended over time.
diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory
index bf1627b02a03..aba7d989208c 100644
--- a/Documentation/ABI/testing/sysfs-devices-memory
+++ b/Documentation/ABI/testing/sysfs-devices-memory
@@ -43,7 +43,7 @@ Date: September 2008
43Contact: Badari Pulavarty <pbadari@us.ibm.com> 43Contact: Badari Pulavarty <pbadari@us.ibm.com>
44Description: 44Description:
45 The file /sys/devices/system/memory/memoryX/state 45 The file /sys/devices/system/memory/memoryX/state
46 is read-write. When read, it's contents show the 46 is read-write. When read, its contents show the
47 online/offline state of the memory section. When written, 47 online/offline state of the memory section. When written,
48 root can toggle the the online/offline state of a removable 48 root can toggle the the online/offline state of a removable
49 memory section (see removable file description above) 49 memory section (see removable file description above)
diff --git a/Documentation/ABI/testing/sysfs-devices-node b/Documentation/ABI/testing/sysfs-devices-node
new file mode 100644
index 000000000000..453a210c3ceb
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-node
@@ -0,0 +1,7 @@
1What: /sys/devices/system/node/nodeX/compact
2Date: February 2010
3Contact: Mel Gorman <mel@csn.ul.ie>
4Description:
5 When this file is written to, all memory within that node
6 will be compacted. When it completes, memory will be freed
7 into blocks which have as many contiguous pages as possible
diff --git a/Documentation/ABI/testing/sysfs-devices-platform-_UDC_-gadget b/Documentation/ABI/testing/sysfs-devices-platform-_UDC_-gadget
new file mode 100644
index 000000000000..34034027b13c
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-platform-_UDC_-gadget
@@ -0,0 +1,9 @@
1What: /sys/devices/platform/_UDC_/gadget/suspended
2Date: April 2010
3Contact: Fabien Chouteau <fabien.chouteau@barco.com>
4Description:
5 Show the suspend state of an USB composite gadget.
6 1 -> suspended
7 0 -> resumed
8
9 (_UDC_ is the name of the USB Device Controller driver)
diff --git a/Documentation/ABI/testing/sysfs-driver-hid-picolcd b/Documentation/ABI/testing/sysfs-driver-hid-picolcd
new file mode 100644
index 000000000000..08579e7e1e89
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-hid-picolcd
@@ -0,0 +1,43 @@
1What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/operation_mode
2Date: March 2010
3Contact: Bruno Prémont <bonbons@linux-vserver.org>
4Description: Make it possible to switch the PicoLCD device between LCD
5 (firmware) and bootloader (flasher) operation modes.
6
7 Reading: returns list of available modes, the active mode being
8 enclosed in brackets ('[' and ']')
9
10 Writing: causes operation mode switch. Permitted values are
11 the non-active mode names listed when read.
12
13 Note: when switching mode the current PicoLCD HID device gets
14 disconnected and reconnects after above delay (see attribute
15 operation_mode_delay for its value).
16
17
18What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/operation_mode_delay
19Date: April 2010
20Contact: Bruno Prémont <bonbons@linux-vserver.org>
21Description: Delay PicoLCD waits before restarting in new mode when
22 operation_mode has changed.
23
24 Reading/Writing: It is expressed in ms and permitted range is
25 0..30000ms.
26
27
28What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/fb_update_rate
29Date: March 2010
30Contact: Bruno Prémont <bonbons@linux-vserver.org>
31Description: Make it possible to adjust defio refresh rate.
32
33 Reading: returns list of available refresh rates (expressed in Hz),
34 the active refresh rate being enclosed in brackets ('[' and ']')
35
36 Writing: accepts new refresh rate expressed in integer Hz
37 within permitted rates.
38
39 Note: As device can barely do 2 complete refreshes a second
40 it only makes sense to adjust this value if only one or two
41 tiles get changed and it's not appropriate to expect the application
42 to flush it's tiny changes explicitely at higher than default rate.
43
diff --git a/Documentation/ABI/testing/sysfs-driver-hid-prodikeys b/Documentation/ABI/testing/sysfs-driver-hid-prodikeys
new file mode 100644
index 000000000000..05d988c29a83
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-hid-prodikeys
@@ -0,0 +1,29 @@
1What: /sys/bus/hid/drivers/prodikeys/.../channel
2Date: April 2010
3KernelVersion: 2.6.34
4Contact: Don Prince <dhprince.devel@yahoo.co.uk>
5Description:
6 Allows control (via software) the midi channel to which
7 that the pc-midi keyboard will output.midi data.
8 Range: 0..15
9 Type: Read/write
10What: /sys/bus/hid/drivers/prodikeys/.../sustain
11Date: April 2010
12KernelVersion: 2.6.34
13Contact: Don Prince <dhprince.devel@yahoo.co.uk>
14Description:
15 Allows control (via software) the sustain duration of a
16 note held by the pc-midi driver.
17 0 means sustain mode is disabled.
18 Range: 0..5000 (milliseconds)
19 Type: Read/write
20What: /sys/bus/hid/drivers/prodikeys/.../octave
21Date: April 2010
22KernelVersion: 2.6.34
23Contact: Don Prince <dhprince.devel@yahoo.co.uk>
24Description:
25 Controls the octave shift modifier in the pc-midi driver.
26 The octave can be shifted via software up/down 2 octaves.
27 0 means the no ocatve shift.
28 Range: -2..2 (minus 2 to plus 2)
29 Type: Read/Write
diff --git a/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone b/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone
new file mode 100644
index 000000000000..88340a23ce91
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone
@@ -0,0 +1,111 @@
1What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/actual_dpi
2Date: March 2010
3Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
4Description: It is possible to switch the dpi setting of the mouse with the
5 press of a button.
6 When read, this file returns the raw number of the actual dpi
7 setting reported by the mouse. This number has to be further
8 processed to receive the real dpi value.
9
10 VALUE DPI
11 1 800
12 2 1200
13 3 1600
14 4 2000
15 5 2400
16 6 3200
17
18 This file is readonly.
19
20What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/actual_profile
21Date: March 2010
22Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
23Description: When read, this file returns the number of the actual profile.
24 This file is readonly.
25
26What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/firmware_version
27Date: March 2010
28Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
29Description: When read, this file returns the raw integer version number of the
30 firmware reported by the mouse. Using the integer value eases
31 further usage in other programs. To receive the real version
32 number the decimal point has to be shifted 2 positions to the
33 left. E.g. a returned value of 138 means 1.38
34 This file is readonly.
35
36What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/kone_driver_version
37Date: March 2010
38Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
39Description: When read, this file returns the driver version.
40 The format of the string is "v<major>.<minor>.<patchlevel>".
41 This attribute is used by the userland tools to find the sysfs-
42 paths of installed kone-mice and determine the capabilites of
43 the driver. Versions of this driver for old kernels replace
44 usbhid instead of generic-usb. The way to scan for this file
45 has been chosen to provide a consistent way for all supported
46 kernel versions.
47 This file is readonly.
48
49What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/profile[1-5]
50Date: March 2010
51Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
52Description: The mouse can store 5 profiles which can be switched by the
53 press of a button. A profile holds informations like button
54 mappings, sensitivity, the colors of the 5 leds and light
55 effects.
56 When read, these files return the respective profile. The
57 returned data is 975 bytes in size.
58 When written, this file lets one write the respective profile
59 data back to the mouse. The data has to be 975 bytes long.
60 The mouse will reject invalid data, whereas the profile number
61 stored in the profile doesn't need to fit the number of the
62 store.
63
64What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/settings
65Date: March 2010
66Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
67Description: When read, this file returns the settings stored in the mouse.
68 The size of the data is 36 bytes and holds information like the
69 startup_profile, tcu state and calibration_data.
70 When written, this file lets write settings back to the mouse.
71 The data has to be 36 bytes long. The mouse will reject invalid
72 data.
73
74What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/startup_profile
75Date: March 2010
76Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
77Description: The integer value of this attribute ranges from 1 to 5.
78 When read, this attribute returns the number of the profile
79 that's active when the mouse is powered on.
80 When written, this file sets the number of the startup profile
81 and the mouse activates this profile immediately.
82
83What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/tcu
84Date: March 2010
85Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
86Description: The mouse has a "Tracking Control Unit" which lets the user
87 calibrate the laser power to fit the mousepad surface.
88 When read, this file returns the current state of the TCU,
89 where 0 means off and 1 means on.
90 Writing 0 in this file will switch the TCU off.
91 Writing 1 in this file will start the calibration which takes
92 around 6 seconds to complete and activates the TCU.
93
94What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/weight
95Date: March 2010
96Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
97Description: The mouse can be equipped with one of four supplied weights
98 ranging from 5 to 20 grams which are recognized by the mouse
99 and its value can be read out. When read, this file returns the
100 raw value returned by the mouse which eases further processing
101 in other software.
102 The values map to the weights as follows:
103
104 VALUE WEIGHT
105 0 none
106 1 5g
107 2 10g
108 3 15g
109 4 20g
110
111 This file is readonly.
diff --git a/Documentation/ABI/testing/sysfs-firmware-sfi b/Documentation/ABI/testing/sysfs-firmware-sfi
new file mode 100644
index 000000000000..4be7d44aeacf
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-sfi
@@ -0,0 +1,15 @@
1What: /sys/firmware/sfi/tables/
2Date: May 2010
3Contact: Len Brown <lenb@kernel.org>
4Description:
5 SFI defines a number of small static memory tables
6 so the kernel can get platform information from firmware.
7
8 The tables are defined in the latest SFI specification:
9 http://simplefirmware.org/documentation
10
11 While the tables are used by the kernel, user-space
12 can observe them this way:
13
14 # cd /sys/firmware/sfi/tables
15 # cat $TABLENAME > $TABLENAME.bin
diff --git a/Documentation/ABI/testing/sysfs-wacom b/Documentation/ABI/testing/sysfs-wacom
new file mode 100644
index 000000000000..1517976e25c4
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-wacom
@@ -0,0 +1,10 @@
1What: /sys/class/hidraw/hidraw*/device/speed
2Date: April 2010
3Kernel Version: 2.6.35
4Contact: linux-bluetooth@vger.kernel.org
5Description:
6 The /sys/class/hidraw/hidraw*/device/speed file controls
7 reporting speed of wacom bluetooth tablet. Reading from
8 this file returns 1 if tablet reports in high speed mode
9 or 0 otherwise. Writing to this file one of these values
10 switches reporting speed.
diff --git a/Documentation/Changes b/Documentation/Changes
index f08b313cd235..eca9f6e6fbe6 100644
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -49,7 +49,7 @@ o oprofile 0.9 # oprofiled --version
49o udev 081 # udevinfo -V 49o udev 081 # udevinfo -V
50o grub 0.93 # grub --version 50o grub 0.93 # grub --version
51o mcelog 0.6 51o mcelog 0.6
52o iptables 1.4.1 # iptables -V 52o iptables 1.4.2 # iptables -V
53 53
54 54
55Kernel compilation 55Kernel compilation
diff --git a/Documentation/DMA-API-HOWTO.txt b/Documentation/DMA-API-HOWTO.txt
index 52618ab069ad..98ce51796f71 100644
--- a/Documentation/DMA-API-HOWTO.txt
+++ b/Documentation/DMA-API-HOWTO.txt
@@ -639,6 +639,36 @@ is planned to completely remove virt_to_bus() and bus_to_virt() as
639they are entirely deprecated. Some ports already do not provide these 639they are entirely deprecated. Some ports already do not provide these
640as it is impossible to correctly support them. 640as it is impossible to correctly support them.
641 641
642 Handling Errors
643
644DMA address space is limited on some architectures and an allocation
645failure can be determined by:
646
647- checking if dma_alloc_coherent returns NULL or dma_map_sg returns 0
648
649- checking the returned dma_addr_t of dma_map_single and dma_map_page
650 by using dma_mapping_error():
651
652 dma_addr_t dma_handle;
653
654 dma_handle = dma_map_single(dev, addr, size, direction);
655 if (dma_mapping_error(dev, dma_handle)) {
656 /*
657 * reduce current DMA mapping usage,
658 * delay and try again later or
659 * reset driver.
660 */
661 }
662
663Networking drivers must call dev_kfree_skb to free the socket buffer
664and return NETDEV_TX_OK if the DMA mapping fails on the transmit hook
665(ndo_start_xmit). This means that the socket buffer is just dropped in
666the failure case.
667
668SCSI drivers must return SCSI_MLQUEUE_HOST_BUSY if the DMA mapping
669fails in the queuecommand hook. This means that the SCSI subsystem
670passes the command to the driver again later.
671
642 Optimizing Unmap State Space Consumption 672 Optimizing Unmap State Space Consumption
643 673
644On many platforms, dma_unmap_{single,page}() is simply a nop. 674On many platforms, dma_unmap_{single,page}() is simply a nop.
@@ -703,46 +733,29 @@ to "Closing".
703 733
7041) Struct scatterlist requirements. 7341) Struct scatterlist requirements.
705 735
706 Struct scatterlist must contain, at a minimum, the following 736 Don't invent the architecture specific struct scatterlist; just use
707 members: 737 <asm-generic/scatterlist.h>. You need to enable
738 CONFIG_NEED_SG_DMA_LENGTH if the architecture supports IOMMUs
739 (including software IOMMU).
708 740
709 struct page *page; 7412) ARCH_KMALLOC_MINALIGN
710 unsigned int offset;
711 unsigned int length;
712 742
713 The base address is specified by a "page+offset" pair. 743 Architectures must ensure that kmalloc'ed buffer is
744 DMA-safe. Drivers and subsystems depend on it. If an architecture
745 isn't fully DMA-coherent (i.e. hardware doesn't ensure that data in
746 the CPU cache is identical to data in main memory),
747 ARCH_KMALLOC_MINALIGN must be set so that the memory allocator
748 makes sure that kmalloc'ed buffer doesn't share a cache line with
749 the others. See arch/arm/include/asm/cache.h as an example.
714 750
715 Previous versions of struct scatterlist contained a "void *address" 751 Note that ARCH_KMALLOC_MINALIGN is about DMA memory alignment
716 field that was sometimes used instead of page+offset. As of Linux 752 constraints. You don't need to worry about the architecture data
717 2.5., page+offset is always used, and the "address" field has been 753 alignment constraints (e.g. the alignment constraints about 64-bit
718 deleted. 754 objects).
719
7202) More to come...
721
722 Handling Errors
723
724DMA address space is limited on some architectures and an allocation
725failure can be determined by:
726
727- checking if dma_alloc_coherent returns NULL or dma_map_sg returns 0
728
729- checking the returned dma_addr_t of dma_map_single and dma_map_page
730 by using dma_mapping_error():
731
732 dma_addr_t dma_handle;
733
734 dma_handle = dma_map_single(dev, addr, size, direction);
735 if (dma_mapping_error(dev, dma_handle)) {
736 /*
737 * reduce current DMA mapping usage,
738 * delay and try again later or
739 * reset driver.
740 */
741 }
742 755
743 Closing 756 Closing
744 757
745This document, and the API itself, would not be in it's current 758This document, and the API itself, would not be in its current
746form without the feedback and suggestions from numerous individuals. 759form without the feedback and suggestions from numerous individuals.
747We would like to specifically mention, in no particular order, the 760We would like to specifically mention, in no particular order, the
748following people: 761following people:
diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl
index 5cff41a5fa7c..55f12ac37acd 100644
--- a/Documentation/DocBook/kgdb.tmpl
+++ b/Documentation/DocBook/kgdb.tmpl
@@ -4,7 +4,7 @@
4 4
5<book id="kgdbOnLinux"> 5<book id="kgdbOnLinux">
6 <bookinfo> 6 <bookinfo>
7 <title>Using kgdb and the kgdb Internals</title> 7 <title>Using kgdb, kdb and the kernel debugger internals</title>
8 8
9 <authorgroup> 9 <authorgroup>
10 <author> 10 <author>
@@ -17,33 +17,8 @@
17 </affiliation> 17 </affiliation>
18 </author> 18 </author>
19 </authorgroup> 19 </authorgroup>
20
21 <authorgroup>
22 <author>
23 <firstname>Tom</firstname>
24 <surname>Rini</surname>
25 <affiliation>
26 <address>
27 <email>trini@kernel.crashing.org</email>
28 </address>
29 </affiliation>
30 </author>
31 </authorgroup>
32
33 <authorgroup>
34 <author>
35 <firstname>Amit S.</firstname>
36 <surname>Kale</surname>
37 <affiliation>
38 <address>
39 <email>amitkale@linsyssoft.com</email>
40 </address>
41 </affiliation>
42 </author>
43 </authorgroup>
44
45 <copyright> 20 <copyright>
46 <year>2008</year> 21 <year>2008,2010</year>
47 <holder>Wind River Systems, Inc.</holder> 22 <holder>Wind River Systems, Inc.</holder>
48 </copyright> 23 </copyright>
49 <copyright> 24 <copyright>
@@ -69,41 +44,76 @@
69 <chapter id="Introduction"> 44 <chapter id="Introduction">
70 <title>Introduction</title> 45 <title>Introduction</title>
71 <para> 46 <para>
72 kgdb is a source level debugger for linux kernel. It is used along 47 The kernel has two different debugger front ends (kdb and kgdb)
73 with gdb to debug a linux kernel. The expectation is that gdb can 48 which interface to the debug core. It is possible to use either
74 be used to "break in" to the kernel to inspect memory, variables 49 of the debugger front ends and dynamically transition between them
75 and look through call stack information similar to what an 50 if you configure the kernel properly at compile and runtime.
76 application developer would use gdb for. It is possible to place 51 </para>
77 breakpoints in kernel code and perform some limited execution 52 <para>
78 stepping. 53 Kdb is simplistic shell-style interface which you can use on a
54 system console with a keyboard or serial console. You can use it
55 to inspect memory, registers, process lists, dmesg, and even set
56 breakpoints to stop in a certain location. Kdb is not a source
57 level debugger, although you can set breakpoints and execute some
58 basic kernel run control. Kdb is mainly aimed at doing some
59 analysis to aid in development or diagnosing kernel problems. You
60 can access some symbols by name in kernel built-ins or in kernel
61 modules if the code was built
62 with <symbol>CONFIG_KALLSYMS</symbol>.
63 </para>
64 <para>
65 Kgdb is intended to be used as a source level debugger for the
66 Linux kernel. It is used along with gdb to debug a Linux kernel.
67 The expectation is that gdb can be used to "break in" to the
68 kernel to inspect memory, variables and look through call stack
69 information similar to the way an application developer would use
70 gdb to debug an application. It is possible to place breakpoints
71 in kernel code and perform some limited execution stepping.
79 </para> 72 </para>
80 <para> 73 <para>
81 Two machines are required for using kgdb. One of these machines is a 74 Two machines are required for using kgdb. One of these machines is
82 development machine and the other is a test machine. The kernel 75 a development machine and the other is the target machine. The
83 to be debugged runs on the test machine. The development machine 76 kernel to be debugged runs on the target machine. The development
84 runs an instance of gdb against the vmlinux file which contains 77 machine runs an instance of gdb against the vmlinux file which
85 the symbols (not boot image such as bzImage, zImage, uImage...). 78 contains the symbols (not boot image such as bzImage, zImage,
86 In gdb the developer specifies the connection parameters and 79 uImage...). In gdb the developer specifies the connection
87 connects to kgdb. The type of connection a developer makes with 80 parameters and connects to kgdb. The type of connection a
88 gdb depends on the availability of kgdb I/O modules compiled as 81 developer makes with gdb depends on the availability of kgdb I/O
89 builtin's or kernel modules in the test machine's kernel. 82 modules compiled as built-ins or loadable kernel modules in the test
83 machine's kernel.
90 </para> 84 </para>
91 </chapter> 85 </chapter>
92 <chapter id="CompilingAKernel"> 86 <chapter id="CompilingAKernel">
93 <title>Compiling a kernel</title> 87 <title>Compiling a kernel</title>
88 <para>
89 <itemizedlist>
90 <listitem><para>In order to enable compilation of kdb, you must first enable kgdb.</para></listitem>
91 <listitem><para>The kgdb test compile options are described in the kgdb test suite chapter.</para></listitem>
92 </itemizedlist>
93 </para>
94 <sect1 id="CompileKGDB">
95 <title>Kernel config options for kgdb</title>
94 <para> 96 <para>
95 To enable <symbol>CONFIG_KGDB</symbol> you should first turn on 97 To enable <symbol>CONFIG_KGDB</symbol> you should first turn on
96 "Prompt for development and/or incomplete code/drivers" 98 "Prompt for development and/or incomplete code/drivers"
97 (CONFIG_EXPERIMENTAL) in "General setup", then under the 99 (CONFIG_EXPERIMENTAL) in "General setup", then under the
98 "Kernel debugging" select "KGDB: kernel debugging with remote gdb". 100 "Kernel debugging" select "KGDB: kernel debugger".
101 </para>
102 <para>
103 While it is not a hard requirement that you have symbols in your
104 vmlinux file, gdb tends not to be very useful without the symbolic
105 data, so you will want to turn
106 on <symbol>CONFIG_DEBUG_INFO</symbol> which is called "Compile the
107 kernel with debug info" in the config menu.
99 </para> 108 </para>
100 <para> 109 <para>
101 It is advised, but not required that you turn on the 110 It is advised, but not required that you turn on the
102 CONFIG_FRAME_POINTER kernel option. This option inserts code to 111 <symbol>CONFIG_FRAME_POINTER</symbol> kernel option which is called "Compile the
103 into the compiled executable which saves the frame information in 112 kernel with frame pointers" in the config menu. This option
104 registers or on the stack at different points which will allow a 113 inserts code to into the compiled executable which saves the frame
105 debugger such as gdb to more accurately construct stack back traces 114 information in registers or on the stack at different points which
106 while debugging the kernel. 115 allows a debugger such as gdb to more accurately construct
116 stack back traces while debugging the kernel.
107 </para> 117 </para>
108 <para> 118 <para>
109 If the architecture that you are using supports the kernel option 119 If the architecture that you are using supports the kernel option
@@ -116,38 +126,160 @@
116 this option. 126 this option.
117 </para> 127 </para>
118 <para> 128 <para>
119 Next you should choose one of more I/O drivers to interconnect debugging 129 Next you should choose one of more I/O drivers to interconnect
120 host and debugged target. Early boot debugging requires a KGDB 130 debugging host and debugged target. Early boot debugging requires
121 I/O driver that supports early debugging and the driver must be 131 a KGDB I/O driver that supports early debugging and the driver
122 built into the kernel directly. Kgdb I/O driver configuration 132 must be built into the kernel directly. Kgdb I/O driver
123 takes place via kernel or module parameters, see following 133 configuration takes place via kernel or module parameters which
124 chapter. 134 you can learn more about in the in the section that describes the
135 parameter "kgdboc".
125 </para> 136 </para>
126 <para> 137 <para>Here is an example set of .config symbols to enable or
127 The kgdb test compile options are described in the kgdb test suite chapter. 138 disable for kgdb:
139 <itemizedlist>
140 <listitem><para># CONFIG_DEBUG_RODATA is not set</para></listitem>
141 <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem>
142 <listitem><para>CONFIG_KGDB=y</para></listitem>
143 <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem>
144 </itemizedlist>
128 </para> 145 </para>
129 146 </sect1>
147 <sect1 id="CompileKDB">
148 <title>Kernel config options for kdb</title>
149 <para>Kdb is quite a bit more complex than the simple gdbstub
150 sitting on top of the kernel's debug core. Kdb must implement a
151 shell, and also adds some helper functions in other parts of the
152 kernel, responsible for printing out interesting data such as what
153 you would see if you ran "lsmod", or "ps". In order to build kdb
154 into the kernel you follow the same steps as you would for kgdb.
155 </para>
156 <para>The main config option for kdb
157 is <symbol>CONFIG_KGDB_KDB</symbol> which is called "KGDB_KDB:
158 include kdb frontend for kgdb" in the config menu. In theory you
159 would have already also selected an I/O driver such as the
160 CONFIG_KGDB_SERIAL_CONSOLE interface if you plan on using kdb on a
161 serial port, when you were configuring kgdb.
162 </para>
163 <para>If you want to use a PS/2-style keyboard with kdb, you would
164 select CONFIG_KDB_KEYBOARD which is called "KGDB_KDB: keyboard as
165 input device" in the config menu. The CONFIG_KDB_KEYBOARD option
166 is not used for anything in the gdb interface to kgdb. The
167 CONFIG_KDB_KEYBOARD option only works with kdb.
168 </para>
169 <para>Here is an example set of .config symbols to enable/disable kdb:
170 <itemizedlist>
171 <listitem><para># CONFIG_DEBUG_RODATA is not set</para></listitem>
172 <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem>
173 <listitem><para>CONFIG_KGDB=y</para></listitem>
174 <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem>
175 <listitem><para>CONFIG_KGDB_KDB=y</para></listitem>
176 <listitem><para>CONFIG_KDB_KEYBOARD=y</para></listitem>
177 </itemizedlist>
178 </para>
179 </sect1>
130 </chapter> 180 </chapter>
131 <chapter id="EnableKGDB"> 181 <chapter id="kgdbKernelArgs">
132 <title>Enable kgdb for debugging</title> 182 <title>Kernel Debugger Boot Arguments</title>
133 <para> 183 <para>This section describes the various runtime kernel
134 In order to use kgdb you must activate it by passing configuration 184 parameters that affect the configuration of the kernel debugger.
135 information to one of the kgdb I/O drivers. If you do not pass any 185 The following chapter covers using kdb and kgdb as well as
136 configuration information kgdb will not do anything at all. Kgdb 186 provides some examples of the configuration parameters.</para>
137 will only actively hook up to the kernel trap hooks if a kgdb I/O 187 <sect1 id="kgdboc">
138 driver is loaded and configured. If you unconfigure a kgdb I/O 188 <title>Kernel parameter: kgdboc</title>
139 driver, kgdb will unregister all the kernel hook points. 189 <para>The kgdboc driver was originally an abbreviation meant to
190 stand for "kgdb over console". Today it is the primary mechanism
191 to configure how to communicate from gdb to kgdb as well as the
192 devices you want to use to interact with the kdb shell.
193 </para>
194 <para>For kgdb/gdb, kgdboc is designed to work with a single serial
195 port. It is intended to cover the circumstance where you want to
196 use a serial console as your primary console as well as using it to
197 perform kernel debugging. It is also possible to use kgdb on a
198 serial port which is not designated as a system console. Kgdboc
199 may be configured as a kernel built-in or a kernel loadable module.
200 You can only make use of <constant>kgdbwait</constant> and early
201 debugging if you build kgdboc into the kernel as a built-in.
140 </para> 202 </para>
203 <sect2 id="kgdbocArgs">
204 <title>kgdboc arguments</title>
205 <para>Usage: <constant>kgdboc=[kbd][[,]serial_device][,baud]</constant></para>
206 <sect3 id="kgdbocArgs1">
207 <title>Using loadable module or built-in</title>
141 <para> 208 <para>
142 All drivers can be reconfigured at run time, if 209 <orderedlist>
143 <symbol>CONFIG_SYSFS</symbol> and <symbol>CONFIG_MODULES</symbol> 210 <listitem><para>As a kernel built-in:</para>
144 are enabled, by echo'ing a new config string to 211 <para>Use the kernel boot argument: <constant>kgdboc=&lt;tty-device&gt;,[baud]</constant></para></listitem>
145 <constant>/sys/module/&lt;driver&gt;/parameter/&lt;option&gt;</constant>. 212 <listitem>
146 The driver can be unconfigured by passing an empty string. You cannot 213 <para>As a kernel loadable module:</para>
147 change the configuration while the debugger is attached. Make sure 214 <para>Use the command: <constant>modprobe kgdboc kgdboc=&lt;tty-device&gt;,[baud]</constant></para>
148 to detach the debugger with the <constant>detach</constant> command 215 <para>Here are two examples of how you might formate the kgdboc
149 prior to trying unconfigure a kgdb I/O driver. 216 string. The first is for an x86 target using the first serial port.
217 The second example is for the ARM Versatile AB using the second
218 serial port.
219 <orderedlist>
220 <listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem>
221 <listitem><para><constant>kgdboc=ttyAMA1,115200</constant></para></listitem>
222 </orderedlist>
150 </para> 223 </para>
224 </listitem>
225 </orderedlist></para>
226 </sect3>
227 <sect3 id="kgdbocArgs2">
228 <title>Configure kgdboc at runtime with sysfs</title>
229 <para>At run time you can enable or disable kgdboc by echoing a
230 parameters into the sysfs. Here are two examples:</para>
231 <orderedlist>
232 <listitem><para>Enable kgdboc on ttyS0</para>
233 <para><constant>echo ttyS0 &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
234 <listitem><para>Disable kgdboc</para>
235 <para><constant>echo "" &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
236 </orderedlist>
237 <para>NOTE: You do not need to specify the baud if you are
238 configuring the console on tty which is already configured or
239 open.</para>
240 </sect3>
241 <sect3 id="kgdbocArgs3">
242 <title>More examples</title>
243 <para>You can configure kgdboc to use the keyboard, and or a serial device
244 depending on if you are using kdb and or kgdb, in one of the
245 following scenarios.
246 <orderedlist>
247 <listitem><para>kdb and kgdb over only a serial port</para>
248 <para><constant>kgdboc=&lt;serial_device&gt;[,baud]</constant></para>
249 <para>Example: <constant>kgdboc=ttyS0,115200</constant></para>
250 </listitem>
251 <listitem><para>kdb and kgdb with keyboard and a serial port</para>
252 <para><constant>kgdboc=kbd,&lt;serial_device&gt;[,baud]</constant></para>
253 <para>Example: <constant>kgdboc=kbd,ttyS0,115200</constant></para>
254 </listitem>
255 <listitem><para>kdb with a keyboard</para>
256 <para><constant>kgdboc=kbd</constant></para>
257 </listitem>
258 </orderedlist>
259 </para>
260 </sect3>
261 <para>NOTE: Kgdboc does not support interrupting the target via the
262 gdb remote protocol. You must manually send a sysrq-g unless you
263 have a proxy that splits console output to a terminal program.
264 A console proxy has a separate TCP port for the debugger and a separate
265 TCP port for the "human" console. The proxy can take care of sending
266 the sysrq-g for you.
267 </para>
268 <para>When using kgdboc with no debugger proxy, you can end up
269 connecting the debugger at one of two entry points. If an
270 exception occurs after you have loaded kgdboc, a message should
271 print on the console stating it is waiting for the debugger. In
272 this case you disconnect your terminal program and then connect the
273 debugger in its place. If you want to interrupt the target system
274 and forcibly enter a debug session you have to issue a Sysrq
275 sequence and then type the letter <constant>g</constant>. Then
276 you disconnect the terminal session and connect gdb. Your options
277 if you don't like this are to hack gdb to send the sysrq-g for you
278 as well as on the initial connect, or to use a debugger proxy that
279 allows an unmodified gdb to do the debugging.
280 </para>
281 </sect2>
282 </sect1>
151 <sect1 id="kgdbwait"> 283 <sect1 id="kgdbwait">
152 <title>Kernel parameter: kgdbwait</title> 284 <title>Kernel parameter: kgdbwait</title>
153 <para> 285 <para>
@@ -162,103 +294,204 @@
162 </para> 294 </para>
163 <para> 295 <para>
164 The kernel will stop and wait as early as the I/O driver and 296 The kernel will stop and wait as early as the I/O driver and
165 architecture will allow when you use this option. If you build the 297 architecture allows when you use this option. If you build the
166 kgdb I/O driver as a kernel module kgdbwait will not do anything. 298 kgdb I/O driver as a loadable kernel module kgdbwait will not do
299 anything.
167 </para> 300 </para>
168 </sect1> 301 </sect1>
169 <sect1 id="kgdboc"> 302 <sect1 id="kgdbcon">
170 <title>Kernel parameter: kgdboc</title> 303 <title>Kernel parameter: kgdbcon</title>
171 <para> 304 <para> The kgdbcon feature allows you to see printk() messages
172 The kgdboc driver was originally an abbreviation meant to stand for 305 inside gdb while gdb is connected to the kernel. Kdb does not make
173 "kgdb over console". Kgdboc is designed to work with a single 306 use of the kgdbcon feature.
174 serial port. It was meant to cover the circumstance 307 </para>
175 where you wanted to use a serial console as your primary console as 308 <para>Kgdb supports using the gdb serial protocol to send console
176 well as using it to perform kernel debugging. Of course you can 309 messages to the debugger when the debugger is connected and running.
177 also use kgdboc without assigning a console to the same port. 310 There are two ways to activate this feature.
311 <orderedlist>
312 <listitem><para>Activate with the kernel command line option:</para>
313 <para><constant>kgdbcon</constant></para>
314 </listitem>
315 <listitem><para>Use sysfs before configuring an I/O driver</para>
316 <para>
317 <constant>echo 1 &gt; /sys/module/kgdb/parameters/kgdb_use_con</constant>
318 </para>
319 <para>
320 NOTE: If you do this after you configure the kgdb I/O driver, the
321 setting will not take effect until the next point the I/O is
322 reconfigured.
323 </para>
324 </listitem>
325 </orderedlist>
326 <para>IMPORTANT NOTE: You cannot use kgdboc + kgdbcon on a tty that is an
327 active system console. An example incorrect usage is <constant>console=ttyS0,115200 kgdboc=ttyS0 kgdbcon</constant>
328 </para>
329 <para>It is possible to use this option with kgdboc on a tty that is not a system console.
330 </para>
178 </para> 331 </para>
179 <sect2 id="UsingKgdboc"> 332 </sect1>
180 <title>Using kgdboc</title> 333 </chapter>
181 <para> 334 <chapter id="usingKDB">
182 You can configure kgdboc via sysfs or a module or kernel boot line 335 <title>Using kdb</title>
183 parameter depending on if you build with CONFIG_KGDBOC as a module
184 or built-in.
185 <orderedlist>
186 <listitem><para>From the module load or build-in</para>
187 <para><constant>kgdboc=&lt;tty-device&gt;,[baud]</constant></para>
188 <para> 336 <para>
189 The example here would be if your console port was typically ttyS0, you would use something like <constant>kgdboc=ttyS0,115200</constant> or on the ARM Versatile AB you would likely use <constant>kgdboc=ttyAMA0,115200</constant> 337 </para>
338 <sect1 id="quickKDBserial">
339 <title>Quick start for kdb on a serial port</title>
340 <para>This is a quick example of how to use kdb.</para>
341 <para><orderedlist>
342 <listitem><para>Boot kernel with arguments:
343 <itemizedlist>
344 <listitem><para><constant>console=ttyS0,115200 kgdboc=ttyS0,115200</constant></para></listitem>
345 </itemizedlist></para>
346 <para>OR</para>
347 <para>Configure kgdboc after the kernel booted; assuming you are using a serial port console:
348 <itemizedlist>
349 <listitem><para><constant>echo ttyS0 &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
350 </itemizedlist>
190 </para> 351 </para>
191 </listitem> 352 </listitem>
192 <listitem><para>From sysfs</para> 353 <listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para>
193 <para><constant>echo ttyS0 &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para> 354 <itemizedlist>
355 <listitem><para>When logged in as root or with a super user session you can run:</para>
356 <para><constant>echo g &gt; /proc/sysrq-trigger</constant></para></listitem>
357 <listitem><para>Example using minicom 2.2</para>
358 <para>Press: <constant>Control-a</constant></para>
359 <para>Press: <constant>f</constant></para>
360 <para>Press: <constant>g</constant></para>
194 </listitem> 361 </listitem>
195 </orderedlist> 362 <listitem><para>When you have telneted to a terminal server that supports sending a remote break</para>
196 </para> 363 <para>Press: <constant>Control-]</constant></para>
197 <para> 364 <para>Type in:<constant>send break</constant></para>
198 NOTE: Kgdboc does not support interrupting the target via the 365 <para>Press: <constant>Enter</constant></para>
199 gdb remote protocol. You must manually send a sysrq-g unless you 366 <para>Press: <constant>g</constant></para>
200 have a proxy that splits console output to a terminal problem and 367 </listitem>
201 has a separate port for the debugger to connect to that sends the 368 </itemizedlist>
202 sysrq-g for you. 369 </listitem>
370 <listitem><para>From the kdb prompt you can run the "help" command to see a complete list of the commands that are available.</para>
371 <para>Some useful commands in kdb include:
372 <itemizedlist>
373 <listitem><para>lsmod -- Shows where kernel modules are loaded</para></listitem>
374 <listitem><para>ps -- Displays only the active processes</para></listitem>
375 <listitem><para>ps A -- Shows all the processes</para></listitem>
376 <listitem><para>summary -- Shows kernel version info and memory usage</para></listitem>
377 <listitem><para>bt -- Get a backtrace of the current process using dump_stack()</para></listitem>
378 <listitem><para>dmesg -- View the kernel syslog buffer</para></listitem>
379 <listitem><para>go -- Continue the system</para></listitem>
380 </itemizedlist>
203 </para> 381 </para>
204 <para>When using kgdboc with no debugger proxy, you can end up 382 </listitem>
205 connecting the debugger for one of two entry points. If an 383 <listitem>
206 exception occurs after you have loaded kgdboc a message should print 384 <para>When you are done using kdb you need to consider rebooting the
207 on the console stating it is waiting for the debugger. In case you 385 system or using the "go" command to resuming normal kernel
208 disconnect your terminal program and then connect the debugger in 386 execution. If you have paused the kernel for a lengthy period of
209 its place. If you want to interrupt the target system and forcibly 387 time, applications that rely on timely networking or anything to do
210 enter a debug session you have to issue a Sysrq sequence and then 388 with real wall clock time could be adversely affected, so you
211 type the letter <constant>g</constant>. Then you disconnect the 389 should take this into consideration when using the kernel
212 terminal session and connect gdb. Your options if you don't like 390 debugger.</para>
213 this are to hack gdb to send the sysrq-g for you as well as on the 391 </listitem>
214 initial connect, or to use a debugger proxy that allows an 392 </orderedlist></para>
215 unmodified gdb to do the debugging. 393 </sect1>
394 <sect1 id="quickKDBkeyboard">
395 <title>Quick start for kdb using a keyboard connected console</title>
396 <para>This is a quick example of how to use kdb with a keyboard.</para>
397 <para><orderedlist>
398 <listitem><para>Boot kernel with arguments:
399 <itemizedlist>
400 <listitem><para><constant>kgdboc=kbd</constant></para></listitem>
401 </itemizedlist></para>
402 <para>OR</para>
403 <para>Configure kgdboc after the kernel booted:
404 <itemizedlist>
405 <listitem><para><constant>echo kbd &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
406 </itemizedlist>
216 </para> 407 </para>
217 </sect2> 408 </listitem>
409 <listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para>
410 <itemizedlist>
411 <listitem><para>When logged in as root or with a super user session you can run:</para>
412 <para><constant>echo g &gt; /proc/sysrq-trigger</constant></para></listitem>
413 <listitem><para>Example using a laptop keyboard</para>
414 <para>Press and hold down: <constant>Alt</constant></para>
415 <para>Press and hold down: <constant>Fn</constant></para>
416 <para>Press and release the key with the label: <constant>SysRq</constant></para>
417 <para>Release: <constant>Fn</constant></para>
418 <para>Press and release: <constant>g</constant></para>
419 <para>Release: <constant>Alt</constant></para>
420 </listitem>
421 <listitem><para>Example using a PS/2 101-key keyboard</para>
422 <para>Press and hold down: <constant>Alt</constant></para>
423 <para>Press and release the key with the label: <constant>SysRq</constant></para>
424 <para>Press and release: <constant>g</constant></para>
425 <para>Release: <constant>Alt</constant></para>
426 </listitem>
427 </itemizedlist>
428 </listitem>
429 <listitem>
430 <para>Now type in a kdb command such as "help", "dmesg", "bt" or "go" to continue kernel execution.</para>
431 </listitem>
432 </orderedlist></para>
218 </sect1> 433 </sect1>
219 <sect1 id="kgdbcon"> 434 </chapter>
220 <title>Kernel parameter: kgdbcon</title> 435 <chapter id="EnableKGDB">
221 <para> 436 <title>Using kgdb / gdb</title>
222 Kgdb supports using the gdb serial protocol to send console messages 437 <para>In order to use kgdb you must activate it by passing
223 to the debugger when the debugger is connected and running. There 438 configuration information to one of the kgdb I/O drivers. If you
224 are two ways to activate this feature. 439 do not pass any configuration information kgdb will not do anything
440 at all. Kgdb will only actively hook up to the kernel trap hooks
441 if a kgdb I/O driver is loaded and configured. If you unconfigure
442 a kgdb I/O driver, kgdb will unregister all the kernel hook points.
443 </para>
444 <para> All kgdb I/O drivers can be reconfigured at run time, if
445 <symbol>CONFIG_SYSFS</symbol> and <symbol>CONFIG_MODULES</symbol>
446 are enabled, by echo'ing a new config string to
447 <constant>/sys/module/&lt;driver&gt;/parameter/&lt;option&gt;</constant>.
448 The driver can be unconfigured by passing an empty string. You cannot
449 change the configuration while the debugger is attached. Make sure
450 to detach the debugger with the <constant>detach</constant> command
451 prior to trying to unconfigure a kgdb I/O driver.
452 </para>
453 <sect1 id="ConnectingGDB">
454 <title>Connecting with gdb to a serial port</title>
225 <orderedlist> 455 <orderedlist>
226 <listitem><para>Activate with the kernel command line option:</para> 456 <listitem><para>Configure kgdboc</para>
227 <para><constant>kgdbcon</constant></para> 457 <para>Boot kernel with arguments:
458 <itemizedlist>
459 <listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem>
460 </itemizedlist></para>
461 <para>OR</para>
462 <para>Configure kgdboc after the kernel booted:
463 <itemizedlist>
464 <listitem><para><constant>echo ttyS0 &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
465 </itemizedlist></para>
228 </listitem> 466 </listitem>
229 <listitem><para>Use sysfs before configuring an io driver</para> 467 <listitem>
230 <para> 468 <para>Stop kernel execution (break into the debugger)</para>
231 <constant>echo 1 &gt; /sys/module/kgdb/parameters/kgdb_use_con</constant> 469 <para>In order to connect to gdb via kgdboc, the kernel must
232 </para> 470 first be stopped. There are several ways to stop the kernel which
233 <para> 471 include using kgdbwait as a boot argument, via a sysrq-g, or running
234 NOTE: If you do this after you configure the kgdb I/O driver, the 472 the kernel until it takes an exception where it waits for the
235 setting will not take effect until the next point the I/O is 473 debugger to attach.
236 reconfigured. 474 <itemizedlist>
237 </para> 475 <listitem><para>When logged in as root or with a super user session you can run:</para>
476 <para><constant>echo g &gt; /proc/sysrq-trigger</constant></para></listitem>
477 <listitem><para>Example using minicom 2.2</para>
478 <para>Press: <constant>Control-a</constant></para>
479 <para>Press: <constant>f</constant></para>
480 <para>Press: <constant>g</constant></para>
238 </listitem> 481 </listitem>
239 </orderedlist> 482 <listitem><para>When you have telneted to a terminal server that supports sending a remote break</para>
240 </para> 483 <para>Press: <constant>Control-]</constant></para>
241 <para> 484 <para>Type in:<constant>send break</constant></para>
242 IMPORTANT NOTE: Using this option with kgdb over the console 485 <para>Press: <constant>Enter</constant></para>
243 (kgdboc) is not supported. 486 <para>Press: <constant>g</constant></para>
487 </listitem>
488 </itemizedlist>
244 </para> 489 </para>
245 </sect1> 490 </listitem>
246 </chapter> 491 <listitem>
247 <chapter id="ConnectingGDB"> 492 <para>Connect from from gdb</para>
248 <title>Connecting gdb</title>
249 <para>
250 If you are using kgdboc, you need to have used kgdbwait as a boot
251 argument, issued a sysrq-g, or the system you are going to debug
252 has already taken an exception and is waiting for the debugger to
253 attach before you can connect gdb.
254 </para>
255 <para>
256 If you are not using different kgdb I/O driver other than kgdboc,
257 you should be able to connect and the target will automatically
258 respond.
259 </para>
260 <para> 493 <para>
261 Example (using a serial port): 494 Example (using a directly connected port):
262 </para> 495 </para>
263 <programlisting> 496 <programlisting>
264 % gdb ./vmlinux 497 % gdb ./vmlinux
@@ -266,7 +499,7 @@
266 (gdb) target remote /dev/ttyS0 499 (gdb) target remote /dev/ttyS0
267 </programlisting> 500 </programlisting>
268 <para> 501 <para>
269 Example (kgdb to a terminal server on tcp port 2012): 502 Example (kgdb to a terminal server on TCP port 2012):
270 </para> 503 </para>
271 <programlisting> 504 <programlisting>
272 % gdb ./vmlinux 505 % gdb ./vmlinux
@@ -283,6 +516,83 @@
283 communications. You do this prior to issuing the <constant>target 516 communications. You do this prior to issuing the <constant>target
284 remote</constant> command by typing in: <constant>set debug remote 1</constant> 517 remote</constant> command by typing in: <constant>set debug remote 1</constant>
285 </para> 518 </para>
519 </listitem>
520 </orderedlist>
521 <para>Remember if you continue in gdb, and need to "break in" again,
522 you need to issue an other sysrq-g. It is easy to create a simple
523 entry point by putting a breakpoint at <constant>sys_sync</constant>
524 and then you can run "sync" from a shell or script to break into the
525 debugger.</para>
526 </sect1>
527 </chapter>
528 <chapter id="switchKdbKgdb">
529 <title>kgdb and kdb interoperability</title>
530 <para>It is possible to transition between kdb and kgdb dynamically.
531 The debug core will remember which you used the last time and
532 automatically start in the same mode.</para>
533 <sect1>
534 <title>Switching between kdb and kgdb</title>
535 <sect2>
536 <title>Switching from kgdb to kdb</title>
537 <para>
538 There are two ways to switch from kgdb to kdb: you can use gdb to
539 issue a maintenance packet, or you can blindly type the command $3#33.
540 Whenever kernel debugger stops in kgdb mode it will print the
541 message <constant>KGDB or $3#33 for KDB</constant>. It is important
542 to note that you have to type the sequence correctly in one pass.
543 You cannot type a backspace or delete because kgdb will interpret
544 that as part of the debug stream.
545 <orderedlist>
546 <listitem><para>Change from kgdb to kdb by blindly typing:</para>
547 <para><constant>$3#33</constant></para></listitem>
548 <listitem><para>Change from kgdb to kdb with gdb</para>
549 <para><constant>maintenance packet 3</constant></para>
550 <para>NOTE: Now you must kill gdb. Typically you press control-z and
551 issue the command: kill -9 %</para></listitem>
552 </orderedlist>
553 </para>
554 </sect2>
555 <sect2>
556 <title>Change from kdb to kgdb</title>
557 <para>There are two ways you can change from kdb to kgdb. You can
558 manually enter kgdb mode by issuing the kgdb command from the kdb
559 shell prompt, or you can connect gdb while the kdb shell prompt is
560 active. The kdb shell looks for the typical first commands that gdb
561 would issue with the gdb remote protocol and if it sees one of those
562 commands it automatically changes into kgdb mode.</para>
563 <orderedlist>
564 <listitem><para>From kdb issue the command:</para>
565 <para><constant>kgdb</constant></para>
566 <para>Now disconnect your terminal program and connect gdb in its place</para></listitem>
567 <listitem><para>At the kdb prompt, disconnect the terminal program and connect gdb in its place.</para></listitem>
568 </orderedlist>
569 </sect2>
570 </sect1>
571 <sect1>
572 <title>Running kdb commands from gdb</title>
573 <para>It is possible to run a limited set of kdb commands from gdb,
574 using the gdb monitor command. You don't want to execute any of the
575 run control or breakpoint operations, because it can disrupt the
576 state of the kernel debugger. You should be using gdb for
577 breakpoints and run control operations if you have gdb connected.
578 The more useful commands to run are things like lsmod, dmesg, ps or
579 possibly some of the memory information commands. To see all the kdb
580 commands you can run <constant>monitor help</constant>.</para>
581 <para>Example:
582 <informalexample><programlisting>
583(gdb) monitor ps
5841 idle process (state I) and
58527 sleeping system daemon (state M) processes suppressed,
586use 'ps A' to see all.
587Task Addr Pid Parent [*] cpu State Thread Command
588
5890xc78291d0 1 0 0 0 S 0xc7829404 init
5900xc7954150 942 1 0 0 S 0xc7954384 dropbear
5910xc78789c0 944 1 0 0 S 0xc7878bf4 sh
592(gdb)
593 </programlisting></informalexample>
594 </para>
595 </sect1>
286 </chapter> 596 </chapter>
287 <chapter id="KGDBTestSuite"> 597 <chapter id="KGDBTestSuite">
288 <title>kgdb Test Suite</title> 598 <title>kgdb Test Suite</title>
@@ -309,34 +619,36 @@
309 </para> 619 </para>
310 </chapter> 620 </chapter>
311 <chapter id="CommonBackEndReq"> 621 <chapter id="CommonBackEndReq">
312 <title>KGDB Internals</title> 622 <title>Kernel Debugger Internals</title>
313 <sect1 id="kgdbArchitecture"> 623 <sect1 id="kgdbArchitecture">
314 <title>Architecture Specifics</title> 624 <title>Architecture Specifics</title>
315 <para> 625 <para>
316 Kgdb is organized into three basic components: 626 The kernel debugger is organized into a number of components:
317 <orderedlist> 627 <orderedlist>
318 <listitem><para>kgdb core</para> 628 <listitem><para>The debug core</para>
319 <para> 629 <para>
320 The kgdb core is found in kernel/kgdb.c. It contains: 630 The debug core is found in kernel/debugger/debug_core.c. It contains:
321 <itemizedlist> 631 <itemizedlist>
322 <listitem><para>All the logic to implement the gdb serial protocol</para></listitem> 632 <listitem><para>A generic OS exception handler which includes
323 <listitem><para>A generic OS exception handler which includes sync'ing the processors into a stopped state on an multi cpu system.</para></listitem> 633 sync'ing the processors into a stopped state on an multi-CPU
634 system.</para></listitem>
324 <listitem><para>The API to talk to the kgdb I/O drivers</para></listitem> 635 <listitem><para>The API to talk to the kgdb I/O drivers</para></listitem>
325 <listitem><para>The API to make calls to the arch specific kgdb implementation</para></listitem> 636 <listitem><para>The API to make calls to the arch-specific kgdb implementation</para></listitem>
326 <listitem><para>The logic to perform safe memory reads and writes to memory while using the debugger</para></listitem> 637 <listitem><para>The logic to perform safe memory reads and writes to memory while using the debugger</para></listitem>
327 <listitem><para>A full implementation for software breakpoints unless overridden by the arch</para></listitem> 638 <listitem><para>A full implementation for software breakpoints unless overridden by the arch</para></listitem>
639 <listitem><para>The API to invoke either the kdb or kgdb frontend to the debug core.</para></listitem>
328 </itemizedlist> 640 </itemizedlist>
329 </para> 641 </para>
330 </listitem> 642 </listitem>
331 <listitem><para>kgdb arch specific implementation</para> 643 <listitem><para>kgdb arch-specific implementation</para>
332 <para> 644 <para>
333 This implementation is generally found in arch/*/kernel/kgdb.c. 645 This implementation is generally found in arch/*/kernel/kgdb.c.
334 As an example, arch/x86/kernel/kgdb.c contains the specifics to 646 As an example, arch/x86/kernel/kgdb.c contains the specifics to
335 implement HW breakpoint as well as the initialization to 647 implement HW breakpoint as well as the initialization to
336 dynamically register and unregister for the trap handlers on 648 dynamically register and unregister for the trap handlers on
337 this architecture. The arch specific portion implements: 649 this architecture. The arch-specific portion implements:
338 <itemizedlist> 650 <itemizedlist>
339 <listitem><para>contains an arch specific trap catcher which 651 <listitem><para>contains an arch-specific trap catcher which
340 invokes kgdb_handle_exception() to start kgdb about doing its 652 invokes kgdb_handle_exception() to start kgdb about doing its
341 work</para></listitem> 653 work</para></listitem>
342 <listitem><para>translation to and from gdb specific packet format to pt_regs</para></listitem> 654 <listitem><para>translation to and from gdb specific packet format to pt_regs</para></listitem>
@@ -347,11 +659,35 @@
347 </itemizedlist> 659 </itemizedlist>
348 </para> 660 </para>
349 </listitem> 661 </listitem>
662 <listitem><para>gdbstub frontend (aka kgdb)</para>
663 <para>The gdbstub is located in kernel/debug/gdbstub.c. It contains:</para>
664 <itemizedlist>
665 <listitem><para>All the logic to implement the gdb serial protocol</para></listitem>
666 </itemizedlist>
667 </listitem>
668 <listitem><para>kdb frontend</para>
669 <para>The kdb debugger shell is broken down into a number of
670 components. The kdb core is located in kernel/debug/kdb. There
671 are a number of helper functions in some of the other kernel
672 components to make it possible for kdb to examine and report
673 information about the kernel without taking locks that could
674 cause a kernel deadlock. The kdb core contains implements the following functionality.</para>
675 <itemizedlist>
676 <listitem><para>A simple shell</para></listitem>
677 <listitem><para>The kdb core command set</para></listitem>
678 <listitem><para>A registration API to register additional kdb shell commands.</para>
679 <para>A good example of a self-contained kdb module is the "ftdump" command for dumping the ftrace buffer. See: kernel/trace/trace_kdb.c</para></listitem>
680 <listitem><para>The implementation for kdb_printf() which
681 emits messages directly to I/O drivers, bypassing the kernel
682 log.</para></listitem>
683 <listitem><para>SW / HW breakpoint management for the kdb shell</para></listitem>
684 </itemizedlist>
685 </listitem>
350 <listitem><para>kgdb I/O driver</para> 686 <listitem><para>kgdb I/O driver</para>
351 <para> 687 <para>
352 Each kgdb I/O driver has to provide an implemenation for the following: 688 Each kgdb I/O driver has to provide an implementation for the following:
353 <itemizedlist> 689 <itemizedlist>
354 <listitem><para>configuration via builtin or module</para></listitem> 690 <listitem><para>configuration via built-in or module</para></listitem>
355 <listitem><para>dynamic configuration and kgdb hook registration calls</para></listitem> 691 <listitem><para>dynamic configuration and kgdb hook registration calls</para></listitem>
356 <listitem><para>read and write character interface</para></listitem> 692 <listitem><para>read and write character interface</para></listitem>
357 <listitem><para>A cleanup handler for unconfiguring from the kgdb core</para></listitem> 693 <listitem><para>A cleanup handler for unconfiguring from the kgdb core</para></listitem>
@@ -416,15 +752,15 @@
416 underlying low level to the hardware driver having "polling hooks" 752 underlying low level to the hardware driver having "polling hooks"
417 which the to which the tty driver is attached. In the initial 753 which the to which the tty driver is attached. In the initial
418 implementation of kgdboc it the serial_core was changed to expose a 754 implementation of kgdboc it the serial_core was changed to expose a
419 low level uart hook for doing polled mode reading and writing of a 755 low level UART hook for doing polled mode reading and writing of a
420 single character while in an atomic context. When kgdb makes an I/O 756 single character while in an atomic context. When kgdb makes an I/O
421 request to the debugger, kgdboc invokes a call back in the serial 757 request to the debugger, kgdboc invokes a call back in the serial
422 core which in turn uses the call back in the uart driver. It is 758 core which in turn uses the call back in the UART driver. It is
423 certainly possible to extend kgdboc to work with non-uart based 759 certainly possible to extend kgdboc to work with non-UART based
424 consoles in the future. 760 consoles in the future.
425 </para> 761 </para>
426 <para> 762 <para>
427 When using kgdboc with a uart, the uart driver must implement two callbacks in the <constant>struct uart_ops</constant>. Example from drivers/8250.c:<programlisting> 763 When using kgdboc with a UART, the UART driver must implement two callbacks in the <constant>struct uart_ops</constant>. Example from drivers/8250.c:<programlisting>
428#ifdef CONFIG_CONSOLE_POLL 764#ifdef CONFIG_CONSOLE_POLL
429 .poll_get_char = serial8250_get_poll_char, 765 .poll_get_char = serial8250_get_poll_char,
430 .poll_put_char = serial8250_put_poll_char, 766 .poll_put_char = serial8250_put_poll_char,
@@ -434,7 +770,7 @@
434 <constant>#ifdef CONFIG_CONSOLE_POLL</constant>, as shown above. 770 <constant>#ifdef CONFIG_CONSOLE_POLL</constant>, as shown above.
435 Keep in mind that polling hooks have to be implemented in such a way 771 Keep in mind that polling hooks have to be implemented in such a way
436 that they can be called from an atomic context and have to restore 772 that they can be called from an atomic context and have to restore
437 the state of the uart chip on return such that the system can return 773 the state of the UART chip on return such that the system can return
438 to normal when the debugger detaches. You need to be very careful 774 to normal when the debugger detaches. You need to be very careful
439 with any kind of lock you consider, because failing here is most 775 with any kind of lock you consider, because failing here is most
440 going to mean pressing the reset button. 776 going to mean pressing the reset button.
@@ -453,6 +789,10 @@
453 <itemizedlist> 789 <itemizedlist>
454 <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem> 790 <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem>
455 </itemizedlist> 791 </itemizedlist>
792 In Jan 2010 this document was updated to include kdb.
793 <itemizedlist>
794 <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem>
795 </itemizedlist>
456 </para> 796 </para>
457 </chapter> 797 </chapter>
458</book> 798</book>
diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl
index ff3e5bec1c24..8c5411cfeaf0 100644
--- a/Documentation/DocBook/libata.tmpl
+++ b/Documentation/DocBook/libata.tmpl
@@ -81,16 +81,14 @@ void (*port_disable) (struct ata_port *);
81 </programlisting> 81 </programlisting>
82 82
83 <para> 83 <para>
84 Called from ata_bus_probe() and ata_bus_reset() error paths, 84 Called from ata_bus_probe() error path, as well as when
85 as well as when unregistering from the SCSI module (rmmod, hot 85 unregistering from the SCSI module (rmmod, hot unplug).
86 unplug).
87 This function should do whatever needs to be done to take the 86 This function should do whatever needs to be done to take the
88 port out of use. In most cases, ata_port_disable() can be used 87 port out of use. In most cases, ata_port_disable() can be used
89 as this hook. 88 as this hook.
90 </para> 89 </para>
91 <para> 90 <para>
92 Called from ata_bus_probe() on a failed probe. 91 Called from ata_bus_probe() on a failed probe.
93 Called from ata_bus_reset() on a failed bus reset.
94 Called from ata_scsi_release(). 92 Called from ata_scsi_release().
95 </para> 93 </para>
96 94
@@ -227,6 +225,18 @@ u8 (*sff_check_altstatus)(struct ata_port *ap);
227 225
228 </sect2> 226 </sect2>
229 227
228 <sect2><title>Write specific ATA shadow register</title>
229 <programlisting>
230void (*sff_set_devctl)(struct ata_port *ap, u8 ctl);
231 </programlisting>
232
233 <para>
234 Write the device control ATA shadow register to the hardware.
235 Most drivers don't need to define this.
236 </para>
237
238 </sect2>
239
230 <sect2><title>Select ATA device on bus</title> 240 <sect2><title>Select ATA device on bus</title>
231 <programlisting> 241 <programlisting>
232void (*sff_dev_select)(struct ata_port *ap, unsigned int device); 242void (*sff_dev_select)(struct ata_port *ap, unsigned int device);
@@ -477,7 +487,7 @@ void (*host_stop) (struct ata_host_set *host_set);
477 allocates space for a legacy IDE PRD table and returns. 487 allocates space for a legacy IDE PRD table and returns.
478 </para> 488 </para>
479 <para> 489 <para>
480 ->port_stop() is called after ->host_stop(). It's sole function 490 ->port_stop() is called after ->host_stop(). Its sole function
481 is to release DMA/memory resources, now that they are no longer 491 is to release DMA/memory resources, now that they are no longer
482 actively being used. Many drivers also free driver-private 492 actively being used. Many drivers also free driver-private
483 data from port at this time. 493 data from port at this time.
diff --git a/Documentation/DocBook/media-entities.tmpl b/Documentation/DocBook/media-entities.tmpl
index c725cb852c54..5d4d40f429a5 100644
--- a/Documentation/DocBook/media-entities.tmpl
+++ b/Documentation/DocBook/media-entities.tmpl
@@ -17,6 +17,7 @@
17<!ENTITY VIDIOC-DBG-G-REGISTER "<link linkend='vidioc-dbg-g-register'><constant>VIDIOC_DBG_G_REGISTER</constant></link>"> 17<!ENTITY VIDIOC-DBG-G-REGISTER "<link linkend='vidioc-dbg-g-register'><constant>VIDIOC_DBG_G_REGISTER</constant></link>">
18<!ENTITY VIDIOC-DBG-S-REGISTER "<link linkend='vidioc-dbg-g-register'><constant>VIDIOC_DBG_S_REGISTER</constant></link>"> 18<!ENTITY VIDIOC-DBG-S-REGISTER "<link linkend='vidioc-dbg-g-register'><constant>VIDIOC_DBG_S_REGISTER</constant></link>">
19<!ENTITY VIDIOC-DQBUF "<link linkend='vidioc-qbuf'><constant>VIDIOC_DQBUF</constant></link>"> 19<!ENTITY VIDIOC-DQBUF "<link linkend='vidioc-qbuf'><constant>VIDIOC_DQBUF</constant></link>">
20<!ENTITY VIDIOC-DQEVENT "<link linkend='vidioc-dqevent'><constant>VIDIOC_DQEVENT</constant></link>">
20<!ENTITY VIDIOC-ENCODER-CMD "<link linkend='vidioc-encoder-cmd'><constant>VIDIOC_ENCODER_CMD</constant></link>"> 21<!ENTITY VIDIOC-ENCODER-CMD "<link linkend='vidioc-encoder-cmd'><constant>VIDIOC_ENCODER_CMD</constant></link>">
21<!ENTITY VIDIOC-ENUMAUDIO "<link linkend='vidioc-enumaudio'><constant>VIDIOC_ENUMAUDIO</constant></link>"> 22<!ENTITY VIDIOC-ENUMAUDIO "<link linkend='vidioc-enumaudio'><constant>VIDIOC_ENUMAUDIO</constant></link>">
22<!ENTITY VIDIOC-ENUMAUDOUT "<link linkend='vidioc-enumaudioout'><constant>VIDIOC_ENUMAUDOUT</constant></link>"> 23<!ENTITY VIDIOC-ENUMAUDOUT "<link linkend='vidioc-enumaudioout'><constant>VIDIOC_ENUMAUDOUT</constant></link>">
@@ -60,6 +61,7 @@
60<!ENTITY VIDIOC-REQBUFS "<link linkend='vidioc-reqbufs'><constant>VIDIOC_REQBUFS</constant></link>"> 61<!ENTITY VIDIOC-REQBUFS "<link linkend='vidioc-reqbufs'><constant>VIDIOC_REQBUFS</constant></link>">
61<!ENTITY VIDIOC-STREAMOFF "<link linkend='vidioc-streamon'><constant>VIDIOC_STREAMOFF</constant></link>"> 62<!ENTITY VIDIOC-STREAMOFF "<link linkend='vidioc-streamon'><constant>VIDIOC_STREAMOFF</constant></link>">
62<!ENTITY VIDIOC-STREAMON "<link linkend='vidioc-streamon'><constant>VIDIOC_STREAMON</constant></link>"> 63<!ENTITY VIDIOC-STREAMON "<link linkend='vidioc-streamon'><constant>VIDIOC_STREAMON</constant></link>">
64<!ENTITY VIDIOC-SUBSCRIBE-EVENT "<link linkend='vidioc-subscribe-event'><constant>VIDIOC_SUBSCRIBE_EVENT</constant></link>">
63<!ENTITY VIDIOC-S-AUDIO "<link linkend='vidioc-g-audio'><constant>VIDIOC_S_AUDIO</constant></link>"> 65<!ENTITY VIDIOC-S-AUDIO "<link linkend='vidioc-g-audio'><constant>VIDIOC_S_AUDIO</constant></link>">
64<!ENTITY VIDIOC-S-AUDOUT "<link linkend='vidioc-g-audioout'><constant>VIDIOC_S_AUDOUT</constant></link>"> 66<!ENTITY VIDIOC-S-AUDOUT "<link linkend='vidioc-g-audioout'><constant>VIDIOC_S_AUDOUT</constant></link>">
65<!ENTITY VIDIOC-S-CROP "<link linkend='vidioc-g-crop'><constant>VIDIOC_S_CROP</constant></link>"> 67<!ENTITY VIDIOC-S-CROP "<link linkend='vidioc-g-crop'><constant>VIDIOC_S_CROP</constant></link>">
@@ -83,6 +85,7 @@
83<!ENTITY VIDIOC-TRY-ENCODER-CMD "<link linkend='vidioc-encoder-cmd'><constant>VIDIOC_TRY_ENCODER_CMD</constant></link>"> 85<!ENTITY VIDIOC-TRY-ENCODER-CMD "<link linkend='vidioc-encoder-cmd'><constant>VIDIOC_TRY_ENCODER_CMD</constant></link>">
84<!ENTITY VIDIOC-TRY-EXT-CTRLS "<link linkend='vidioc-g-ext-ctrls'><constant>VIDIOC_TRY_EXT_CTRLS</constant></link>"> 86<!ENTITY VIDIOC-TRY-EXT-CTRLS "<link linkend='vidioc-g-ext-ctrls'><constant>VIDIOC_TRY_EXT_CTRLS</constant></link>">
85<!ENTITY VIDIOC-TRY-FMT "<link linkend='vidioc-g-fmt'><constant>VIDIOC_TRY_FMT</constant></link>"> 87<!ENTITY VIDIOC-TRY-FMT "<link linkend='vidioc-g-fmt'><constant>VIDIOC_TRY_FMT</constant></link>">
88<!ENTITY VIDIOC-UNSUBSCRIBE-EVENT "<link linkend='vidioc-subscribe-event'><constant>VIDIOC_UNSUBSCRIBE_EVENT</constant></link>">
86 89
87<!-- Types --> 90<!-- Types -->
88<!ENTITY v4l2-std-id "<link linkend='v4l2-std-id'>v4l2_std_id</link>"> 91<!ENTITY v4l2-std-id "<link linkend='v4l2-std-id'>v4l2_std_id</link>">
@@ -141,6 +144,9 @@
141<!ENTITY v4l2-enc-idx "struct&nbsp;<link linkend='v4l2-enc-idx'>v4l2_enc_idx</link>"> 144<!ENTITY v4l2-enc-idx "struct&nbsp;<link linkend='v4l2-enc-idx'>v4l2_enc_idx</link>">
142<!ENTITY v4l2-enc-idx-entry "struct&nbsp;<link linkend='v4l2-enc-idx-entry'>v4l2_enc_idx_entry</link>"> 145<!ENTITY v4l2-enc-idx-entry "struct&nbsp;<link linkend='v4l2-enc-idx-entry'>v4l2_enc_idx_entry</link>">
143<!ENTITY v4l2-encoder-cmd "struct&nbsp;<link linkend='v4l2-encoder-cmd'>v4l2_encoder_cmd</link>"> 146<!ENTITY v4l2-encoder-cmd "struct&nbsp;<link linkend='v4l2-encoder-cmd'>v4l2_encoder_cmd</link>">
147<!ENTITY v4l2-event "struct&nbsp;<link linkend='v4l2-event'>v4l2_event</link>">
148<!ENTITY v4l2-event-subscription "struct&nbsp;<link linkend='v4l2-event-subscription'>v4l2_event_subscription</link>">
149<!ENTITY v4l2-event-vsync "struct&nbsp;<link linkend='v4l2-event-vsync'>v4l2_event_vsync</link>">
144<!ENTITY v4l2-ext-control "struct&nbsp;<link linkend='v4l2-ext-control'>v4l2_ext_control</link>"> 150<!ENTITY v4l2-ext-control "struct&nbsp;<link linkend='v4l2-ext-control'>v4l2_ext_control</link>">
145<!ENTITY v4l2-ext-controls "struct&nbsp;<link linkend='v4l2-ext-controls'>v4l2_ext_controls</link>"> 151<!ENTITY v4l2-ext-controls "struct&nbsp;<link linkend='v4l2-ext-controls'>v4l2_ext_controls</link>">
146<!ENTITY v4l2-fmtdesc "struct&nbsp;<link linkend='v4l2-fmtdesc'>v4l2_fmtdesc</link>"> 152<!ENTITY v4l2-fmtdesc "struct&nbsp;<link linkend='v4l2-fmtdesc'>v4l2_fmtdesc</link>">
@@ -200,6 +206,7 @@
200<!ENTITY sub-controls SYSTEM "v4l/controls.xml"> 206<!ENTITY sub-controls SYSTEM "v4l/controls.xml">
201<!ENTITY sub-dev-capture SYSTEM "v4l/dev-capture.xml"> 207<!ENTITY sub-dev-capture SYSTEM "v4l/dev-capture.xml">
202<!ENTITY sub-dev-codec SYSTEM "v4l/dev-codec.xml"> 208<!ENTITY sub-dev-codec SYSTEM "v4l/dev-codec.xml">
209<!ENTITY sub-dev-event SYSTEM "v4l/dev-event.xml">
203<!ENTITY sub-dev-effect SYSTEM "v4l/dev-effect.xml"> 210<!ENTITY sub-dev-effect SYSTEM "v4l/dev-effect.xml">
204<!ENTITY sub-dev-osd SYSTEM "v4l/dev-osd.xml"> 211<!ENTITY sub-dev-osd SYSTEM "v4l/dev-osd.xml">
205<!ENTITY sub-dev-output SYSTEM "v4l/dev-output.xml"> 212<!ENTITY sub-dev-output SYSTEM "v4l/dev-output.xml">
@@ -292,6 +299,8 @@
292<!ENTITY sub-v4l2grab-c SYSTEM "v4l/v4l2grab.c.xml"> 299<!ENTITY sub-v4l2grab-c SYSTEM "v4l/v4l2grab.c.xml">
293<!ENTITY sub-videodev2-h SYSTEM "v4l/videodev2.h.xml"> 300<!ENTITY sub-videodev2-h SYSTEM "v4l/videodev2.h.xml">
294<!ENTITY sub-v4l2 SYSTEM "v4l/v4l2.xml"> 301<!ENTITY sub-v4l2 SYSTEM "v4l/v4l2.xml">
302<!ENTITY sub-dqevent SYSTEM "v4l/vidioc-dqevent.xml">
303<!ENTITY sub-subscribe-event SYSTEM "v4l/vidioc-subscribe-event.xml">
295<!ENTITY sub-intro SYSTEM "dvb/intro.xml"> 304<!ENTITY sub-intro SYSTEM "dvb/intro.xml">
296<!ENTITY sub-frontend SYSTEM "dvb/frontend.xml"> 305<!ENTITY sub-frontend SYSTEM "dvb/frontend.xml">
297<!ENTITY sub-dvbproperty SYSTEM "dvb/dvbproperty.xml"> 306<!ENTITY sub-dvbproperty SYSTEM "dvb/dvbproperty.xml">
@@ -381,3 +390,5 @@
381<!ENTITY reqbufs SYSTEM "v4l/vidioc-reqbufs.xml"> 390<!ENTITY reqbufs SYSTEM "v4l/vidioc-reqbufs.xml">
382<!ENTITY s-hw-freq-seek SYSTEM "v4l/vidioc-s-hw-freq-seek.xml"> 391<!ENTITY s-hw-freq-seek SYSTEM "v4l/vidioc-s-hw-freq-seek.xml">
383<!ENTITY streamon SYSTEM "v4l/vidioc-streamon.xml"> 392<!ENTITY streamon SYSTEM "v4l/vidioc-streamon.xml">
393<!ENTITY dqevent SYSTEM "v4l/vidioc-dqevent.xml">
394<!ENTITY subscribe_event SYSTEM "v4l/vidioc-subscribe-event.xml">
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
index 133cd6c3f3c1..020ac80d4682 100644
--- a/Documentation/DocBook/mtdnand.tmpl
+++ b/Documentation/DocBook/mtdnand.tmpl
@@ -269,7 +269,7 @@ static void board_hwcontrol(struct mtd_info *mtd, int cmd)
269 information about the device. 269 information about the device.
270 </para> 270 </para>
271 <programlisting> 271 <programlisting>
272int __init board_init (void) 272static int __init board_init (void)
273{ 273{
274 struct nand_chip *this; 274 struct nand_chip *this;
275 int err = 0; 275 int err = 0;
diff --git a/Documentation/DocBook/sh.tmpl b/Documentation/DocBook/sh.tmpl
index 0c3dc4c69dd1..d858d92cf6d9 100644
--- a/Documentation/DocBook/sh.tmpl
+++ b/Documentation/DocBook/sh.tmpl
@@ -19,13 +19,17 @@
19 </authorgroup> 19 </authorgroup>
20 20
21 <copyright> 21 <copyright>
22 <year>2008</year> 22 <year>2008-2010</year>
23 <holder>Paul Mundt</holder> 23 <holder>Paul Mundt</holder>
24 </copyright> 24 </copyright>
25 <copyright> 25 <copyright>
26 <year>2008</year> 26 <year>2008-2010</year>
27 <holder>Renesas Technology Corp.</holder> 27 <holder>Renesas Technology Corp.</holder>
28 </copyright> 28 </copyright>
29 <copyright>
30 <year>2010</year>
31 <holder>Renesas Electronics Corp.</holder>
32 </copyright>
29 33
30 <legalnotice> 34 <legalnotice>
31 <para> 35 <para>
@@ -77,7 +81,7 @@
77 </chapter> 81 </chapter>
78 <chapter id="clk"> 82 <chapter id="clk">
79 <title>Clock Framework Extensions</title> 83 <title>Clock Framework Extensions</title>
80!Iarch/sh/include/asm/clock.h 84!Iinclude/linux/sh_clk.h
81 </chapter> 85 </chapter>
82 <chapter id="mach"> 86 <chapter id="mach">
83 <title>Machine Specific Interfaces</title> 87 <title>Machine Specific Interfaces</title>
diff --git a/Documentation/DocBook/v4l/compat.xml b/Documentation/DocBook/v4l/compat.xml
index b9dbdf9e6d29..b42b935913cd 100644
--- a/Documentation/DocBook/v4l/compat.xml
+++ b/Documentation/DocBook/v4l/compat.xml
@@ -2332,15 +2332,26 @@ more information.</para>
2332 </listitem> 2332 </listitem>
2333 </orderedlist> 2333 </orderedlist>
2334 </section> 2334 </section>
2335 </section> 2335 <section>
2336 <title>V4L2 in Linux 2.6.34</title>
2337 <orderedlist>
2338 <listitem>
2339 <para>Added
2340<constant>V4L2_CID_IRIS_ABSOLUTE</constant> and
2341<constant>V4L2_CID_IRIS_RELATIVE</constant> controls to the
2342 <link linkend="camera-controls">Camera controls class</link>.
2343 </para>
2344 </listitem>
2345 </orderedlist>
2346 </section>
2336 2347
2337 <section id="other"> 2348 <section id="other">
2338 <title>Relation of V4L2 to other Linux multimedia APIs</title> 2349 <title>Relation of V4L2 to other Linux multimedia APIs</title>
2339 2350
2340 <section id="xvideo"> 2351 <section id="xvideo">
2341 <title>X Video Extension</title> 2352 <title>X Video Extension</title>
2342 2353
2343 <para>The X Video Extension (abbreviated XVideo or just Xv) is 2354 <para>The X Video Extension (abbreviated XVideo or just Xv) is
2344an extension of the X Window system, implemented for example by the 2355an extension of the X Window system, implemented for example by the
2345XFree86 project. Its scope is similar to V4L2, an API to video capture 2356XFree86 project. Its scope is similar to V4L2, an API to video capture
2346and output devices for X clients. Xv allows applications to display 2357and output devices for X clients. Xv allows applications to display
@@ -2351,7 +2362,7 @@ capture or output still images in XPixmaps<footnote>
2351extension available across many operating systems and 2362extension available across many operating systems and
2352architectures.</para> 2363architectures.</para>
2353 2364
2354 <para>Because the driver is embedded into the X server Xv has a 2365 <para>Because the driver is embedded into the X server Xv has a
2355number of advantages over the V4L2 <link linkend="overlay">video 2366number of advantages over the V4L2 <link linkend="overlay">video
2356overlay interface</link>. The driver can easily determine the overlay 2367overlay interface</link>. The driver can easily determine the overlay
2357target, &ie; visible graphics memory or off-screen buffers for a 2368target, &ie; visible graphics memory or off-screen buffers for a
@@ -2360,16 +2371,16 @@ overlay, scaling or color-keying, or the clipping functions of the
2360video capture hardware, always in sync with drawing operations or 2371video capture hardware, always in sync with drawing operations or
2361windows moving or changing their stacking order.</para> 2372windows moving or changing their stacking order.</para>
2362 2373
2363 <para>To combine the advantages of Xv and V4L a special Xv 2374 <para>To combine the advantages of Xv and V4L a special Xv
2364driver exists in XFree86 and XOrg, just programming any overlay capable 2375driver exists in XFree86 and XOrg, just programming any overlay capable
2365Video4Linux device it finds. To enable it 2376Video4Linux device it finds. To enable it
2366<filename>/etc/X11/XF86Config</filename> must contain these lines:</para> 2377<filename>/etc/X11/XF86Config</filename> must contain these lines:</para>
2367 <para><screen> 2378 <para><screen>
2368Section "Module" 2379Section "Module"
2369 Load "v4l" 2380 Load "v4l"
2370EndSection</screen></para> 2381EndSection</screen></para>
2371 2382
2372 <para>As of XFree86 4.2 this driver still supports only V4L 2383 <para>As of XFree86 4.2 this driver still supports only V4L
2373ioctls, however it should work just fine with all V4L2 devices through 2384ioctls, however it should work just fine with all V4L2 devices through
2374the V4L2 backward-compatibility layer. Since V4L2 permits multiple 2385the V4L2 backward-compatibility layer. Since V4L2 permits multiple
2375opens it is possible (if supported by the V4L2 driver) to capture 2386opens it is possible (if supported by the V4L2 driver) to capture
@@ -2377,83 +2388,84 @@ video while an X client requested video overlay. Restrictions of
2377simultaneous capturing and overlay are discussed in <xref 2388simultaneous capturing and overlay are discussed in <xref
2378 linkend="overlay" /> apply.</para> 2389 linkend="overlay" /> apply.</para>
2379 2390
2380 <para>Only marginally related to V4L2, XFree86 extended Xv to 2391 <para>Only marginally related to V4L2, XFree86 extended Xv to
2381support hardware YUV to RGB conversion and scaling for faster video 2392support hardware YUV to RGB conversion and scaling for faster video
2382playback, and added an interface to MPEG-2 decoding hardware. This API 2393playback, and added an interface to MPEG-2 decoding hardware. This API
2383is useful to display images captured with V4L2 devices.</para> 2394is useful to display images captured with V4L2 devices.</para>
2384 </section> 2395 </section>
2385 2396
2386 <section> 2397 <section>
2387 <title>Digital Video</title> 2398 <title>Digital Video</title>
2388 2399
2389 <para>V4L2 does not support digital terrestrial, cable or 2400 <para>V4L2 does not support digital terrestrial, cable or
2390satellite broadcast. A separate project aiming at digital receivers 2401satellite broadcast. A separate project aiming at digital receivers
2391exists. You can find its homepage at <ulink 2402exists. You can find its homepage at <ulink
2392url="http://linuxtv.org">http://linuxtv.org</ulink>. The Linux DVB API 2403url="http://linuxtv.org">http://linuxtv.org</ulink>. The Linux DVB API
2393has no connection to the V4L2 API except that drivers for hybrid 2404has no connection to the V4L2 API except that drivers for hybrid
2394hardware may support both.</para> 2405hardware may support both.</para>
2395 </section> 2406 </section>
2396 2407
2397 <section> 2408 <section>
2398 <title>Audio Interfaces</title> 2409 <title>Audio Interfaces</title>
2399 2410
2400 <para>[to do - OSS/ALSA]</para> 2411 <para>[to do - OSS/ALSA]</para>
2412 </section>
2401 </section> 2413 </section>
2402 </section>
2403 2414
2404 <section id="experimental"> 2415 <section id="experimental">
2405 <title>Experimental API Elements</title> 2416 <title>Experimental API Elements</title>
2406 2417
2407 <para>The following V4L2 API elements are currently experimental 2418 <para>The following V4L2 API elements are currently experimental
2408and may change in the future.</para> 2419and may change in the future.</para>
2409 2420
2410 <itemizedlist> 2421 <itemizedlist>
2411 <listitem> 2422 <listitem>
2412 <para>Video Output Overlay (OSD) Interface, <xref 2423 <para>Video Output Overlay (OSD) Interface, <xref
2413 linkend="osd" />.</para> 2424 linkend="osd" />.</para>
2414 </listitem> 2425 </listitem>
2415 <listitem> 2426 <listitem>
2416 <para><constant>V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY</constant>, 2427 <para><constant>V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY</constant>,
2417 &v4l2-buf-type;, <xref linkend="v4l2-buf-type" />.</para> 2428 &v4l2-buf-type;, <xref linkend="v4l2-buf-type" />.</para>
2418 </listitem> 2429 </listitem>
2419 <listitem> 2430 <listitem>
2420 <para><constant>V4L2_CAP_VIDEO_OUTPUT_OVERLAY</constant>, 2431 <para><constant>V4L2_CAP_VIDEO_OUTPUT_OVERLAY</constant>,
2421&VIDIOC-QUERYCAP; ioctl, <xref linkend="device-capabilities" />.</para> 2432&VIDIOC-QUERYCAP; ioctl, <xref linkend="device-capabilities" />.</para>
2422 </listitem> 2433 </listitem>
2423 <listitem> 2434 <listitem>
2424 <para>&VIDIOC-ENUM-FRAMESIZES; and 2435 <para>&VIDIOC-ENUM-FRAMESIZES; and
2425&VIDIOC-ENUM-FRAMEINTERVALS; ioctls.</para> 2436&VIDIOC-ENUM-FRAMEINTERVALS; ioctls.</para>
2426 </listitem> 2437 </listitem>
2427 <listitem> 2438 <listitem>
2428 <para>&VIDIOC-G-ENC-INDEX; ioctl.</para> 2439 <para>&VIDIOC-G-ENC-INDEX; ioctl.</para>
2429 </listitem> 2440 </listitem>
2430 <listitem> 2441 <listitem>
2431 <para>&VIDIOC-ENCODER-CMD; and &VIDIOC-TRY-ENCODER-CMD; 2442 <para>&VIDIOC-ENCODER-CMD; and &VIDIOC-TRY-ENCODER-CMD;
2432ioctls.</para> 2443ioctls.</para>
2433 </listitem> 2444 </listitem>
2434 <listitem> 2445 <listitem>
2435 <para>&VIDIOC-DBG-G-REGISTER; and &VIDIOC-DBG-S-REGISTER; 2446 <para>&VIDIOC-DBG-G-REGISTER; and &VIDIOC-DBG-S-REGISTER;
2436ioctls.</para> 2447ioctls.</para>
2437 </listitem> 2448 </listitem>
2438 <listitem> 2449 <listitem>
2439 <para>&VIDIOC-DBG-G-CHIP-IDENT; ioctl.</para> 2450 <para>&VIDIOC-DBG-G-CHIP-IDENT; ioctl.</para>
2440 </listitem> 2451 </listitem>
2441 </itemizedlist> 2452 </itemizedlist>
2442 </section> 2453 </section>
2443 2454
2444 <section id="obsolete"> 2455 <section id="obsolete">
2445 <title>Obsolete API Elements</title> 2456 <title>Obsolete API Elements</title>
2446 2457
2447 <para>The following V4L2 API elements were superseded by new 2458 <para>The following V4L2 API elements were superseded by new
2448interfaces and should not be implemented in new drivers.</para> 2459interfaces and should not be implemented in new drivers.</para>
2449 2460
2450 <itemizedlist> 2461 <itemizedlist>
2451 <listitem> 2462 <listitem>
2452 <para><constant>VIDIOC_G_MPEGCOMP</constant> and 2463 <para><constant>VIDIOC_G_MPEGCOMP</constant> and
2453<constant>VIDIOC_S_MPEGCOMP</constant> ioctls. Use Extended Controls, 2464<constant>VIDIOC_S_MPEGCOMP</constant> ioctls. Use Extended Controls,
2454<xref linkend="extended-controls" />.</para> 2465<xref linkend="extended-controls" />.</para>
2455 </listitem> 2466 </listitem>
2456 </itemizedlist> 2467 </itemizedlist>
2468 </section>
2457 </section> 2469 </section>
2458 2470
2459 <!-- 2471 <!--
diff --git a/Documentation/DocBook/v4l/controls.xml b/Documentation/DocBook/v4l/controls.xml
index f46450610412..8408caaee276 100644
--- a/Documentation/DocBook/v4l/controls.xml
+++ b/Documentation/DocBook/v4l/controls.xml
@@ -267,6 +267,12 @@ minimum value disables backlight compensation.</entry>
267 <entry>Chroma automatic gain control.</entry> 267 <entry>Chroma automatic gain control.</entry>
268 </row> 268 </row>
269 <row> 269 <row>
270 <entry><constant>V4L2_CID_CHROMA_GAIN</constant></entry>
271 <entry>integer</entry>
272 <entry>Adjusts the Chroma gain control (for use when chroma AGC
273 is disabled).</entry>
274 </row>
275 <row>
270 <entry><constant>V4L2_CID_COLOR_KILLER</constant></entry> 276 <entry><constant>V4L2_CID_COLOR_KILLER</constant></entry>
271 <entry>boolean</entry> 277 <entry>boolean</entry>
272 <entry>Enable the color killer (&ie; force a black &amp; white image in case of a weak video signal).</entry> 278 <entry>Enable the color killer (&ie; force a black &amp; white image in case of a weak video signal).</entry>
@@ -277,8 +283,15 @@ minimum value disables backlight compensation.</entry>
277 <entry>Selects a color effect. Possible values for 283 <entry>Selects a color effect. Possible values for
278<constant>enum v4l2_colorfx</constant> are: 284<constant>enum v4l2_colorfx</constant> are:
279<constant>V4L2_COLORFX_NONE</constant> (0), 285<constant>V4L2_COLORFX_NONE</constant> (0),
280<constant>V4L2_COLORFX_BW</constant> (1) and 286<constant>V4L2_COLORFX_BW</constant> (1),
281<constant>V4L2_COLORFX_SEPIA</constant> (2).</entry> 287<constant>V4L2_COLORFX_SEPIA</constant> (2),
288<constant>V4L2_COLORFX_NEGATIVE</constant> (3),
289<constant>V4L2_COLORFX_EMBOSS</constant> (4),
290<constant>V4L2_COLORFX_SKETCH</constant> (5),
291<constant>V4L2_COLORFX_SKY_BLUE</constant> (6),
292<constant>V4L2_COLORFX_GRASS_GREEN</constant> (7),
293<constant>V4L2_COLORFX_SKIN_WHITEN</constant> (8) and
294<constant>V4L2_COLORFX_VIVID</constant> (9).</entry>
282 </row> 295 </row>
283 <row> 296 <row>
284 <entry><constant>V4L2_CID_ROTATE</constant></entry> 297 <entry><constant>V4L2_CID_ROTATE</constant></entry>
@@ -1825,6 +1838,25 @@ wide-angle direction. The zoom speed unit is driver-specific.</entry>
1825 <row><entry></entry></row> 1838 <row><entry></entry></row>
1826 1839
1827 <row> 1840 <row>
1841 <entry spanname="id"><constant>V4L2_CID_IRIS_ABSOLUTE</constant>&nbsp;</entry>
1842 <entry>integer</entry>
1843 </row><row><entry spanname="descr">This control sets the
1844camera's aperture to the specified value. The unit is undefined.
1845Larger values open the iris wider, smaller values close it.</entry>
1846 </row>
1847 <row><entry></entry></row>
1848
1849 <row>
1850 <entry spanname="id"><constant>V4L2_CID_IRIS_RELATIVE</constant>&nbsp;</entry>
1851 <entry>integer</entry>
1852 </row><row><entry spanname="descr">This control modifies the
1853camera's aperture by the specified amount. The unit is undefined.
1854Positive values open the iris one step further, negative values close
1855it one step further. This is a write-only control.</entry>
1856 </row>
1857 <row><entry></entry></row>
1858
1859 <row>
1828 <entry spanname="id"><constant>V4L2_CID_PRIVACY</constant>&nbsp;</entry> 1860 <entry spanname="id"><constant>V4L2_CID_PRIVACY</constant>&nbsp;</entry>
1829 <entry>boolean</entry> 1861 <entry>boolean</entry>
1830 </row><row><entry spanname="descr">Prevent video from being acquired 1862 </row><row><entry spanname="descr">Prevent video from being acquired
diff --git a/Documentation/DocBook/v4l/dev-event.xml b/Documentation/DocBook/v4l/dev-event.xml
new file mode 100644
index 000000000000..be5a98fb4fab
--- /dev/null
+++ b/Documentation/DocBook/v4l/dev-event.xml
@@ -0,0 +1,31 @@
1 <title>Event Interface</title>
2
3 <para>The V4L2 event interface provides means for user to get
4 immediately notified on certain conditions taking place on a device.
5 This might include start of frame or loss of signal events, for
6 example.
7 </para>
8
9 <para>To receive events, the events the user is interested in first must
10 be subscribed using the &VIDIOC-SUBSCRIBE-EVENT; ioctl. Once an event is
11 subscribed, the events of subscribed types are dequeueable using the
12 &VIDIOC-DQEVENT; ioctl. Events may be unsubscribed using
13 VIDIOC_UNSUBSCRIBE_EVENT ioctl. The special event type V4L2_EVENT_ALL may
14 be used to unsubscribe all the events the driver supports.</para>
15
16 <para>The event subscriptions and event queues are specific to file
17 handles. Subscribing an event on one file handle does not affect
18 other file handles.
19 </para>
20
21 <para>The information on dequeueable events is obtained by using select or
22 poll system calls on video devices. The V4L2 events use POLLPRI events on
23 poll system call and exceptions on select system call. </para>
24
25 <!--
26Local Variables:
27mode: sgml
28sgml-parent-document: "v4l2.sgml"
29indent-tabs-mode: nil
30End:
31 -->
diff --git a/Documentation/DocBook/v4l/io.xml b/Documentation/DocBook/v4l/io.xml
index e870330cbf77..d424886beda0 100644
--- a/Documentation/DocBook/v4l/io.xml
+++ b/Documentation/DocBook/v4l/io.xml
@@ -702,6 +702,16 @@ They can be both cleared however, then the buffer is in "dequeued"
702state, in the application domain to say so.</entry> 702state, in the application domain to say so.</entry>
703 </row> 703 </row>
704 <row> 704 <row>
705 <entry><constant>V4L2_BUF_FLAG_ERROR</constant></entry>
706 <entry>0x0040</entry>
707 <entry>When this flag is set, the buffer has been dequeued
708 successfully, although the data might have been corrupted.
709 This is recoverable, streaming may continue as normal and
710 the buffer may be reused normally.
711 Drivers set this flag when the <constant>VIDIOC_DQBUF</constant>
712 ioctl is called.</entry>
713 </row>
714 <row>
705 <entry><constant>V4L2_BUF_FLAG_KEYFRAME</constant></entry> 715 <entry><constant>V4L2_BUF_FLAG_KEYFRAME</constant></entry>
706 <entry>0x0008</entry> 716 <entry>0x0008</entry>
707 <entry>Drivers set or clear this flag when calling the 717 <entry>Drivers set or clear this flag when calling the
@@ -918,8 +928,8 @@ order</emphasis>.</para>
918 928
919 <para>When the driver provides or accepts images field by field 929 <para>When the driver provides or accepts images field by field
920rather than interleaved, it is also important applications understand 930rather than interleaved, it is also important applications understand
921how the fields combine to frames. We distinguish between top and 931how the fields combine to frames. We distinguish between top (aka odd) and
922bottom fields, the <emphasis>spatial order</emphasis>: The first line 932bottom (aka even) fields, the <emphasis>spatial order</emphasis>: The first line
923of the top field is the first line of an interlaced frame, the first 933of the top field is the first line of an interlaced frame, the first
924line of the bottom field is the second line of that frame.</para> 934line of the bottom field is the second line of that frame.</para>
925 935
@@ -972,12 +982,12 @@ between <constant>V4L2_FIELD_TOP</constant> and
972 <row> 982 <row>
973 <entry><constant>V4L2_FIELD_TOP</constant></entry> 983 <entry><constant>V4L2_FIELD_TOP</constant></entry>
974 <entry>2</entry> 984 <entry>2</entry>
975 <entry>Images consist of the top field only.</entry> 985 <entry>Images consist of the top (aka odd) field only.</entry>
976 </row> 986 </row>
977 <row> 987 <row>
978 <entry><constant>V4L2_FIELD_BOTTOM</constant></entry> 988 <entry><constant>V4L2_FIELD_BOTTOM</constant></entry>
979 <entry>3</entry> 989 <entry>3</entry>
980 <entry>Images consist of the bottom field only. 990 <entry>Images consist of the bottom (aka even) field only.
981Applications may wish to prevent a device from capturing interlaced 991Applications may wish to prevent a device from capturing interlaced
982images because they will have "comb" or "feathering" artefacts around 992images because they will have "comb" or "feathering" artefacts around
983moving objects.</entry> 993moving objects.</entry>
diff --git a/Documentation/DocBook/v4l/pixfmt.xml b/Documentation/DocBook/v4l/pixfmt.xml
index 885968d6a2fc..c4ad0a8e42dc 100644
--- a/Documentation/DocBook/v4l/pixfmt.xml
+++ b/Documentation/DocBook/v4l/pixfmt.xml
@@ -792,6 +792,18 @@ http://www.thedirks.org/winnov/</ulink></para></entry>
792 <entry>'YYUV'</entry> 792 <entry>'YYUV'</entry>
793 <entry>unknown</entry> 793 <entry>unknown</entry>
794 </row> 794 </row>
795 <row id="V4L2-PIX-FMT-Y4">
796 <entry><constant>V4L2_PIX_FMT_Y4</constant></entry>
797 <entry>'Y04 '</entry>
798 <entry>Old 4-bit greyscale format. Only the least significant 4 bits of each byte are used,
799the other bits are set to 0.</entry>
800 </row>
801 <row id="V4L2-PIX-FMT-Y6">
802 <entry><constant>V4L2_PIX_FMT_Y6</constant></entry>
803 <entry>'Y06 '</entry>
804 <entry>Old 6-bit greyscale format. Only the least significant 6 bits of each byte are used,
805the other bits are set to 0.</entry>
806 </row>
795 </tbody> 807 </tbody>
796 </tgroup> 808 </tgroup>
797 </table> 809 </table>
diff --git a/Documentation/DocBook/v4l/v4l2.xml b/Documentation/DocBook/v4l/v4l2.xml
index 060105af49e5..9737243377a3 100644
--- a/Documentation/DocBook/v4l/v4l2.xml
+++ b/Documentation/DocBook/v4l/v4l2.xml
@@ -401,6 +401,7 @@ and discussions on the V4L mailing list.</revremark>
401 <section id="ttx"> &sub-dev-teletext; </section> 401 <section id="ttx"> &sub-dev-teletext; </section>
402 <section id="radio"> &sub-dev-radio; </section> 402 <section id="radio"> &sub-dev-radio; </section>
403 <section id="rds"> &sub-dev-rds; </section> 403 <section id="rds"> &sub-dev-rds; </section>
404 <section id="event"> &sub-dev-event; </section>
404 </chapter> 405 </chapter>
405 406
406 <chapter id="driver"> 407 <chapter id="driver">
@@ -426,6 +427,7 @@ and discussions on the V4L mailing list.</revremark>
426 &sub-cropcap; 427 &sub-cropcap;
427 &sub-dbg-g-chip-ident; 428 &sub-dbg-g-chip-ident;
428 &sub-dbg-g-register; 429 &sub-dbg-g-register;
430 &sub-dqevent;
429 &sub-encoder-cmd; 431 &sub-encoder-cmd;
430 &sub-enumaudio; 432 &sub-enumaudio;
431 &sub-enumaudioout; 433 &sub-enumaudioout;
@@ -467,6 +469,7 @@ and discussions on the V4L mailing list.</revremark>
467 &sub-reqbufs; 469 &sub-reqbufs;
468 &sub-s-hw-freq-seek; 470 &sub-s-hw-freq-seek;
469 &sub-streamon; 471 &sub-streamon;
472 &sub-subscribe-event;
470 <!-- End of ioctls. --> 473 <!-- End of ioctls. -->
471 &sub-mmap; 474 &sub-mmap;
472 &sub-munmap; 475 &sub-munmap;
diff --git a/Documentation/DocBook/v4l/videodev2.h.xml b/Documentation/DocBook/v4l/videodev2.h.xml
index 068325940658..865b06d9e679 100644
--- a/Documentation/DocBook/v4l/videodev2.h.xml
+++ b/Documentation/DocBook/v4l/videodev2.h.xml
@@ -1018,6 +1018,13 @@ enum <link linkend="v4l2-colorfx">v4l2_colorfx</link> {
1018 V4L2_COLORFX_NONE = 0, 1018 V4L2_COLORFX_NONE = 0,
1019 V4L2_COLORFX_BW = 1, 1019 V4L2_COLORFX_BW = 1,
1020 V4L2_COLORFX_SEPIA = 2, 1020 V4L2_COLORFX_SEPIA = 2,
1021 V4L2_COLORFX_NEGATIVE = 3,
1022 V4L2_COLORFX_EMBOSS = 4,
1023 V4L2_COLORFX_SKETCH = 5,
1024 V4L2_COLORFX_SKY_BLUE = 6,
1025 V4L2_COLORFX_GRASS_GREEN = 7,
1026 V4L2_COLORFX_SKIN_WHITEN = 8,
1027 V4L2_COLORFX_VIVID = 9.
1021}; 1028};
1022#define V4L2_CID_AUTOBRIGHTNESS (V4L2_CID_BASE+32) 1029#define V4L2_CID_AUTOBRIGHTNESS (V4L2_CID_BASE+32)
1023#define V4L2_CID_BAND_STOP_FILTER (V4L2_CID_BASE+33) 1030#define V4L2_CID_BAND_STOP_FILTER (V4L2_CID_BASE+33)
@@ -1271,6 +1278,9 @@ enum <link linkend="v4l2-exposure-auto-type">v4l2_exposure_auto_type</link> {
1271 1278
1272#define V4L2_CID_PRIVACY (V4L2_CID_CAMERA_CLASS_BASE+16) 1279#define V4L2_CID_PRIVACY (V4L2_CID_CAMERA_CLASS_BASE+16)
1273 1280
1281#define V4L2_CID_IRIS_ABSOLUTE (V4L2_CID_CAMERA_CLASS_BASE+17)
1282#define V4L2_CID_IRIS_RELATIVE (V4L2_CID_CAMERA_CLASS_BASE+18)
1283
1274/* FM Modulator class control IDs */ 1284/* FM Modulator class control IDs */
1275#define V4L2_CID_FM_TX_CLASS_BASE (V4L2_CTRL_CLASS_FM_TX | 0x900) 1285#define V4L2_CID_FM_TX_CLASS_BASE (V4L2_CTRL_CLASS_FM_TX | 0x900)
1276#define V4L2_CID_FM_TX_CLASS (V4L2_CTRL_CLASS_FM_TX | 1) 1286#define V4L2_CID_FM_TX_CLASS (V4L2_CTRL_CLASS_FM_TX | 1)
diff --git a/Documentation/DocBook/v4l/vidioc-dqevent.xml b/Documentation/DocBook/v4l/vidioc-dqevent.xml
new file mode 100644
index 000000000000..4e0a7cc30812
--- /dev/null
+++ b/Documentation/DocBook/v4l/vidioc-dqevent.xml
@@ -0,0 +1,131 @@
1<refentry id="vidioc-dqevent">
2 <refmeta>
3 <refentrytitle>ioctl VIDIOC_DQEVENT</refentrytitle>
4 &manvol;
5 </refmeta>
6
7 <refnamediv>
8 <refname>VIDIOC_DQEVENT</refname>
9 <refpurpose>Dequeue event</refpurpose>
10 </refnamediv>
11
12 <refsynopsisdiv>
13 <funcsynopsis>
14 <funcprototype>
15 <funcdef>int <function>ioctl</function></funcdef>
16 <paramdef>int <parameter>fd</parameter></paramdef>
17 <paramdef>int <parameter>request</parameter></paramdef>
18 <paramdef>struct v4l2_event
19*<parameter>argp</parameter></paramdef>
20 </funcprototype>
21 </funcsynopsis>
22 </refsynopsisdiv>
23
24 <refsect1>
25 <title>Arguments</title>
26
27 <variablelist>
28 <varlistentry>
29 <term><parameter>fd</parameter></term>
30 <listitem>
31 <para>&fd;</para>
32 </listitem>
33 </varlistentry>
34 <varlistentry>
35 <term><parameter>request</parameter></term>
36 <listitem>
37 <para>VIDIOC_DQEVENT</para>
38 </listitem>
39 </varlistentry>
40 <varlistentry>
41 <term><parameter>argp</parameter></term>
42 <listitem>
43 <para></para>
44 </listitem>
45 </varlistentry>
46 </variablelist>
47 </refsect1>
48
49 <refsect1>
50 <title>Description</title>
51
52 <para>Dequeue an event from a video device. No input is required
53 for this ioctl. All the fields of the &v4l2-event; structure are
54 filled by the driver. The file handle will also receive exceptions
55 which the application may get by e.g. using the select system
56 call.</para>
57
58 <table frame="none" pgwide="1" id="v4l2-event">
59 <title>struct <structname>v4l2_event</structname></title>
60 <tgroup cols="4">
61 &cs-str;
62 <tbody valign="top">
63 <row>
64 <entry>__u32</entry>
65 <entry><structfield>type</structfield></entry>
66 <entry></entry>
67 <entry>Type of the event.</entry>
68 </row>
69 <row>
70 <entry>union</entry>
71 <entry><structfield>u</structfield></entry>
72 <entry></entry>
73 <entry></entry>
74 </row>
75 <row>
76 <entry></entry>
77 <entry>&v4l2-event-vsync;</entry>
78 <entry><structfield>vsync</structfield></entry>
79 <entry>Event data for event V4L2_EVENT_VSYNC.
80 </entry>
81 </row>
82 <row>
83 <entry></entry>
84 <entry>__u8</entry>
85 <entry><structfield>data</structfield>[64]</entry>
86 <entry>Event data. Defined by the event type. The union
87 should be used to define easily accessible type for
88 events.</entry>
89 </row>
90 <row>
91 <entry>__u32</entry>
92 <entry><structfield>pending</structfield></entry>
93 <entry></entry>
94 <entry>Number of pending events excluding this one.</entry>
95 </row>
96 <row>
97 <entry>__u32</entry>
98 <entry><structfield>sequence</structfield></entry>
99 <entry></entry>
100 <entry>Event sequence number. The sequence number is
101 incremented for every subscribed event that takes place.
102 If sequence numbers are not contiguous it means that
103 events have been lost.
104 </entry>
105 </row>
106 <row>
107 <entry>struct timespec</entry>
108 <entry><structfield>timestamp</structfield></entry>
109 <entry></entry>
110 <entry>Event timestamp.</entry>
111 </row>
112 <row>
113 <entry>__u32</entry>
114 <entry><structfield>reserved</structfield>[9]</entry>
115 <entry></entry>
116 <entry>Reserved for future extensions. Drivers must set
117 the array to zero.</entry>
118 </row>
119 </tbody>
120 </tgroup>
121 </table>
122
123 </refsect1>
124</refentry>
125<!--
126Local Variables:
127mode: sgml
128sgml-parent-document: "v4l2.sgml"
129indent-tabs-mode: nil
130End:
131-->
diff --git a/Documentation/DocBook/v4l/vidioc-enuminput.xml b/Documentation/DocBook/v4l/vidioc-enuminput.xml
index 71b868e2fb8f..476fe1d2bba0 100644
--- a/Documentation/DocBook/v4l/vidioc-enuminput.xml
+++ b/Documentation/DocBook/v4l/vidioc-enuminput.xml
@@ -283,7 +283,7 @@ input/output interface to linux-media@vger.kernel.org on 19 Oct 2009.
283 <entry>This input supports setting DV presets by using VIDIOC_S_DV_PRESET.</entry> 283 <entry>This input supports setting DV presets by using VIDIOC_S_DV_PRESET.</entry>
284 </row> 284 </row>
285 <row> 285 <row>
286 <entry><constant>V4L2_OUT_CAP_CUSTOM_TIMINGS</constant></entry> 286 <entry><constant>V4L2_IN_CAP_CUSTOM_TIMINGS</constant></entry>
287 <entry>0x00000002</entry> 287 <entry>0x00000002</entry>
288 <entry>This input supports setting custom video timings by using VIDIOC_S_DV_TIMINGS.</entry> 288 <entry>This input supports setting custom video timings by using VIDIOC_S_DV_TIMINGS.</entry>
289 </row> 289 </row>
diff --git a/Documentation/DocBook/v4l/vidioc-qbuf.xml b/Documentation/DocBook/v4l/vidioc-qbuf.xml
index b843bd7b3897..ab691ebf3b93 100644
--- a/Documentation/DocBook/v4l/vidioc-qbuf.xml
+++ b/Documentation/DocBook/v4l/vidioc-qbuf.xml
@@ -111,7 +111,11 @@ from the driver's outgoing queue. They just set the
111and <structfield>reserved</structfield> 111and <structfield>reserved</structfield>
112fields of a &v4l2-buffer; as above, when <constant>VIDIOC_DQBUF</constant> 112fields of a &v4l2-buffer; as above, when <constant>VIDIOC_DQBUF</constant>
113is called with a pointer to this structure the driver fills the 113is called with a pointer to this structure the driver fills the
114remaining fields or returns an error code.</para> 114remaining fields or returns an error code. The driver may also set
115<constant>V4L2_BUF_FLAG_ERROR</constant> in the <structfield>flags</structfield>
116field. It indicates a non-critical (recoverable) streaming error. In such case
117the application may continue as normal, but should be aware that data in the
118dequeued buffer might be corrupted.</para>
115 119
116 <para>By default <constant>VIDIOC_DQBUF</constant> blocks when no 120 <para>By default <constant>VIDIOC_DQBUF</constant> blocks when no
117buffer is in the outgoing queue. When the 121buffer is in the outgoing queue. When the
@@ -158,7 +162,13 @@ enqueue a user pointer buffer.</para>
158 <para><constant>VIDIOC_DQBUF</constant> failed due to an 162 <para><constant>VIDIOC_DQBUF</constant> failed due to an
159internal error. Can also indicate temporary problems like signal 163internal error. Can also indicate temporary problems like signal
160loss. Note the driver might dequeue an (empty) buffer despite 164loss. Note the driver might dequeue an (empty) buffer despite
161returning an error, or even stop capturing.</para> 165returning an error, or even stop capturing. Reusing such buffer may be unsafe
166though and its details (e.g. <structfield>index</structfield>) may not be
167returned either. It is recommended that drivers indicate recoverable errors
168by setting the <constant>V4L2_BUF_FLAG_ERROR</constant> and returning 0 instead.
169In that case the application should be able to safely reuse the buffer and
170continue streaming.
171 </para>
162 </listitem> 172 </listitem>
163 </varlistentry> 173 </varlistentry>
164 </variablelist> 174 </variablelist>
diff --git a/Documentation/DocBook/v4l/vidioc-queryctrl.xml b/Documentation/DocBook/v4l/vidioc-queryctrl.xml
index 4876ff1a1a04..8e0e055ac934 100644
--- a/Documentation/DocBook/v4l/vidioc-queryctrl.xml
+++ b/Documentation/DocBook/v4l/vidioc-queryctrl.xml
@@ -325,7 +325,7 @@ should be part of the control documentation.</entry>
325 <entry>n/a</entry> 325 <entry>n/a</entry>
326 <entry>This is not a control. When 326 <entry>This is not a control. When
327<constant>VIDIOC_QUERYCTRL</constant> is called with a control ID 327<constant>VIDIOC_QUERYCTRL</constant> is called with a control ID
328equal to a control class code (see <xref linkend="ctrl-class" />), the 328equal to a control class code (see <xref linkend="ctrl-class" />) + 1, the
329ioctl returns the name of the control class and this control type. 329ioctl returns the name of the control class and this control type.
330Older drivers which do not support this feature return an 330Older drivers which do not support this feature return an
331&EINVAL;.</entry> 331&EINVAL;.</entry>
diff --git a/Documentation/DocBook/v4l/vidioc-reqbufs.xml b/Documentation/DocBook/v4l/vidioc-reqbufs.xml
index 1c0816372074..69800ae23348 100644
--- a/Documentation/DocBook/v4l/vidioc-reqbufs.xml
+++ b/Documentation/DocBook/v4l/vidioc-reqbufs.xml
@@ -61,7 +61,7 @@ fields of the <structname>v4l2_requestbuffers</structname> structure.
61They set the <structfield>type</structfield> field to the respective 61They set the <structfield>type</structfield> field to the respective
62stream or buffer type, the <structfield>count</structfield> field to 62stream or buffer type, the <structfield>count</structfield> field to
63the desired number of buffers, <structfield>memory</structfield> 63the desired number of buffers, <structfield>memory</structfield>
64must be set to the requested I/O method and the reserved array 64must be set to the requested I/O method and the <structfield>reserved</structfield> array
65must be zeroed. When the ioctl 65must be zeroed. When the ioctl
66is called with a pointer to this structure the driver will attempt to allocate 66is called with a pointer to this structure the driver will attempt to allocate
67the requested number of buffers and it stores the actual number 67the requested number of buffers and it stores the actual number
diff --git a/Documentation/DocBook/v4l/vidioc-subscribe-event.xml b/Documentation/DocBook/v4l/vidioc-subscribe-event.xml
new file mode 100644
index 000000000000..8b501791aa68
--- /dev/null
+++ b/Documentation/DocBook/v4l/vidioc-subscribe-event.xml
@@ -0,0 +1,133 @@
1<refentry id="vidioc-subscribe-event">
2 <refmeta>
3 <refentrytitle>ioctl VIDIOC_SUBSCRIBE_EVENT, VIDIOC_UNSUBSCRIBE_EVENT</refentrytitle>
4 &manvol;
5 </refmeta>
6
7 <refnamediv>
8 <refname>VIDIOC_SUBSCRIBE_EVENT, VIDIOC_UNSUBSCRIBE_EVENT</refname>
9 <refpurpose>Subscribe or unsubscribe event</refpurpose>
10 </refnamediv>
11
12 <refsynopsisdiv>
13 <funcsynopsis>
14 <funcprototype>
15 <funcdef>int <function>ioctl</function></funcdef>
16 <paramdef>int <parameter>fd</parameter></paramdef>
17 <paramdef>int <parameter>request</parameter></paramdef>
18 <paramdef>struct v4l2_event_subscription
19*<parameter>argp</parameter></paramdef>
20 </funcprototype>
21 </funcsynopsis>
22 </refsynopsisdiv>
23
24 <refsect1>
25 <title>Arguments</title>
26
27 <variablelist>
28 <varlistentry>
29 <term><parameter>fd</parameter></term>
30 <listitem>
31 <para>&fd;</para>
32 </listitem>
33 </varlistentry>
34 <varlistentry>
35 <term><parameter>request</parameter></term>
36 <listitem>
37 <para>VIDIOC_SUBSCRIBE_EVENT, VIDIOC_UNSUBSCRIBE_EVENT</para>
38 </listitem>
39 </varlistentry>
40 <varlistentry>
41 <term><parameter>argp</parameter></term>
42 <listitem>
43 <para></para>
44 </listitem>
45 </varlistentry>
46 </variablelist>
47 </refsect1>
48
49 <refsect1>
50 <title>Description</title>
51
52 <para>Subscribe or unsubscribe V4L2 event. Subscribed events are
53 dequeued by using the &VIDIOC-DQEVENT; ioctl.</para>
54
55 <table frame="none" pgwide="1" id="v4l2-event-subscription">
56 <title>struct <structname>v4l2_event_subscription</structname></title>
57 <tgroup cols="3">
58 &cs-str;
59 <tbody valign="top">
60 <row>
61 <entry>__u32</entry>
62 <entry><structfield>type</structfield></entry>
63 <entry>Type of the event.</entry>
64 </row>
65 <row>
66 <entry>__u32</entry>
67 <entry><structfield>reserved</structfield>[7]</entry>
68 <entry>Reserved for future extensions. Drivers and applications
69 must set the array to zero.</entry>
70 </row>
71 </tbody>
72 </tgroup>
73 </table>
74
75 <table frame="none" pgwide="1" id="event-type">
76 <title>Event Types</title>
77 <tgroup cols="3">
78 &cs-def;
79 <tbody valign="top">
80 <row>
81 <entry><constant>V4L2_EVENT_ALL</constant></entry>
82 <entry>0</entry>
83 <entry>All events. V4L2_EVENT_ALL is valid only for
84 VIDIOC_UNSUBSCRIBE_EVENT for unsubscribing all events at once.
85 </entry>
86 </row>
87 <row>
88 <entry><constant>V4L2_EVENT_VSYNC</constant></entry>
89 <entry>1</entry>
90 <entry>This event is triggered on the vertical sync.
91 This event has &v4l2-event-vsync; associated with it.
92 </entry>
93 </row>
94 <row>
95 <entry><constant>V4L2_EVENT_EOS</constant></entry>
96 <entry>2</entry>
97 <entry>This event is triggered when the end of a stream is reached.
98 This is typically used with MPEG decoders to report to the application
99 when the last of the MPEG stream has been decoded.
100 </entry>
101 </row>
102 <row>
103 <entry><constant>V4L2_EVENT_PRIVATE_START</constant></entry>
104 <entry>0x08000000</entry>
105 <entry>Base event number for driver-private events.</entry>
106 </row>
107 </tbody>
108 </tgroup>
109 </table>
110
111 <table frame="none" pgwide="1" id="v4l2-event-vsync">
112 <title>struct <structname>v4l2_event_vsync</structname></title>
113 <tgroup cols="3">
114 &cs-str;
115 <tbody valign="top">
116 <row>
117 <entry>__u8</entry>
118 <entry><structfield>field</structfield></entry>
119 <entry>The upcoming field. See &v4l2-field;.</entry>
120 </row>
121 </tbody>
122 </tgroup>
123 </table>
124
125 </refsect1>
126</refentry>
127<!--
128Local Variables:
129mode: sgml
130sgml-parent-document: "v4l2.sgml"
131indent-tabs-mode: nil
132End:
133-->
diff --git a/Documentation/DocBook/writing-an-alsa-driver.tmpl b/Documentation/DocBook/writing-an-alsa-driver.tmpl
index 0d0f7b4d4b1a..0ba149de2608 100644
--- a/Documentation/DocBook/writing-an-alsa-driver.tmpl
+++ b/Documentation/DocBook/writing-an-alsa-driver.tmpl
@@ -5518,34 +5518,41 @@ struct _snd_pcm_runtime {
5518]]> 5518]]>
5519 </programlisting> 5519 </programlisting>
5520 </informalexample> 5520 </informalexample>
5521
5522 For the raw data, <structfield>size</structfield> field must be
5523 set properly. This specifies the maximum size of the proc file access.
5521 </para> 5524 </para>
5522 5525
5523 <para> 5526 <para>
5524 The callback is much more complicated than the text-file 5527 The read/write callbacks of raw mode are more direct than the text mode.
5525 version. You need to use a low-level I/O functions such as 5528 You need to use a low-level I/O functions such as
5526 <function>copy_from/to_user()</function> to transfer the 5529 <function>copy_from/to_user()</function> to transfer the
5527 data. 5530 data.
5528 5531
5529 <informalexample> 5532 <informalexample>
5530 <programlisting> 5533 <programlisting>
5531<![CDATA[ 5534<![CDATA[
5532 static long my_file_io_read(struct snd_info_entry *entry, 5535 static ssize_t my_file_io_read(struct snd_info_entry *entry,
5533 void *file_private_data, 5536 void *file_private_data,
5534 struct file *file, 5537 struct file *file,
5535 char *buf, 5538 char *buf,
5536 unsigned long count, 5539 size_t count,
5537 unsigned long pos) 5540 loff_t pos)
5538 { 5541 {
5539 long size = count; 5542 if (copy_to_user(buf, local_data + pos, count))
5540 if (pos + size > local_max_size)
5541 size = local_max_size - pos;
5542 if (copy_to_user(buf, local_data + pos, size))
5543 return -EFAULT; 5543 return -EFAULT;
5544 return size; 5544 return count;
5545 } 5545 }
5546]]> 5546]]>
5547 </programlisting> 5547 </programlisting>
5548 </informalexample> 5548 </informalexample>
5549
5550 If the size of the info entry has been set up properly,
5551 <structfield>count</structfield> and <structfield>pos</structfield> are
5552 guaranteed to fit within 0 and the given size.
5553 You don't have to check the range in the callbacks unless any
5554 other condition is required.
5555
5549 </para> 5556 </para>
5550 5557
5551 </chapter> 5558 </chapter>
diff --git a/Documentation/DocBook/writing_usb_driver.tmpl b/Documentation/DocBook/writing_usb_driver.tmpl
index eeff19ca831b..bd97a13fa5ae 100644
--- a/Documentation/DocBook/writing_usb_driver.tmpl
+++ b/Documentation/DocBook/writing_usb_driver.tmpl
@@ -342,7 +342,7 @@ static inline void skel_delete (struct usb_skel *dev)
342{ 342{
343 kfree (dev->bulk_in_buffer); 343 kfree (dev->bulk_in_buffer);
344 if (dev->bulk_out_buffer != NULL) 344 if (dev->bulk_out_buffer != NULL)
345 usb_buffer_free (dev->udev, dev->bulk_out_size, 345 usb_free_coherent (dev->udev, dev->bulk_out_size,
346 dev->bulk_out_buffer, 346 dev->bulk_out_buffer,
347 dev->write_urb->transfer_dma); 347 dev->write_urb->transfer_dma);
348 usb_free_urb (dev->write_urb); 348 usb_free_urb (dev->write_urb);
diff --git a/Documentation/PCI/pci-error-recovery.txt b/Documentation/PCI/pci-error-recovery.txt
index e83f2ea76415..898ded24510d 100644
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -216,7 +216,7 @@ The driver should return one of the following result codes:
216 216
217 - PCI_ERS_RESULT_NEED_RESET 217 - PCI_ERS_RESULT_NEED_RESET
218 Driver returns this if it thinks the device is not 218 Driver returns this if it thinks the device is not
219 recoverable in it's current state and it needs a slot 219 recoverable in its current state and it needs a slot
220 reset to proceed. 220 reset to proceed.
221 221
222 - PCI_ERS_RESULT_DISCONNECT 222 - PCI_ERS_RESULT_DISCONNECT
@@ -241,7 +241,7 @@ in working condition.
241 241
242The driver is not supposed to restart normal driver I/O operations 242The driver is not supposed to restart normal driver I/O operations
243at this point. It should limit itself to "probing" the device to 243at this point. It should limit itself to "probing" the device to
244check it's recoverability status. If all is right, then the platform 244check its recoverability status. If all is right, then the platform
245will call resume() once all drivers have ack'd link_reset(). 245will call resume() once all drivers have ack'd link_reset().
246 246
247 Result codes: 247 Result codes:
diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt
index be21001ab144..26d3d945c3c2 100644
--- a/Documentation/PCI/pcieaer-howto.txt
+++ b/Documentation/PCI/pcieaer-howto.txt
@@ -13,7 +13,7 @@ Reporting (AER) driver and provides information on how to use it, as
13well as how to enable the drivers of endpoint devices to conform with 13well as how to enable the drivers of endpoint devices to conform with
14PCI Express AER driver. 14PCI Express AER driver.
15 15
161.2 Copyright © Intel Corporation 2006. 161.2 Copyright (C) Intel Corporation 2006.
17 17
181.3 What is the PCI Express AER Driver? 181.3 What is the PCI Express AER Driver?
19 19
@@ -71,15 +71,11 @@ console. If it's a correctable error, it is outputed as a warning.
71Otherwise, it is printed as an error. So users could choose different 71Otherwise, it is printed as an error. So users could choose different
72log level to filter out correctable error messages. 72log level to filter out correctable error messages.
73 73
74Below shows an example. 74Below shows an example:
75+------ PCI-Express Device Error -----+ 750000:50:00.0: PCIe Bus Error: severity=Uncorrected (Fatal), type=Transaction Layer, id=0500(Requester ID)
76Error Severity : Uncorrected (Fatal) 760000:50:00.0: device [8086:0329] error status/mask=00100000/00000000
77PCIE Bus Error type : Transaction Layer 770000:50:00.0: [20] Unsupported Request (First)
78Unsupported Request : First 780000:50:00.0: TLP Header: 04000001 00200a03 05010000 00050100
79Requester ID : 0500
80VendorID=8086h, DeviceID=0329h, Bus=05h, Device=00h, Function=00h
81TLB Header:
8204000001 00200a03 05010000 00050100
83 79
84In the example, 'Requester ID' means the ID of the device who sends 80In the example, 'Requester ID' means the ID of the device who sends
85the error message to root port. Pls. refer to pci express specs for 81the error message to root port. Pls. refer to pci express specs for
@@ -112,7 +108,7 @@ but the PCI Express link itself is fully functional. Fatal errors, on
112the other hand, cause the link to be unreliable. 108the other hand, cause the link to be unreliable.
113 109
114When AER is enabled, a PCI Express device will automatically send an 110When AER is enabled, a PCI Express device will automatically send an
115error message to the PCIE root port above it when the device captures 111error message to the PCIe root port above it when the device captures
116an error. The Root Port, upon receiving an error reporting message, 112an error. The Root Port, upon receiving an error reporting message,
117internally processes and logs the error message in its PCI Express 113internally processes and logs the error message in its PCI Express
118capability structure. Error information being logged includes storing 114capability structure. Error information being logged includes storing
@@ -198,8 +194,9 @@ to reset link, AER port service driver is required to provide the
198function to reset link. Firstly, kernel looks for if the upstream 194function to reset link. Firstly, kernel looks for if the upstream
199component has an aer driver. If it has, kernel uses the reset_link 195component has an aer driver. If it has, kernel uses the reset_link
200callback of the aer driver. If the upstream component has no aer driver 196callback of the aer driver. If the upstream component has no aer driver
201and the port is downstream port, we will use the aer driver of the 197and the port is downstream port, we will perform a hot reset as the
202root port who reports the AER error. As for upstream ports, 198default by setting the Secondary Bus Reset bit of the Bridge Control
199register associated with the downstream port. As for upstream ports,
203they should provide their own aer service drivers with reset_link 200they should provide their own aer service drivers with reset_link
204function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and 201function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and
205reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes 202reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes
@@ -253,11 +250,11 @@ cleanup uncorrectable status register. Pls. refer to section 3.3.
253 250
2544. Software error injection 2514. Software error injection
255 252
256Debugging PCIE AER error recovery code is quite difficult because it 253Debugging PCIe AER error recovery code is quite difficult because it
257is hard to trigger real hardware errors. Software based error 254is hard to trigger real hardware errors. Software based error
258injection can be used to fake various kinds of PCIE errors. 255injection can be used to fake various kinds of PCIe errors.
259 256
260First you should enable PCIE AER software error injection in kernel 257First you should enable PCIe AER software error injection in kernel
261configuration, that is, following item should be in your .config. 258configuration, that is, following item should be in your .config.
262 259
263CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m 260CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 1423d2570d78..44c6dcc93d6d 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -3,35 +3,79 @@ Using RCU's CPU Stall Detector
3The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables 3The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables
4RCU's CPU stall detector, which detects conditions that unduly delay 4RCU's CPU stall detector, which detects conditions that unduly delay
5RCU grace periods. The stall detector's idea of what constitutes 5RCU grace periods. The stall detector's idea of what constitutes
6"unduly delayed" is controlled by a pair of C preprocessor macros: 6"unduly delayed" is controlled by a set of C preprocessor macros:
7 7
8RCU_SECONDS_TILL_STALL_CHECK 8RCU_SECONDS_TILL_STALL_CHECK
9 9
10 This macro defines the period of time that RCU will wait from 10 This macro defines the period of time that RCU will wait from
11 the beginning of a grace period until it issues an RCU CPU 11 the beginning of a grace period until it issues an RCU CPU
12 stall warning. It is normally ten seconds. 12 stall warning. This time period is normally ten seconds.
13 13
14RCU_SECONDS_TILL_STALL_RECHECK 14RCU_SECONDS_TILL_STALL_RECHECK
15 15
16 This macro defines the period of time that RCU will wait after 16 This macro defines the period of time that RCU will wait after
17 issuing a stall warning until it issues another stall warning. 17 issuing a stall warning until it issues another stall warning
18 It is normally set to thirty seconds. 18 for the same stall. This time period is normally set to thirty
19 seconds.
19 20
20RCU_STALL_RAT_DELAY 21RCU_STALL_RAT_DELAY
21 22
22 The CPU stall detector tries to make the offending CPU rat on itself, 23 The CPU stall detector tries to make the offending CPU print its
23 as this often gives better-quality stack traces. However, if 24 own warnings, as this often gives better-quality stack traces.
24 the offending CPU does not detect its own stall in the number 25 However, if the offending CPU does not detect its own stall in
25 of jiffies specified by RCU_STALL_RAT_DELAY, then other CPUs will 26 the number of jiffies specified by RCU_STALL_RAT_DELAY, then
26 complain. This is normally set to two jiffies. 27 some other CPU will complain. This delay is normally set to
28 two jiffies.
27 29
28The following problems can result in an RCU CPU stall warning: 30When a CPU detects that it is stalling, it will print a message similar
31to the following:
32
33INFO: rcu_sched_state detected stall on CPU 5 (t=2500 jiffies)
34
35This message indicates that CPU 5 detected that it was causing a stall,
36and that the stall was affecting RCU-sched. This message will normally be
37followed by a stack dump of the offending CPU. On TREE_RCU kernel builds,
38RCU and RCU-sched are implemented by the same underlying mechanism,
39while on TREE_PREEMPT_RCU kernel builds, RCU is instead implemented
40by rcu_preempt_state.
41
42On the other hand, if the offending CPU fails to print out a stall-warning
43message quickly enough, some other CPU will print a message similar to
44the following:
45
46INFO: rcu_bh_state detected stalls on CPUs/tasks: { 3 5 } (detected by 2, 2502 jiffies)
47
48This message indicates that CPU 2 detected that CPUs 3 and 5 were both
49causing stalls, and that the stall was affecting RCU-bh. This message
50will normally be followed by stack dumps for each CPU. Please note that
51TREE_PREEMPT_RCU builds can be stalled by tasks as well as by CPUs,
52and that the tasks will be indicated by PID, for example, "P3421".
53It is even possible for a rcu_preempt_state stall to be caused by both
54CPUs -and- tasks, in which case the offending CPUs and tasks will all
55be called out in the list.
56
57Finally, if the grace period ends just as the stall warning starts
58printing, there will be a spurious stall-warning message:
59
60INFO: rcu_bh_state detected stalls on CPUs/tasks: { } (detected by 4, 2502 jiffies)
61
62This is rare, but does happen from time to time in real life.
63
64So your kernel printed an RCU CPU stall warning. The next question is
65"What caused it?" The following problems can result in RCU CPU stall
66warnings:
29 67
30o A CPU looping in an RCU read-side critical section. 68o A CPU looping in an RCU read-side critical section.
31 69
32o A CPU looping with interrupts disabled. 70o A CPU looping with interrupts disabled. This condition can
71 result in RCU-sched and RCU-bh stalls.
33 72
34o A CPU looping with preemption disabled. 73o A CPU looping with preemption disabled. This condition can
74 result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
75 stalls.
76
77o A CPU looping with bottom halves disabled. This condition can
78 result in RCU-sched and RCU-bh stalls.
35 79
36o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel 80o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
37 without invoking schedule(). 81 without invoking schedule().
@@ -39,20 +83,24 @@ o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
39o A bug in the RCU implementation. 83o A bug in the RCU implementation.
40 84
41o A hardware failure. This is quite unlikely, but has occurred 85o A hardware failure. This is quite unlikely, but has occurred
42 at least once in a former life. A CPU failed in a running system, 86 at least once in real life. A CPU failed in a running system,
43 becoming unresponsive, but not causing an immediate crash. 87 becoming unresponsive, but not causing an immediate crash.
44 This resulted in a series of RCU CPU stall warnings, eventually 88 This resulted in a series of RCU CPU stall warnings, eventually
45 leading the realization that the CPU had failed. 89 leading the realization that the CPU had failed.
46 90
47The RCU, RCU-sched, and RCU-bh implementations have CPU stall warning. 91The RCU, RCU-sched, and RCU-bh implementations have CPU stall
48SRCU does not do so directly, but its calls to synchronize_sched() will 92warning. SRCU does not have its own CPU stall warnings, but its
49result in RCU-sched detecting any CPU stalls that might be occurring. 93calls to synchronize_sched() will result in RCU-sched detecting
50 94RCU-sched-related CPU stalls. Please note that RCU only detects
51To diagnose the cause of the stall, inspect the stack traces. The offending 95CPU stalls when there is a grace period in progress. No grace period,
52function will usually be near the top of the stack. If you have a series 96no CPU stall warnings.
53of stall warnings from a single extended stall, comparing the stack traces 97
54can often help determine where the stall is occurring, which will usually 98To diagnose the cause of the stall, inspect the stack traces.
55be in the function nearest the top of the stack that stays the same from 99The offending function will usually be near the top of the stack.
56trace to trace. 100If you have a series of stall warnings from a single extended stall,
101comparing the stack traces can often help determine where the stall
102is occurring, which will usually be in the function nearest the top of
103that portion of the stack which remains the same from trace to trace.
104If you can reliably trigger the stall, ftrace can be quite helpful.
57 105
58RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE. 106RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE.
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index 0e50bc2aa1e2..5d9016795fd8 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -182,16 +182,6 @@ Similarly, sched_expedited RCU provides the following:
182 sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0 182 sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0
183 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0 183 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0
184 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0 184 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
185 state: -1 / 0:0 3:0 4:0
186
187As before, the first four lines are similar to those for RCU.
188The last line shows the task-migration state. The first number is
189-1 if synchronize_sched_expedited() is idle, -2 if in the process of
190posting wakeups to the migration kthreads, and N when waiting on CPU N.
191Each of the colon-separated fields following the "/" is a CPU:state pair.
192Valid states are "0" for idle, "1" for waiting for quiescent state,
193"2" for passed through quiescent state, and "3" when a race with a
194CPU-hotplug event forces use of the synchronize_sched() primitive.
195 185
196 186
197USAGE 187USAGE
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 8608fd85e921..efd8cc95c06b 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -256,23 +256,23 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
256The output of "cat rcu/rcu_pending" looks as follows: 256The output of "cat rcu/rcu_pending" looks as follows:
257 257
258rcu_sched: 258rcu_sched:
259 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 259 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
260 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 260 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
261 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 261 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
262 3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 262 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723
263 4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 263 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110
264 5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 264 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456
265 6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 265 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834
266 7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 266 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888
267rcu_bh: 267rcu_bh:
268 0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 268 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314
269 1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 269 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180
270 2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 270 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936
271 3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 271 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863
272 4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 272 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671
273 5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 273 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235
274 6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 274 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921
275 7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 275 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542
276 276
277As always, this is once again split into "rcu_sched" and "rcu_bh" 277As always, this is once again split into "rcu_sched" and "rcu_bh"
278portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional 278portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional
@@ -284,6 +284,9 @@ o "np" is the number of times that __rcu_pending() has been invoked
284o "qsp" is the number of times that the RCU was waiting for a 284o "qsp" is the number of times that the RCU was waiting for a
285 quiescent state from this CPU. 285 quiescent state from this CPU.
286 286
287o "rpq" is the number of times that the CPU had passed through
288 a quiescent state, but not yet reported it to RCU.
289
287o "cbr" is the number of times that this CPU had RCU callbacks 290o "cbr" is the number of times that this CPU had RCU callbacks
288 that had passed through a grace period, and were thus ready 291 that had passed through a grace period, and were thus ready
289 to be invoked. 292 to be invoked.
diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt
index 34614b4c708e..e9dab41c0fe0 100644
--- a/Documentation/Smack.txt
+++ b/Documentation/Smack.txt
@@ -73,7 +73,7 @@ NOTE: Smack labels are limited to 23 characters. The attr command
73If you don't do anything special all users will get the floor ("_") 73If you don't do anything special all users will get the floor ("_")
74label when they log in. If you do want to log in via the hacked ssh 74label when they log in. If you do want to log in via the hacked ssh
75at other labels use the attr command to set the smack value on the 75at other labels use the attr command to set the smack value on the
76home directory and it's contents. 76home directory and its contents.
77 77
78You can add access rules in /etc/smack/accesses. They take the form: 78You can add access rules in /etc/smack/accesses. They take the form:
79 79
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist
index 8916ca48bc95..da0382daa395 100644
--- a/Documentation/SubmitChecklist
+++ b/Documentation/SubmitChecklist
@@ -18,6 +18,8 @@ kernel patches.
18 18
192b: Passes allnoconfig, allmodconfig 192b: Passes allnoconfig, allmodconfig
20 20
212c: Builds successfully when using O=builddir
22
213: Builds on multiple CPU architectures by using local cross-compile tools 233: Builds on multiple CPU architectures by using local cross-compile tools
22 or some other build farm. 24 or some other build farm.
23 25
@@ -95,3 +97,13 @@ kernel patches.
95 97
9625: If any ioctl's are added by the patch, then also update 9825: If any ioctl's are added by the patch, then also update
97 Documentation/ioctl/ioctl-number.txt. 99 Documentation/ioctl/ioctl-number.txt.
100
10126: If your modified source code depends on or uses any of the kernel
102 APIs or features that are related to the following kconfig symbols,
103 then test multiple builds with the related kconfig symbols disabled
104 and/or =m (if that option is available) [not all of these at the
105 same time, just various/random combinations of them]:
106
107 CONFIG_SMP, CONFIG_SYSFS, CONFIG_PROC_FS, CONFIG_INPUT, CONFIG_PCI,
108 CONFIG_BLOCK, CONFIG_PM, CONFIG_HOTPLUG, CONFIG_MAGIC_SYSRQ,
109 CONFIG_NET, CONFIG_INET=n (but latter with CONFIG_NET=y)
diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers
index 99e72a81fa2f..4947fd8fb182 100644
--- a/Documentation/SubmittingDrivers
+++ b/Documentation/SubmittingDrivers
@@ -130,6 +130,8 @@ Linux kernel master tree:
130 ftp.??.kernel.org:/pub/linux/kernel/... 130 ftp.??.kernel.org:/pub/linux/kernel/...
131 ?? == your country code, such as "us", "uk", "fr", etc. 131 ?? == your country code, such as "us", "uk", "fr", etc.
132 132
133 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git
134
133Linux kernel mailing list: 135Linux kernel mailing list:
134 linux-kernel@vger.kernel.org 136 linux-kernel@vger.kernel.org
135 [mail majordomo@vger.kernel.org to subscribe] 137 [mail majordomo@vger.kernel.org to subscribe]
@@ -160,3 +162,6 @@ How to NOT write kernel driver by Arjan van de Ven:
160 162
161Kernel Janitor: 163Kernel Janitor:
162 http://janitor.kernelnewbies.org/ 164 http://janitor.kernelnewbies.org/
165
166GIT, Fast Version Control System:
167 http://git-scm.com/
diff --git a/Documentation/acpi/apei/einj.txt b/Documentation/acpi/apei/einj.txt
new file mode 100644
index 000000000000..dfab71848dc8
--- /dev/null
+++ b/Documentation/acpi/apei/einj.txt
@@ -0,0 +1,59 @@
1 APEI Error INJection
2 ~~~~~~~~~~~~~~~~~~~~
3
4EINJ provides a hardware error injection mechanism
5It is very useful for debugging and testing of other APEI and RAS features.
6
7To use EINJ, make sure the following are enabled in your kernel
8configuration:
9
10CONFIG_DEBUG_FS
11CONFIG_ACPI_APEI
12CONFIG_ACPI_APEI_EINJ
13
14The user interface of EINJ is debug file system, under the
15directory apei/einj. The following files are provided.
16
17- available_error_type
18 Reading this file returns the error injection capability of the
19 platform, that is, which error types are supported. The error type
20 definition is as follow, the left field is the error type value, the
21 right field is error description.
22
23 0x00000001 Processor Correctable
24 0x00000002 Processor Uncorrectable non-fatal
25 0x00000004 Processor Uncorrectable fatal
26 0x00000008 Memory Correctable
27 0x00000010 Memory Uncorrectable non-fatal
28 0x00000020 Memory Uncorrectable fatal
29 0x00000040 PCI Express Correctable
30 0x00000080 PCI Express Uncorrectable fatal
31 0x00000100 PCI Express Uncorrectable non-fatal
32 0x00000200 Platform Correctable
33 0x00000400 Platform Uncorrectable non-fatal
34 0x00000800 Platform Uncorrectable fatal
35
36 The format of file contents are as above, except there are only the
37 available error type lines.
38
39- error_type
40 This file is used to set the error type value. The error type value
41 is defined in "available_error_type" description.
42
43- error_inject
44 Write any integer to this file to trigger the error
45 injection. Before this, please specify all necessary error
46 parameters.
47
48- param1
49 This file is used to set the first error parameter value. Effect of
50 parameter depends on error_type specified. For memory error, this is
51 physical memory address.
52
53- param2
54 This file is used to set the second error parameter value. Effect of
55 parameter depends on error_type specified. For memory error, this is
56 physical memory address mask.
57
58For more information about EINJ, please refer to ACPI specification
59version 4.0, section 17.5.
diff --git a/Documentation/arm/00-INDEX b/Documentation/arm/00-INDEX
index 82e418d648d0..7f5fc3ba9c91 100644
--- a/Documentation/arm/00-INDEX
+++ b/Documentation/arm/00-INDEX
@@ -20,6 +20,8 @@ Samsung-S3C24XX
20 - S3C24XX ARM Linux Overview 20 - S3C24XX ARM Linux Overview
21Sharp-LH 21Sharp-LH
22 - Linux on Sharp LH79524 and LH7A40X System On a Chip (SOC) 22 - Linux on Sharp LH79524 and LH7A40X System On a Chip (SOC)
23SPEAr
24 - ST SPEAr platform Linux Overview
23VFP/ 25VFP/
24 - Release notes for Linux Kernel Vector Floating Point support code 26 - Release notes for Linux Kernel Vector Floating Point support code
25empeg/ 27empeg/
diff --git a/Documentation/arm/SA1100/ADSBitsy b/Documentation/arm/SA1100/ADSBitsy
index 7197a9e958ee..f9f62e8c0719 100644
--- a/Documentation/arm/SA1100/ADSBitsy
+++ b/Documentation/arm/SA1100/ADSBitsy
@@ -32,7 +32,7 @@ Notes:
32 32
33- The flash on board is divided into 3 partitions. 33- The flash on board is divided into 3 partitions.
34 You should be careful to use flash on board. 34 You should be careful to use flash on board.
35 It's partition is different from GraphicsClient Plus and GraphicsMaster 35 Its partition is different from GraphicsClient Plus and GraphicsMaster
36 36
37- 16bpp mode requires a different cable than what ships with the board. 37- 16bpp mode requires a different cable than what ships with the board.
38 Contact ADS or look through the manual to wire your own. Currently, 38 Contact ADS or look through the manual to wire your own. Currently,
diff --git a/Documentation/arm/SPEAr/overview.txt b/Documentation/arm/SPEAr/overview.txt
new file mode 100644
index 000000000000..253a35c6f782
--- /dev/null
+++ b/Documentation/arm/SPEAr/overview.txt
@@ -0,0 +1,60 @@
1 SPEAr ARM Linux Overview
2 ==========================
3
4Introduction
5------------
6
7 SPEAr (Structured Processor Enhanced Architecture).
8 weblink : http://www.st.com/spear
9
10 The ST Microelectronics SPEAr range of ARM9/CortexA9 System-on-Chip CPUs are
11 supported by the 'spear' platform of ARM Linux. Currently SPEAr300,
12 SPEAr310, SPEAr320 and SPEAr600 SOCs are supported. Support for the SPEAr13XX
13 series is in progress.
14
15 Hierarchy in SPEAr is as follows:
16
17 SPEAr (Platform)
18 - SPEAr3XX (3XX SOC series, based on ARM9)
19 - SPEAr300 (SOC)
20 - SPEAr300_EVB (Evaluation Board)
21 - SPEAr310 (SOC)
22 - SPEAr310_EVB (Evaluation Board)
23 - SPEAr320 (SOC)
24 - SPEAr320_EVB (Evaluation Board)
25 - SPEAr6XX (6XX SOC series, based on ARM9)
26 - SPEAr600 (SOC)
27 - SPEAr600_EVB (Evaluation Board)
28 - SPEAr13XX (13XX SOC series, based on ARM CORTEXA9)
29 - SPEAr1300 (SOC)
30
31 Configuration
32 -------------
33
34 A generic configuration is provided for each machine, and can be used as the
35 default by
36 make spear600_defconfig
37 make spear300_defconfig
38 make spear310_defconfig
39 make spear320_defconfig
40
41 Layout
42 ------
43
44 The common files for multiple machine families (SPEAr3XX, SPEAr6XX and
45 SPEAr13XX) are located in the platform code contained in arch/arm/plat-spear
46 with headers in plat/.
47
48 Each machine series have a directory with name arch/arm/mach-spear followed by
49 series name. Like mach-spear3xx, mach-spear6xx and mach-spear13xx.
50
51 Common file for machines of spear3xx family is mach-spear3xx/spear3xx.c and for
52 spear6xx is mach-spear6xx/spear6xx.c. mach-spear* also contain soc/machine
53 specific files, like spear300.c, spear310.c, spear320.c and spear600.c.
54 mach-spear* also contains board specific files for each machine type.
55
56
57 Document Author
58 ---------------
59
60 Viresh Kumar, (c) 2010 ST Microelectronics
diff --git a/Documentation/arm/Samsung-S3C24XX/GPIO.txt b/Documentation/arm/Samsung-S3C24XX/GPIO.txt
index 2af2cf39915f..816d6071669e 100644
--- a/Documentation/arm/Samsung-S3C24XX/GPIO.txt
+++ b/Documentation/arm/Samsung-S3C24XX/GPIO.txt
@@ -12,6 +12,8 @@ Introduction
12 of the s3c2410 GPIO system, please read the Samsung provided 12 of the s3c2410 GPIO system, please read the Samsung provided
13 data-sheet/users manual to find out the complete list. 13 data-sheet/users manual to find out the complete list.
14 14
15 See Documentation/arm/Samsung/GPIO.txt for the core implemetation.
16
15 17
16GPIOLIB 18GPIOLIB
17------- 19-------
@@ -24,8 +26,60 @@ GPIOLIB
24 listed below will be removed (they may be marked as __deprecated 26 listed below will be removed (they may be marked as __deprecated
25 in the near future). 27 in the near future).
26 28
27 - s3c2410_gpio_getpin 29 The following functions now either have a s3c_ specific variant
28 - s3c2410_gpio_setpin 30 or are merged into gpiolib. See the definitions in
31 arch/arm/plat-samsung/include/plat/gpio-cfg.h:
32
33 s3c2410_gpio_setpin() gpio_set_value() or gpio_direction_output()
34 s3c2410_gpio_getpin() gpio_get_value() or gpio_direction_input()
35 s3c2410_gpio_getirq() gpio_to_irq()
36 s3c2410_gpio_cfgpin() s3c_gpio_cfgpin()
37 s3c2410_gpio_getcfg() s3c_gpio_getcfg()
38 s3c2410_gpio_pullup() s3c_gpio_setpull()
39
40
41GPIOLIB conversion
42------------------
43
44If you need to convert your board or driver to use gpiolib from the exiting
45s3c2410 api, then here are some notes on the process.
46
471) If your board is exclusively using an GPIO, say to control peripheral
48 power, then it will require to claim the gpio with gpio_request() before
49 it can use it.
50
51 It is recommended to check the return value, with at least WARN_ON()
52 during initialisation.
53
542) The s3c2410_gpio_cfgpin() can be directly replaced with s3c_gpio_cfgpin()
55 as they have the same arguments, and can either take the pin specific
56 values, or the more generic special-function-number arguments.
57
583) s3c2410_gpio_pullup() changs have the problem that whilst the
59 s3c2410_gpio_pullup(x, 1) can be easily translated to the
60 s3c_gpio_setpull(x, S3C_GPIO_PULL_NONE), the s3c2410_gpio_pullup(x, 0)
61 are not so easy.
62
63 The s3c2410_gpio_pullup(x, 0) case enables the pull-up (or in the case
64 of some of the devices, a pull-down) and as such the new API distinguishes
65 between the UP and DOWN case. There is currently no 'just turn on' setting
66 which may be required if this becomes a problem.
67
684) s3c2410_gpio_setpin() can be replaced by gpio_set_value(), the old call
69 does not implicitly configure the relevant gpio to output. The gpio
70 direction should be changed before using gpio_set_value().
71
725) s3c2410_gpio_getpin() is replaceable by gpio_get_value() if the pin
73 has been set to input. It is currently unknown what the behaviour is
74 when using gpio_get_value() on an output pin (s3c2410_gpio_getpin
75 would return the value the pin is supposed to be outputting).
76
776) s3c2410_gpio_getirq() should be directly replacable with the
78 gpio_to_irq() call.
79
80The s3c2410_gpio and gpio_ calls have always operated on the same gpio
81numberspace, so there is no problem with converting the gpio numbering
82between the calls.
29 83
30 84
31Headers 85Headers
@@ -54,6 +108,11 @@ PIN Numbers
54 eg S3C2410_GPA(0) or S3C2410_GPF(1). These defines are used to tell 108 eg S3C2410_GPA(0) or S3C2410_GPF(1). These defines are used to tell
55 the GPIO functions which pin is to be used. 109 the GPIO functions which pin is to be used.
56 110
111 With the conversion to gpiolib, there is no longer a direct conversion
112 from gpio pin number to register base address as in earlier kernels. This
113 is due to the number space required for newer SoCs where the later
114 GPIOs are not contiguous.
115
57 116
58Configuring a pin 117Configuring a pin
59----------------- 118-----------------
@@ -71,6 +130,8 @@ Configuring a pin
71 which would turn GPA(0) into the lowest Address line A0, and set 130 which would turn GPA(0) into the lowest Address line A0, and set
72 GPE(8) to be connected to the SDIO/MMC controller's SDDAT1 line. 131 GPE(8) to be connected to the SDIO/MMC controller's SDDAT1 line.
73 132
133 The s3c_gpio_cfgpin() call is a functional replacement for this call.
134
74 135
75Reading the current configuration 136Reading the current configuration
76--------------------------------- 137---------------------------------
@@ -82,6 +143,9 @@ Reading the current configuration
82 The return value will be from the same set of values which can be 143 The return value will be from the same set of values which can be
83 passed to s3c2410_gpio_cfgpin(). 144 passed to s3c2410_gpio_cfgpin().
84 145
146 The s3c_gpio_getcfg() call should be a functional replacement for
147 this call.
148
85 149
86Configuring a pull-up resistor 150Configuring a pull-up resistor
87------------------------------ 151------------------------------
@@ -95,6 +159,10 @@ Configuring a pull-up resistor
95 Where the to value is zero to set the pull-up off, and 1 to enable 159 Where the to value is zero to set the pull-up off, and 1 to enable
96 the specified pull-up. Any other values are currently undefined. 160 the specified pull-up. Any other values are currently undefined.
97 161
162 The s3c_gpio_setpull() offers similar functionality, but with the
163 ability to encode whether the pull is up or down. Currently there
164 is no 'just on' state, so up or down must be selected.
165
98 166
99Getting the state of a PIN 167Getting the state of a PIN
100-------------------------- 168--------------------------
@@ -106,6 +174,9 @@ Getting the state of a PIN
106 This will return either zero or non-zero. Do not count on this 174 This will return either zero or non-zero. Do not count on this
107 function returning 1 if the pin is set. 175 function returning 1 if the pin is set.
108 176
177 This call is now implemented by the relevant gpiolib calls, convert
178 your board or driver to use gpiolib.
179
109 180
110Setting the state of a PIN 181Setting the state of a PIN
111-------------------------- 182--------------------------
@@ -117,6 +188,9 @@ Setting the state of a PIN
117 Which sets the given pin to the value. Use 0 to write 0, and 1 to 188 Which sets the given pin to the value. Use 0 to write 0, and 1 to
118 set the output to 1. 189 set the output to 1.
119 190
191 This call is now implemented by the relevant gpiolib calls, convert
192 your board or driver to use gpiolib.
193
120 194
121Getting the IRQ number associated with a PIN 195Getting the IRQ number associated with a PIN
122-------------------------------------------- 196--------------------------------------------
@@ -128,6 +202,9 @@ Getting the IRQ number associated with a PIN
128 202
129 Note, not all pins have an IRQ. 203 Note, not all pins have an IRQ.
130 204
205 This call is now implemented by the relevant gpiolib calls, convert
206 your board or driver to use gpiolib.
207
131 208
132Authour 209Authour
133------- 210-------
diff --git a/Documentation/arm/Samsung-S3C24XX/Overview.txt b/Documentation/arm/Samsung-S3C24XX/Overview.txt
index 081892df4fda..c12bfc1a00c9 100644
--- a/Documentation/arm/Samsung-S3C24XX/Overview.txt
+++ b/Documentation/arm/Samsung-S3C24XX/Overview.txt
@@ -8,10 +8,16 @@ Introduction
8 8
9 The Samsung S3C24XX range of ARM9 System-on-Chip CPUs are supported 9 The Samsung S3C24XX range of ARM9 System-on-Chip CPUs are supported
10 by the 's3c2410' architecture of ARM Linux. Currently the S3C2410, 10 by the 's3c2410' architecture of ARM Linux. Currently the S3C2410,
11 S3C2412, S3C2413, S3C2440, S3C2442 and S3C2443 devices are supported. 11 S3C2412, S3C2413, S3C2416 S3C2440, S3C2442, S3C2443 and S3C2450 devices
12 are supported.
12 13
13 Support for the S3C2400 and S3C24A0 series are in progress. 14 Support for the S3C2400 and S3C24A0 series are in progress.
14 15
16 The S3C2416 and S3C2450 devices are very similar and S3C2450 support is
17 included under the arch/arm/mach-s3c2416 directory. Note, whilst core
18 support for these SoCs is in, work on some of the extra peripherals
19 and extra interrupts is still ongoing.
20
15 21
16Configuration 22Configuration
17------------- 23-------------
@@ -209,6 +215,13 @@ GPIO
209 Newer kernels carry GPIOLIB, and support is being moved towards 215 Newer kernels carry GPIOLIB, and support is being moved towards
210 this with some of the older support in line to be removed. 216 this with some of the older support in line to be removed.
211 217
218 As of v2.6.34, the move towards using gpiolib support is almost
219 complete, and very little of the old calls are left.
220
221 See Documentation/arm/Samsung-S3C24XX/GPIO.txt for the S3C24XX specific
222 support and Documentation/arm/Samsung/GPIO.txt for the core Samsung
223 implementation.
224
212 225
213Clock Management 226Clock Management
214---------------- 227----------------
diff --git a/Documentation/arm/Samsung/GPIO.txt b/Documentation/arm/Samsung/GPIO.txt
new file mode 100644
index 000000000000..05850c62abeb
--- /dev/null
+++ b/Documentation/arm/Samsung/GPIO.txt
@@ -0,0 +1,42 @@
1 Samsung GPIO implementation
2 ===========================
3
4Introduction
5------------
6
7This outlines the Samsung GPIO implementation and the architecture
8specfic calls provided alongisde the drivers/gpio core.
9
10
11S3C24XX (Legacy)
12----------------
13
14See Documentation/arm/Samsung-S3C24XX/GPIO.txt for more information
15about these devices. Their implementation is being brought into line
16with the core samsung implementation described in this document.
17
18
19GPIOLIB integration
20-------------------
21
22The gpio implementation uses gpiolib as much as possible, only providing
23specific calls for the items that require Samsung specific handling, such
24as pin special-function or pull resistor control.
25
26GPIO numbering is synchronised between the Samsung and gpiolib system.
27
28
29PIN configuration
30-----------------
31
32Pin configuration is specific to the Samsung architecutre, with each SoC
33registering the necessary information for the core gpio configuration
34implementation to configure pins as necessary.
35
36The s3c_gpio_cfgpin() and s3c_gpio_setpull() provide the means for a
37driver or machine to change gpio configuration.
38
39See arch/arm/plat-samsung/include/plat/gpio-cfg.h for more information
40on these functions.
41
42
diff --git a/Documentation/arm/Samsung/Overview.txt b/Documentation/arm/Samsung/Overview.txt
index 7cced1fea9c3..c3094ea51aa7 100644
--- a/Documentation/arm/Samsung/Overview.txt
+++ b/Documentation/arm/Samsung/Overview.txt
@@ -13,9 +13,10 @@ Introduction
13 13
14 - S3C24XX: See Documentation/arm/Samsung-S3C24XX/Overview.txt for full list 14 - S3C24XX: See Documentation/arm/Samsung-S3C24XX/Overview.txt for full list
15 - S3C64XX: S3C6400 and S3C6410 15 - S3C64XX: S3C6400 and S3C6410
16 - S5PC6440 16 - S5P6440
17 17 - S5P6442
18 S5PC100 and S5PC110 support is currently being merged 18 - S5PC100
19 - S5PC110 / S5PV210
19 20
20 21
21S3C24XX Systems 22S3C24XX Systems
@@ -35,7 +36,10 @@ Configuration
35 unifying all the SoCs into one kernel. 36 unifying all the SoCs into one kernel.
36 37
37 s5p6440_defconfig - S5P6440 specific default configuration 38 s5p6440_defconfig - S5P6440 specific default configuration
39 s5p6442_defconfig - S5P6442 specific default configuration
38 s5pc100_defconfig - S5PC100 specific default configuration 40 s5pc100_defconfig - S5PC100 specific default configuration
41 s5pc110_defconfig - S5PC110 specific default configuration
42 s5pv210_defconfig - S5PV210 specific default configuration
39 43
40 44
41Layout 45Layout
@@ -50,18 +54,27 @@ Layout
50 specific information. It contains the base clock, GPIO and device definitions 54 specific information. It contains the base clock, GPIO and device definitions
51 to get the system running. 55 to get the system running.
52 56
53 plat-s3c is the s3c24xx/s3c64xx platform directory, although it is currently
54 involved in other builds this will be phased out once the relevant code is
55 moved elsewhere.
56
57 plat-s3c24xx is for s3c24xx specific builds, see the S3C24XX docs. 57 plat-s3c24xx is for s3c24xx specific builds, see the S3C24XX docs.
58 58
59 plat-s3c64xx is for the s3c64xx specific bits, see the S3C24XX docs. 59 plat-s5p is for s5p specific builds, and contains common support for the
60 S5P specific systems. Not all S5Ps use all the features in this directory
61 due to differences in the hardware.
62
63
64Layout changes
65--------------
66
67 The old plat-s3c and plat-s5pc1xx directories have been removed, with
68 support moved to either plat-samsung or plat-s5p as necessary. These moves
69 where to simplify the include and dependency issues involved with having
70 so many different platform directories.
60 71
61 plat-s5p is for s5p specific builds, more to be added. 72 It was decided to remove plat-s5pc1xx as some of the support was already
73 in plat-s5p or plat-samsung, with the S5PC110 support added with S5PV210
74 the only user was the S5PC100. The S5PC100 specific items where moved to
75 arch/arm/mach-s5pc100.
62 76
63 77
64 [ to finish ]
65 78
66 79
67Port Contributors 80Port Contributors
diff --git a/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen b/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen
index 1e6a23fdf2fc..dc460f055647 100644
--- a/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen
+++ b/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen
@@ -7,7 +7,7 @@ The driver only implements a four-wire touch panel protocol.
7 7
8The touchscreen driver is maintenance free except for the pen-down or 8The touchscreen driver is maintenance free except for the pen-down or
9touch threshold. Some resistive displays and board combinations may 9touch threshold. Some resistive displays and board combinations may
10require tuning of this threshold. The driver exposes some of it's 10require tuning of this threshold. The driver exposes some of its
11internal state in the sys filesystem. If the kernel is configured 11internal state in the sys filesystem. If the kernel is configured
12with it, CONFIG_SYSFS, and sysfs is mounted at /sys, there will be a 12with it, CONFIG_SYSFS, and sysfs is mounted at /sys, there will be a
13directory 13directory
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index 396bec3b74ed..ac4d47187122 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -320,7 +320,7 @@ counter decrement would not become globally visible until the
320obj->active update does. 320obj->active update does.
321 321
322As a historical note, 32-bit Sparc used to only allow usage of 322As a historical note, 32-bit Sparc used to only allow usage of
32324-bits of it's atomic_t type. This was because it used 8 bits 32324-bits of its atomic_t type. This was because it used 8 bits
324as a spinlock for SMP safety. Sparc32 lacked a "compare and swap" 324as a spinlock for SMP safety. Sparc32 lacked a "compare and swap"
325type instruction. However, 32-bit Sparc has since been moved over 325type instruction. However, 32-bit Sparc has since been moved over
326to a "hash table of spinlocks" scheme, that allows the full 32-bit 326to a "hash table of spinlocks" scheme, that allows the full 32-bit
diff --git a/Documentation/blackfin/bfin-gpio-notes.txt b/Documentation/blackfin/bfin-gpio-notes.txt
index 9898c7ded7d3..f731c1e56475 100644
--- a/Documentation/blackfin/bfin-gpio-notes.txt
+++ b/Documentation/blackfin/bfin-gpio-notes.txt
@@ -43,7 +43,7 @@
43 void bfin_gpio_irq_free(unsigned gpio); 43 void bfin_gpio_irq_free(unsigned gpio);
44 44
45 The request functions will record the function state for a certain pin, 45 The request functions will record the function state for a certain pin,
46 the free functions will clear it's function state. 46 the free functions will clear its function state.
47 Once a pin is requested, it can't be requested again before it is freed by 47 Once a pin is requested, it can't be requested again before it is freed by
48 previous caller, otherwise kernel will dump stacks, and the request 48 previous caller, otherwise kernel will dump stacks, and the request
49 function fail. 49 function fail.
diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt
index 2b5f823abd03..9164ae3b83bc 100644
--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -5,7 +5,7 @@
5 5
6This document describes the cache/tlb flushing interfaces called 6This document describes the cache/tlb flushing interfaces called
7by the Linux VM subsystem. It enumerates over each interface, 7by the Linux VM subsystem. It enumerates over each interface,
8describes it's intended purpose, and what side effect is expected 8describes its intended purpose, and what side effect is expected
9after the interface is invoked. 9after the interface is invoked.
10 10
11The side effects described below are stated for a uniprocessor 11The side effects described below are stated for a uniprocessor
@@ -231,7 +231,7 @@ require a whole different set of interfaces to handle properly.
231The biggest problem is that of virtual aliasing in the data cache 231The biggest problem is that of virtual aliasing in the data cache
232of a processor. 232of a processor.
233 233
234Is your port susceptible to virtual aliasing in it's D-cache? 234Is your port susceptible to virtual aliasing in its D-cache?
235Well, if your D-cache is virtually indexed, is larger in size than 235Well, if your D-cache is virtually indexed, is larger in size than
236PAGE_SIZE, and does not prevent multiple cache lines for the same 236PAGE_SIZE, and does not prevent multiple cache lines for the same
237physical address from existing at once, you have this problem. 237physical address from existing at once, you have this problem.
@@ -249,7 +249,7 @@ one way to solve this (in particular SPARC_FLAG_MMAPSHARED).
249Next, you have to solve the D-cache aliasing issue for all 249Next, you have to solve the D-cache aliasing issue for all
250other cases. Please keep in mind that fact that, for a given page 250other cases. Please keep in mind that fact that, for a given page
251mapped into some user address space, there is always at least one more 251mapped into some user address space, there is always at least one more
252mapping, that of the kernel in it's linear mapping starting at 252mapping, that of the kernel in its linear mapping starting at
253PAGE_OFFSET. So immediately, once the first user maps a given 253PAGE_OFFSET. So immediately, once the first user maps a given
254physical page into its address space, by implication the D-cache 254physical page into its address space, by implication the D-cache
255aliasing problem has the potential to exist since the kernel already 255aliasing problem has the potential to exist since the kernel already
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 630879cd9a42..48e0b21b0059 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -17,6 +17,9 @@ HOWTO
17You can do a very simple testing of running two dd threads in two different 17You can do a very simple testing of running two dd threads in two different
18cgroups. Here is what you can do. 18cgroups. Here is what you can do.
19 19
20- Enable Block IO controller
21 CONFIG_BLK_CGROUP=y
22
20- Enable group scheduling in CFQ 23- Enable group scheduling in CFQ
21 CONFIG_CFQ_GROUP_IOSCHED=y 24 CONFIG_CFQ_GROUP_IOSCHED=y
22 25
@@ -54,32 +57,52 @@ cgroups. Here is what you can do.
54 57
55Various user visible config options 58Various user visible config options
56=================================== 59===================================
57CONFIG_CFQ_GROUP_IOSCHED
58 - Enables group scheduling in CFQ. Currently only 1 level of group
59 creation is allowed.
60
61CONFIG_DEBUG_CFQ_IOSCHED
62 - Enables some debugging messages in blktrace. Also creates extra
63 cgroup file blkio.dequeue.
64
65Config options selected automatically
66=====================================
67These config options are not user visible and are selected/deselected
68automatically based on IO scheduler configuration.
69
70CONFIG_BLK_CGROUP 60CONFIG_BLK_CGROUP
71 - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED. 61 - Block IO controller.
72 62
73CONFIG_DEBUG_BLK_CGROUP 63CONFIG_DEBUG_BLK_CGROUP
74 - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED. 64 - Debug help. Right now some additional stats file show up in cgroup
65 if this option is enabled.
66
67CONFIG_CFQ_GROUP_IOSCHED
68 - Enables group scheduling in CFQ. Currently only 1 level of group
69 creation is allowed.
75 70
76Details of cgroup files 71Details of cgroup files
77======================= 72=======================
78- blkio.weight 73- blkio.weight
79 - Specifies per cgroup weight. 74 - Specifies per cgroup weight. This is default weight of the group
80 75 on all the devices until and unless overridden by per device rule.
76 (See blkio.weight_device).
81 Currently allowed range of weights is from 100 to 1000. 77 Currently allowed range of weights is from 100 to 1000.
82 78
79- blkio.weight_device
80 - One can specify per cgroup per device rules using this interface.
81 These rules override the default value of group weight as specified
82 by blkio.weight.
83
84 Following is the format.
85
86 #echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device
87 Configure weight=300 on /dev/sdb (8:16) in this cgroup
88 # echo 8:16 300 > blkio.weight_device
89 # cat blkio.weight_device
90 dev weight
91 8:16 300
92
93 Configure weight=500 on /dev/sda (8:0) in this cgroup
94 # echo 8:0 500 > blkio.weight_device
95 # cat blkio.weight_device
96 dev weight
97 8:0 500
98 8:16 300
99
100 Remove specific weight for /dev/sda in this cgroup
101 # echo 8:0 0 > blkio.weight_device
102 # cat blkio.weight_device
103 dev weight
104 8:16 300
105
83- blkio.time 106- blkio.time
84 - disk time allocated to cgroup per device in milliseconds. First 107 - disk time allocated to cgroup per device in milliseconds. First
85 two fields specify the major and minor number of the device and 108 two fields specify the major and minor number of the device and
@@ -92,13 +115,105 @@ Details of cgroup files
92 third field specifies the number of sectors transferred by the 115 third field specifies the number of sectors transferred by the
93 group to/from the device. 116 group to/from the device.
94 117
118- blkio.io_service_bytes
119 - Number of bytes transferred to/from the disk by the group. These
120 are further divided by the type of operation - read or write, sync
121 or async. First two fields specify the major and minor number of the
122 device, third field specifies the operation type and the fourth field
123 specifies the number of bytes.
124
125- blkio.io_serviced
126 - Number of IOs completed to/from the disk by the group. These
127 are further divided by the type of operation - read or write, sync
128 or async. First two fields specify the major and minor number of the
129 device, third field specifies the operation type and the fourth field
130 specifies the number of IOs.
131
132- blkio.io_service_time
133 - Total amount of time between request dispatch and request completion
134 for the IOs done by this cgroup. This is in nanoseconds to make it
135 meaningful for flash devices too. For devices with queue depth of 1,
136 this time represents the actual service time. When queue_depth > 1,
137 that is no longer true as requests may be served out of order. This
138 may cause the service time for a given IO to include the service time
139 of multiple IOs when served out of order which may result in total
140 io_service_time > actual time elapsed. This time is further divided by
141 the type of operation - read or write, sync or async. First two fields
142 specify the major and minor number of the device, third field
143 specifies the operation type and the fourth field specifies the
144 io_service_time in ns.
145
146- blkio.io_wait_time
147 - Total amount of time the IOs for this cgroup spent waiting in the
148 scheduler queues for service. This can be greater than the total time
149 elapsed since it is cumulative io_wait_time for all IOs. It is not a
150 measure of total time the cgroup spent waiting but rather a measure of
151 the wait_time for its individual IOs. For devices with queue_depth > 1
152 this metric does not include the time spent waiting for service once
153 the IO is dispatched to the device but till it actually gets serviced
154 (there might be a time lag here due to re-ordering of requests by the
155 device). This is in nanoseconds to make it meaningful for flash
156 devices too. This time is further divided by the type of operation -
157 read or write, sync or async. First two fields specify the major and
158 minor number of the device, third field specifies the operation type
159 and the fourth field specifies the io_wait_time in ns.
160
161- blkio.io_merged
162 - Total number of bios/requests merged into requests belonging to this
163 cgroup. This is further divided by the type of operation - read or
164 write, sync or async.
165
166- blkio.io_queued
167 - Total number of requests queued up at any given instant for this
168 cgroup. This is further divided by the type of operation - read or
169 write, sync or async.
170
171- blkio.avg_queue_size
172 - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
173 The average queue size for this cgroup over the entire time of this
174 cgroup's existence. Queue size samples are taken each time one of the
175 queues of this cgroup gets a timeslice.
176
177- blkio.group_wait_time
178 - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
179 This is the amount of time the cgroup had to wait since it became busy
180 (i.e., went from 0 to 1 request queued) to get a timeslice for one of
181 its queues. This is different from the io_wait_time which is the
182 cumulative total of the amount of time spent by each IO in that cgroup
183 waiting in the scheduler queue. This is in nanoseconds. If this is
184 read when the cgroup is in a waiting (for timeslice) state, the stat
185 will only report the group_wait_time accumulated till the last time it
186 got a timeslice and will not include the current delta.
187
188- blkio.empty_time
189 - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
190 This is the amount of time a cgroup spends without any pending
191 requests when not being served, i.e., it does not include any time
192 spent idling for one of the queues of the cgroup. This is in
193 nanoseconds. If this is read when the cgroup is in an empty state,
194 the stat will only report the empty_time accumulated till the last
195 time it had a pending request and will not include the current delta.
196
197- blkio.idle_time
198 - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
199 This is the amount of time spent by the IO scheduler idling for a
200 given cgroup in anticipation of a better request than the exising ones
201 from other queues/cgroups. This is in nanoseconds. If this is read
202 when the cgroup is in an idling state, the stat will only report the
203 idle_time accumulated till the last idle period and will not include
204 the current delta.
205
95- blkio.dequeue 206- blkio.dequeue
96 - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This 207 - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
97 gives the statistics about how many a times a group was dequeued 208 gives the statistics about how many a times a group was dequeued
98 from service tree of the device. First two fields specify the major 209 from service tree of the device. First two fields specify the major
99 and minor number of the device and third field specifies the number 210 and minor number of the device and third field specifies the number
100 of times a group was dequeued from a particular device. 211 of times a group was dequeued from a particular device.
101 212
213- blkio.reset_stats
214 - Writing an int to this file will result in resetting all the stats
215 for that cgroup.
216
102CFQ sysfs tunable 217CFQ sysfs tunable
103================= 218=================
104/sys/block/<disk>/queue/iosched/group_isolation 219/sys/block/<disk>/queue/iosched/group_isolation
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index a1ca5924faff..b34823ff1646 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -339,7 +339,7 @@ To mount a cgroup hierarchy with all available subsystems, type:
339The "xxx" is not interpreted by the cgroup code, but will appear in 339The "xxx" is not interpreted by the cgroup code, but will appear in
340/proc/mounts so may be any useful identifying string that you like. 340/proc/mounts so may be any useful identifying string that you like.
341 341
342To mount a cgroup hierarchy with just the cpuset and numtasks 342To mount a cgroup hierarchy with just the cpuset and memory
343subsystems, type: 343subsystems, type:
344# mount -t cgroup -o cpuset,memory hier1 /dev/cgroup 344# mount -t cgroup -o cpuset,memory hier1 /dev/cgroup
345 345
@@ -572,7 +572,7 @@ void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
572 572
573Called when a task attach operation has failed after can_attach() has succeeded. 573Called when a task attach operation has failed after can_attach() has succeeded.
574A subsystem whose can_attach() has some side-effects should provide this 574A subsystem whose can_attach() has some side-effects should provide this
575function, so that the subsytem can implement a rollback. If not, not necessary. 575function, so that the subsystem can implement a rollback. If not, not necessary.
576This will be called only about subsystems whose can_attach() operation have 576This will be called only about subsystems whose can_attach() operation have
577succeeded. 577succeeded.
578 578
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt
index 4160df82b3f5..51682ab2dd1a 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -42,7 +42,7 @@ Nodes to a set of tasks. In this document "Memory Node" refers to
42an on-line node that contains memory. 42an on-line node that contains memory.
43 43
44Cpusets constrain the CPU and Memory placement of tasks to only 44Cpusets constrain the CPU and Memory placement of tasks to only
45the resources within a tasks current cpuset. They form a nested 45the resources within a task's current cpuset. They form a nested
46hierarchy visible in a virtual file system. These are the essential 46hierarchy visible in a virtual file system. These are the essential
47hooks, beyond what is already present, required to manage dynamic 47hooks, beyond what is already present, required to manage dynamic
48job placement on large systems. 48job placement on large systems.
@@ -53,11 +53,11 @@ Documentation/cgroups/cgroups.txt.
53Requests by a task, using the sched_setaffinity(2) system call to 53Requests by a task, using the sched_setaffinity(2) system call to
54include CPUs in its CPU affinity mask, and using the mbind(2) and 54include CPUs in its CPU affinity mask, and using the mbind(2) and
55set_mempolicy(2) system calls to include Memory Nodes in its memory 55set_mempolicy(2) system calls to include Memory Nodes in its memory
56policy, are both filtered through that tasks cpuset, filtering out any 56policy, are both filtered through that task's cpuset, filtering out any
57CPUs or Memory Nodes not in that cpuset. The scheduler will not 57CPUs or Memory Nodes not in that cpuset. The scheduler will not
58schedule a task on a CPU that is not allowed in its cpus_allowed 58schedule a task on a CPU that is not allowed in its cpus_allowed
59vector, and the kernel page allocator will not allocate a page on a 59vector, and the kernel page allocator will not allocate a page on a
60node that is not allowed in the requesting tasks mems_allowed vector. 60node that is not allowed in the requesting task's mems_allowed vector.
61 61
62User level code may create and destroy cpusets by name in the cgroup 62User level code may create and destroy cpusets by name in the cgroup
63virtual file system, manage the attributes and permissions of these 63virtual file system, manage the attributes and permissions of these
@@ -121,9 +121,9 @@ Cpusets extends these two mechanisms as follows:
121 - Each task in the system is attached to a cpuset, via a pointer 121 - Each task in the system is attached to a cpuset, via a pointer
122 in the task structure to a reference counted cgroup structure. 122 in the task structure to a reference counted cgroup structure.
123 - Calls to sched_setaffinity are filtered to just those CPUs 123 - Calls to sched_setaffinity are filtered to just those CPUs
124 allowed in that tasks cpuset. 124 allowed in that task's cpuset.
125 - Calls to mbind and set_mempolicy are filtered to just 125 - Calls to mbind and set_mempolicy are filtered to just
126 those Memory Nodes allowed in that tasks cpuset. 126 those Memory Nodes allowed in that task's cpuset.
127 - The root cpuset contains all the systems CPUs and Memory 127 - The root cpuset contains all the systems CPUs and Memory
128 Nodes. 128 Nodes.
129 - For any cpuset, one can define child cpusets containing a subset 129 - For any cpuset, one can define child cpusets containing a subset
@@ -141,11 +141,11 @@ into the rest of the kernel, none in performance critical paths:
141 - in init/main.c, to initialize the root cpuset at system boot. 141 - in init/main.c, to initialize the root cpuset at system boot.
142 - in fork and exit, to attach and detach a task from its cpuset. 142 - in fork and exit, to attach and detach a task from its cpuset.
143 - in sched_setaffinity, to mask the requested CPUs by what's 143 - in sched_setaffinity, to mask the requested CPUs by what's
144 allowed in that tasks cpuset. 144 allowed in that task's cpuset.
145 - in sched.c migrate_live_tasks(), to keep migrating tasks within 145 - in sched.c migrate_live_tasks(), to keep migrating tasks within
146 the CPUs allowed by their cpuset, if possible. 146 the CPUs allowed by their cpuset, if possible.
147 - in the mbind and set_mempolicy system calls, to mask the requested 147 - in the mbind and set_mempolicy system calls, to mask the requested
148 Memory Nodes by what's allowed in that tasks cpuset. 148 Memory Nodes by what's allowed in that task's cpuset.
149 - in page_alloc.c, to restrict memory to allowed nodes. 149 - in page_alloc.c, to restrict memory to allowed nodes.
150 - in vmscan.c, to restrict page recovery to the current cpuset. 150 - in vmscan.c, to restrict page recovery to the current cpuset.
151 151
@@ -155,7 +155,7 @@ new system calls are added for cpusets - all support for querying and
155modifying cpusets is via this cpuset file system. 155modifying cpusets is via this cpuset file system.
156 156
157The /proc/<pid>/status file for each task has four added lines, 157The /proc/<pid>/status file for each task has four added lines,
158displaying the tasks cpus_allowed (on which CPUs it may be scheduled) 158displaying the task's cpus_allowed (on which CPUs it may be scheduled)
159and mems_allowed (on which Memory Nodes it may obtain memory), 159and mems_allowed (on which Memory Nodes it may obtain memory),
160in the two formats seen in the following example: 160in the two formats seen in the following example:
161 161
@@ -323,17 +323,17 @@ stack segment pages of a task.
323 323
324By default, both kinds of memory spreading are off, and memory 324By default, both kinds of memory spreading are off, and memory
325pages are allocated on the node local to where the task is running, 325pages are allocated on the node local to where the task is running,
326except perhaps as modified by the tasks NUMA mempolicy or cpuset 326except perhaps as modified by the task's NUMA mempolicy or cpuset
327configuration, so long as sufficient free memory pages are available. 327configuration, so long as sufficient free memory pages are available.
328 328
329When new cpusets are created, they inherit the memory spread settings 329When new cpusets are created, they inherit the memory spread settings
330of their parent. 330of their parent.
331 331
332Setting memory spreading causes allocations for the affected page 332Setting memory spreading causes allocations for the affected page
333or slab caches to ignore the tasks NUMA mempolicy and be spread 333or slab caches to ignore the task's NUMA mempolicy and be spread
334instead. Tasks using mbind() or set_mempolicy() calls to set NUMA 334instead. Tasks using mbind() or set_mempolicy() calls to set NUMA
335mempolicies will not notice any change in these calls as a result of 335mempolicies will not notice any change in these calls as a result of
336their containing tasks memory spread settings. If memory spreading 336their containing task's memory spread settings. If memory spreading
337is turned off, then the currently specified NUMA mempolicy once again 337is turned off, then the currently specified NUMA mempolicy once again
338applies to memory page allocations. 338applies to memory page allocations.
339 339
@@ -357,7 +357,7 @@ pages from the node returned by cpuset_mem_spread_node().
357 357
358The cpuset_mem_spread_node() routine is also simple. It uses the 358The cpuset_mem_spread_node() routine is also simple. It uses the
359value of a per-task rotor cpuset_mem_spread_rotor to select the next 359value of a per-task rotor cpuset_mem_spread_rotor to select the next
360node in the current tasks mems_allowed to prefer for the allocation. 360node in the current task's mems_allowed to prefer for the allocation.
361 361
362This memory placement policy is also known (in other contexts) as 362This memory placement policy is also known (in other contexts) as
363round-robin or interleave. 363round-robin or interleave.
@@ -594,7 +594,7 @@ is attached, is subtle.
594If a cpuset has its Memory Nodes modified, then for each task attached 594If a cpuset has its Memory Nodes modified, then for each task attached
595to that cpuset, the next time that the kernel attempts to allocate 595to that cpuset, the next time that the kernel attempts to allocate
596a page of memory for that task, the kernel will notice the change 596a page of memory for that task, the kernel will notice the change
597in the tasks cpuset, and update its per-task memory placement to 597in the task's cpuset, and update its per-task memory placement to
598remain within the new cpusets memory placement. If the task was using 598remain within the new cpusets memory placement. If the task was using
599mempolicy MPOL_BIND, and the nodes to which it was bound overlap with 599mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
600its new cpuset, then the task will continue to use whatever subset 600its new cpuset, then the task will continue to use whatever subset
@@ -603,13 +603,13 @@ was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
603in the new cpuset, then the task will be essentially treated as if it 603in the new cpuset, then the task will be essentially treated as if it
604was MPOL_BIND bound to the new cpuset (even though its NUMA placement, 604was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
605as queried by get_mempolicy(), doesn't change). If a task is moved 605as queried by get_mempolicy(), doesn't change). If a task is moved
606from one cpuset to another, then the kernel will adjust the tasks 606from one cpuset to another, then the kernel will adjust the task's
607memory placement, as above, the next time that the kernel attempts 607memory placement, as above, the next time that the kernel attempts
608to allocate a page of memory for that task. 608to allocate a page of memory for that task.
609 609
610If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset 610If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
611will have its allowed CPU placement changed immediately. Similarly, 611will have its allowed CPU placement changed immediately. Similarly,
612if a tasks pid is written to another cpusets 'cpuset.tasks' file, then its 612if a task's pid is written to another cpusets 'cpuset.tasks' file, then its
613allowed CPU placement is changed immediately. If such a task had been 613allowed CPU placement is changed immediately. If such a task had been
614bound to some subset of its cpuset using the sched_setaffinity() call, 614bound to some subset of its cpuset using the sched_setaffinity() call,
615the task will be allowed to run on any CPU allowed in its new cpuset, 615the task will be allowed to run on any CPU allowed in its new cpuset,
@@ -626,16 +626,16 @@ cpusets memory placement policy 'cpuset.mems' subsequently changes.
626If the cpuset flag file 'cpuset.memory_migrate' is set true, then when 626If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
627tasks are attached to that cpuset, any pages that task had 627tasks are attached to that cpuset, any pages that task had
628allocated to it on nodes in its previous cpuset are migrated 628allocated to it on nodes in its previous cpuset are migrated
629to the tasks new cpuset. The relative placement of the page within 629to the task's new cpuset. The relative placement of the page within
630the cpuset is preserved during these migration operations if possible. 630the cpuset is preserved during these migration operations if possible.
631For example if the page was on the second valid node of the prior cpuset 631For example if the page was on the second valid node of the prior cpuset
632then the page will be placed on the second valid node of the new cpuset. 632then the page will be placed on the second valid node of the new cpuset.
633 633
634Also if 'cpuset.memory_migrate' is set true, then if that cpusets 634Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
635'cpuset.mems' file is modified, pages allocated to tasks in that 635'cpuset.mems' file is modified, pages allocated to tasks in that
636cpuset, that were on nodes in the previous setting of 'cpuset.mems', 636cpuset, that were on nodes in the previous setting of 'cpuset.mems',
637will be moved to nodes in the new setting of 'mems.' 637will be moved to nodes in the new setting of 'mems.'
638Pages that were not in the tasks prior cpuset, or in the cpusets 638Pages that were not in the task's prior cpuset, or in the cpuset's
639prior 'cpuset.mems' setting, will not be moved. 639prior 'cpuset.mems' setting, will not be moved.
640 640
641There is an exception to the above. If hotplug functionality is used 641There is an exception to the above. If hotplug functionality is used
@@ -655,7 +655,7 @@ There is a second exception to the above. GFP_ATOMIC requests are
655kernel internal allocations that must be satisfied, immediately. 655kernel internal allocations that must be satisfied, immediately.
656The kernel may drop some request, in rare cases even panic, if a 656The kernel may drop some request, in rare cases even panic, if a
657GFP_ATOMIC alloc fails. If the request cannot be satisfied within 657GFP_ATOMIC alloc fails. If the request cannot be satisfied within
658the current tasks cpuset, then we relax the cpuset, and look for 658the current task's cpuset, then we relax the cpuset, and look for
659memory anywhere we can find it. It's better to violate the cpuset 659memory anywhere we can find it. It's better to violate the cpuset
660than stress the kernel. 660than stress the kernel.
661 661
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt
index f7f68b2ac199..b7eececfb195 100644
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -244,7 +244,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
244 we have to check if OLDPAGE/NEWPAGE is a valid page after commit(). 244 we have to check if OLDPAGE/NEWPAGE is a valid page after commit().
245 245
2468. LRU 2468. LRU
247 Each memcg has its own private LRU. Now, it's handling is under global 247 Each memcg has its own private LRU. Now, its handling is under global
248 VM's control (means that it's handled under global zone->lru_lock). 248 VM's control (means that it's handled under global zone->lru_lock).
249 Almost all routines around memcg's LRU is called by global LRU's 249 Almost all routines around memcg's LRU is called by global LRU's
250 list management functions under zone->lru_lock(). 250 list management functions under zone->lru_lock().
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 3a6aecd078ba..7781857dc940 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -1,18 +1,15 @@
1Memory Resource Controller 1Memory Resource Controller
2 2
3NOTE: The Memory Resource Controller has been generically been referred 3NOTE: The Memory Resource Controller has been generically been referred
4to as the memory controller in this document. Do not confuse memory controller 4 to as the memory controller in this document. Do not confuse memory
5used here with the memory controller that is used in hardware. 5 controller used here with the memory controller that is used in hardware.
6 6
7Salient features 7(For editors)
8 8In this document:
9a. Enable control of Anonymous, Page Cache (mapped and unmapped) and 9 When we mention a cgroup (cgroupfs's directory) with memory controller,
10 Swap Cache memory pages. 10 we call it "memory cgroup". When you see git-log and source code, you'll
11b. The infrastructure allows easy addition of other types of memory to control 11 see patch's title and function names tend to use "memcg".
12c. Provides *zero overhead* for non memory controller users 12 In this document, we avoid using it.
13d. Provides a double LRU: global memory pressure causes reclaim from the
14 global LRU; a cgroup on hitting a limit, reclaims from the per
15 cgroup LRU
16 13
17Benefits and Purpose of the memory controller 14Benefits and Purpose of the memory controller
18 15
@@ -33,6 +30,45 @@ d. A CD/DVD burner could control the amount of memory used by the
33e. There are several other use cases, find one or use the controller just 30e. There are several other use cases, find one or use the controller just
34 for fun (to learn and hack on the VM subsystem). 31 for fun (to learn and hack on the VM subsystem).
35 32
33Current Status: linux-2.6.34-mmotm(development version of 2010/April)
34
35Features:
36 - accounting anonymous pages, file caches, swap caches usage and limiting them.
37 - private LRU and reclaim routine. (system's global LRU and private LRU
38 work independently from each other)
39 - optionally, memory+swap usage can be accounted and limited.
40 - hierarchical accounting
41 - soft limit
42 - moving(recharging) account at moving a task is selectable.
43 - usage threshold notifier
44 - oom-killer disable knob and oom-notifier
45 - Root cgroup has no limit controls.
46
47 Kernel memory and Hugepages are not under control yet. We just manage
48 pages on LRU. To add more controls, we have to take care of performance.
49
50Brief summary of control files.
51
52 tasks # attach a task(thread) and show list of threads
53 cgroup.procs # show list of processes
54 cgroup.event_control # an interface for event_fd()
55 memory.usage_in_bytes # show current memory(RSS+Cache) usage.
56 memory.memsw.usage_in_bytes # show current memory+Swap usage
57 memory.limit_in_bytes # set/show limit of memory usage
58 memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage
59 memory.failcnt # show the number of memory usage hits limits
60 memory.memsw.failcnt # show the number of memory+Swap hits limits
61 memory.max_usage_in_bytes # show max memory usage recorded
62 memory.memsw.usage_in_bytes # show max memory+Swap usage recorded
63 memory.soft_limit_in_bytes # set/show soft limit of memory usage
64 memory.stat # show various statistics
65 memory.use_hierarchy # set/show hierarchical account enabled
66 memory.force_empty # trigger forced move charge to parent
67 memory.swappiness # set/show swappiness parameter of vmscan
68 (See sysctl's vm.swappiness)
69 memory.move_charge_at_immigrate # set/show controls of moving charges
70 memory.oom_control # set/show oom controls.
71
361. History 721. History
37 73
38The memory controller has a long history. A request for comments for the memory 74The memory controller has a long history. A request for comments for the memory
@@ -106,14 +142,14 @@ the necessary data structures and check if the cgroup that is being charged
106is over its limit. If it is then reclaim is invoked on the cgroup. 142is over its limit. If it is then reclaim is invoked on the cgroup.
107More details can be found in the reclaim section of this document. 143More details can be found in the reclaim section of this document.
108If everything goes well, a page meta-data-structure called page_cgroup is 144If everything goes well, a page meta-data-structure called page_cgroup is
109allocated and associated with the page. This routine also adds the page to 145updated. page_cgroup has its own LRU on cgroup.
110the per cgroup LRU. 146(*) page_cgroup structure is allocated at boot/memory-hotplug time.
111 147
1122.2.1 Accounting details 1482.2.1 Accounting details
113 149
114All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. 150All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
115(some pages which never be reclaimable and will not be on global LRU 151Some pages which are never reclaimable and will not be on the global LRU
116 are not accounted. we just accounts pages under usual vm management.) 152are not accounted. We just account pages under usual VM management.
117 153
118RSS pages are accounted at page_fault unless they've already been accounted 154RSS pages are accounted at page_fault unless they've already been accounted
119for earlier. A file page will be accounted for as Page Cache when it's 155for earlier. A file page will be accounted for as Page Cache when it's
@@ -121,12 +157,19 @@ inserted into inode (radix-tree). While it's mapped into the page tables of
121processes, duplicate accounting is carefully avoided. 157processes, duplicate accounting is carefully avoided.
122 158
123A RSS page is unaccounted when it's fully unmapped. A PageCache page is 159A RSS page is unaccounted when it's fully unmapped. A PageCache page is
124unaccounted when it's removed from radix-tree. 160unaccounted when it's removed from radix-tree. Even if RSS pages are fully
161unmapped (by kswapd), they may exist as SwapCache in the system until they
162are really freed. Such SwapCaches also also accounted.
163A swapped-in page is not accounted until it's mapped.
164
165Note: The kernel does swapin-readahead and read multiple swaps at once.
166This means swapped-in pages may contain pages for other tasks than a task
167causing page fault. So, we avoid accounting at swap-in I/O.
125 168
126At page migration, accounting information is kept. 169At page migration, accounting information is kept.
127 170
128Note: we just account pages-on-lru because our purpose is to control amount 171Note: we just account pages-on-LRU because our purpose is to control amount
129of used pages. not-on-lru pages are tend to be out-of-control from vm view. 172of used pages; not-on-LRU pages tend to be out-of-control from VM view.
130 173
1312.3 Shared Page Accounting 1742.3 Shared Page Accounting
132 175
@@ -143,6 +186,7 @@ caller of swapoff rather than the users of shmem.
143 186
144 187
1452.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) 1882.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP)
189
146Swap Extension allows you to record charge for swap. A swapped-in page is 190Swap Extension allows you to record charge for swap. A swapped-in page is
147charged back to original page allocator if possible. 191charged back to original page allocator if possible.
148 192
@@ -150,13 +194,20 @@ When swap is accounted, following files are added.
150 - memory.memsw.usage_in_bytes. 194 - memory.memsw.usage_in_bytes.
151 - memory.memsw.limit_in_bytes. 195 - memory.memsw.limit_in_bytes.
152 196
153usage of mem+swap is limited by memsw.limit_in_bytes. 197memsw means memory+swap. Usage of memory+swap is limited by
198memsw.limit_in_bytes.
154 199
155* why 'mem+swap' rather than swap. 200Example: Assume a system with 4G of swap. A task which allocates 6G of memory
201(by mistake) under 2G memory limitation will use all swap.
202In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
203By using memsw limit, you can avoid system OOM which can be caused by swap
204shortage.
205
206* why 'memory+swap' rather than swap.
156The global LRU(kswapd) can swap out arbitrary pages. Swap-out means 207The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
157to move account from memory to swap...there is no change in usage of 208to move account from memory to swap...there is no change in usage of
158mem+swap. In other words, when we want to limit the usage of swap without 209memory+swap. In other words, when we want to limit the usage of swap without
159affecting global LRU, mem+swap limit is better than just limiting swap from 210affecting global LRU, memory+swap limit is better than just limiting swap from
160OS point of view. 211OS point of view.
161 212
162* What happens when a cgroup hits memory.memsw.limit_in_bytes 213* What happens when a cgroup hits memory.memsw.limit_in_bytes
@@ -168,12 +219,12 @@ it by cgroup.
168 219
1692.5 Reclaim 2202.5 Reclaim
170 221
171Each cgroup maintains a per cgroup LRU that consists of an active 222Each cgroup maintains a per cgroup LRU which has the same structure as
172and inactive list. When a cgroup goes over its limit, we first try 223global VM. When a cgroup goes over its limit, we first try
173to reclaim memory from the cgroup so as to make space for the new 224to reclaim memory from the cgroup so as to make space for the new
174pages that the cgroup has touched. If the reclaim is unsuccessful, 225pages that the cgroup has touched. If the reclaim is unsuccessful,
175an OOM routine is invoked to select and kill the bulkiest task in the 226an OOM routine is invoked to select and kill the bulkiest task in the
176cgroup. 227cgroup. (See 10. OOM Control below.)
177 228
178The reclaim algorithm has not been modified for cgroups, except that 229The reclaim algorithm has not been modified for cgroups, except that
179pages that are selected for reclaiming come from the per cgroup LRU 230pages that are selected for reclaiming come from the per cgroup LRU
@@ -184,13 +235,22 @@ limits on the root cgroup.
184 235
185Note2: When panic_on_oom is set to "2", the whole system will panic. 236Note2: When panic_on_oom is set to "2", the whole system will panic.
186 237
1872. Locking 238When oom event notifier is registered, event will be delivered.
239(See oom_control section)
240
2412.6 Locking
188 242
189The memory controller uses the following hierarchy 243 lock_page_cgroup()/unlock_page_cgroup() should not be called under
244 mapping->tree_lock.
190 245
1911. zone->lru_lock is used for selecting pages to be isolated 246 Other lock order is following:
1922. mem->per_zone->lru_lock protects the per cgroup LRU (per zone) 247 PG_locked.
1933. lock_page_cgroup() is used to protect page->page_cgroup 248 mm->page_table_lock
249 zone->lru_lock
250 lock_page_cgroup.
251 In many cases, just lock_page_cgroup() is called.
252 per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
253 zone->lru_lock, it has no lock of its own.
194 254
1953. User Interface 2553. User Interface
196 256
@@ -199,6 +259,7 @@ The memory controller uses the following hierarchy
199a. Enable CONFIG_CGROUPS 259a. Enable CONFIG_CGROUPS
200b. Enable CONFIG_RESOURCE_COUNTERS 260b. Enable CONFIG_RESOURCE_COUNTERS
201c. Enable CONFIG_CGROUP_MEM_RES_CTLR 261c. Enable CONFIG_CGROUP_MEM_RES_CTLR
262d. Enable CONFIG_CGROUP_MEM_RES_CTLR_SWAP (to use swap extension)
202 263
2031. Prepare the cgroups 2641. Prepare the cgroups
204# mkdir -p /cgroups 265# mkdir -p /cgroups
@@ -206,31 +267,28 @@ c. Enable CONFIG_CGROUP_MEM_RES_CTLR
206 267
2072. Make the new group and move bash into it 2682. Make the new group and move bash into it
208# mkdir /cgroups/0 269# mkdir /cgroups/0
209# echo $$ > /cgroups/0/tasks 270# echo $$ > /cgroups/0/tasks
210 271
211Since now we're in the 0 cgroup, 272Since now we're in the 0 cgroup, we can alter the memory limit:
212We can alter the memory limit:
213# echo 4M > /cgroups/0/memory.limit_in_bytes 273# echo 4M > /cgroups/0/memory.limit_in_bytes
214 274
215NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, 275NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
216mega or gigabytes. 276mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.)
277
217NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). 278NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
218NOTE: We cannot set limits on the root cgroup any more. 279NOTE: We cannot set limits on the root cgroup any more.
219 280
220# cat /cgroups/0/memory.limit_in_bytes 281# cat /cgroups/0/memory.limit_in_bytes
2214194304 2824194304
222 283
223NOTE: The interface has now changed to display the usage in bytes
224instead of pages
225
226We can check the usage: 284We can check the usage:
227# cat /cgroups/0/memory.usage_in_bytes 285# cat /cgroups/0/memory.usage_in_bytes
2281216512 2861216512
229 287
230A successful write to this file does not guarantee a successful set of 288A successful write to this file does not guarantee a successful set of
231this limit to the value written into the file. This can be due to a 289this limit to the value written into the file. This can be due to a
232number of factors, such as rounding up to page boundaries or the total 290number of factors, such as rounding up to page boundaries or the total
233availability of memory on the system. The user is required to re-read 291availability of memory on the system. The user is required to re-read
234this file after a write to guarantee the value committed by the kernel. 292this file after a write to guarantee the value committed by the kernel.
235 293
236# echo 1 > memory.limit_in_bytes 294# echo 1 > memory.limit_in_bytes
@@ -245,15 +303,23 @@ caches, RSS and Active pages/Inactive pages are shown.
245 303
2464. Testing 3044. Testing
247 305
248Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11]. 306For testing features and implementation, see memcg_test.txt.
249Apart from that v6 has been tested with several applications and regular 307
250daily use. The controller has also been tested on the PPC64, x86_64 and 308Performance test is also important. To see pure memory controller's overhead,
251UML platforms. 309testing on tmpfs will give you good numbers of small overheads.
310Example: do kernel make on tmpfs.
311
312Page-fault scalability is also important. At measuring parallel
313page fault test, multi-process test may be better than multi-thread
314test because it has noise of shared objects/status.
315
316But the above two are testing extreme situations.
317Trying usual test under memory controller is always helpful.
252 318
2534.1 Troubleshooting 3194.1 Troubleshooting
254 320
255Sometimes a user might find that the application under a cgroup is 321Sometimes a user might find that the application under a cgroup is
256terminated. There are several causes for this: 322terminated by OOM killer. There are several causes for this:
257 323
2581. The cgroup limit is too low (just too low to do anything useful) 3241. The cgroup limit is too low (just too low to do anything useful)
2592. The user is using anonymous memory and swap is turned off or too low 3252. The user is using anonymous memory and swap is turned off or too low
@@ -261,23 +327,29 @@ terminated. There are several causes for this:
261A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of 327A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
262some of the pages cached in the cgroup (page cache pages). 328some of the pages cached in the cgroup (page cache pages).
263 329
330To know what happens, disable OOM_Kill by 10. OOM Control(see below) and
331seeing what happens will be helpful.
332
2644.2 Task migration 3334.2 Task migration
265 334
266When a task migrates from one cgroup to another, it's charge is not 335When a task migrates from one cgroup to another, its charge is not
267carried forward by default. The pages allocated from the original cgroup still 336carried forward by default. The pages allocated from the original cgroup still
268remain charged to it, the charge is dropped when the page is freed or 337remain charged to it, the charge is dropped when the page is freed or
269reclaimed. 338reclaimed.
270 339
271Note: You can move charges of a task along with task migration. See 8. 340You can move charges of a task along with task migration.
341See 8. "Move charges at task migration"
272 342
2734.3 Removing a cgroup 3434.3 Removing a cgroup
274 344
275A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a 345A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
276cgroup might have some charge associated with it, even though all 346cgroup might have some charge associated with it, even though all
277tasks have migrated away from it. 347tasks have migrated away from it. (because we charge against pages, not
278Such charges are freed(at default) or moved to its parent. When moved, 348against tasks.)
279both of RSS and CACHES are moved to parent. 349
280If both of them are busy, rmdir() returns -EBUSY. See 5.1 Also. 350Such charges are freed or moved to their parent. At moving, both of RSS
351and CACHES are moved to parent.
352rmdir() may return -EBUSY if freeing/moving fails. See 5.1 also.
281 353
282Charges recorded in swap information is not updated at removal of cgroup. 354Charges recorded in swap information is not updated at removal of cgroup.
283Recorded information is discarded and a cgroup which uses swap (swapcache) 355Recorded information is discarded and a cgroup which uses swap (swapcache)
@@ -293,10 +365,10 @@ will be charged as a new owner of it.
293 365
294 # echo 0 > memory.force_empty 366 # echo 0 > memory.force_empty
295 367
296 Almost all pages tracked by this memcg will be unmapped and freed. Some of 368 Almost all pages tracked by this memory cgroup will be unmapped and freed.
297 pages cannot be freed because it's locked or in-use. Such pages are moved 369 Some pages cannot be freed because they are locked or in-use. Such pages are
298 to parent and this cgroup will be empty. But this may return -EBUSY in 370 moved to parent and this cgroup will be empty. This may return -EBUSY if
299 some too busy case. 371 VM is too busy to free/move all pages immediately.
300 372
301 Typical use case of this interface is that calling this before rmdir(). 373 Typical use case of this interface is that calling this before rmdir().
302 Because rmdir() moves all pages to parent, some out-of-use page caches can be 374 Because rmdir() moves all pages to parent, some out-of-use page caches can be
@@ -306,19 +378,41 @@ will be charged as a new owner of it.
306 378
307memory.stat file includes following statistics 379memory.stat file includes following statistics
308 380
381# per-memory cgroup local status
309cache - # of bytes of page cache memory. 382cache - # of bytes of page cache memory.
310rss - # of bytes of anonymous and swap cache memory. 383rss - # of bytes of anonymous and swap cache memory.
384mapped_file - # of bytes of mapped file (includes tmpfs/shmem)
311pgpgin - # of pages paged in (equivalent to # of charging events). 385pgpgin - # of pages paged in (equivalent to # of charging events).
312pgpgout - # of pages paged out (equivalent to # of uncharging events). 386pgpgout - # of pages paged out (equivalent to # of uncharging events).
313active_anon - # of bytes of anonymous and swap cache memory on active 387swap - # of bytes of swap usage
314 lru list.
315inactive_anon - # of bytes of anonymous memory and swap cache memory on 388inactive_anon - # of bytes of anonymous memory and swap cache memory on
316 inactive lru list. 389 LRU list.
317active_file - # of bytes of file-backed memory on active lru list. 390active_anon - # of bytes of anonymous and swap cache memory on active
318inactive_file - # of bytes of file-backed memory on inactive lru list. 391 inactive LRU list.
392inactive_file - # of bytes of file-backed memory on inactive LRU list.
393active_file - # of bytes of file-backed memory on active LRU list.
319unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). 394unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc).
320 395
321The following additional stats are dependent on CONFIG_DEBUG_VM. 396# status considering hierarchy (see memory.use_hierarchy settings)
397
398hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy
399 under which the memory cgroup is
400hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to
401 hierarchy under which memory cgroup is.
402
403total_cache - sum of all children's "cache"
404total_rss - sum of all children's "rss"
405total_mapped_file - sum of all children's "cache"
406total_pgpgin - sum of all children's "pgpgin"
407total_pgpgout - sum of all children's "pgpgout"
408total_swap - sum of all children's "swap"
409total_inactive_anon - sum of all children's "inactive_anon"
410total_active_anon - sum of all children's "active_anon"
411total_inactive_file - sum of all children's "inactive_file"
412total_active_file - sum of all children's "active_file"
413total_unevictable - sum of all children's "unevictable"
414
415# The following additional stats are dependent on CONFIG_DEBUG_VM.
322 416
323inactive_ratio - VM internal parameter. (see mm/page_alloc.c) 417inactive_ratio - VM internal parameter. (see mm/page_alloc.c)
324recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) 418recent_rotated_anon - VM internal parameter. (see mm/vmscan.c)
@@ -327,24 +421,37 @@ recent_scanned_anon - VM internal parameter. (see mm/vmscan.c)
327recent_scanned_file - VM internal parameter. (see mm/vmscan.c) 421recent_scanned_file - VM internal parameter. (see mm/vmscan.c)
328 422
329Memo: 423Memo:
330 recent_rotated means recent frequency of lru rotation. 424 recent_rotated means recent frequency of LRU rotation.
331 recent_scanned means recent # of scans to lru. 425 recent_scanned means recent # of scans to LRU.
332 showing for better debug please see the code for meanings. 426 showing for better debug please see the code for meanings.
333 427
334Note: 428Note:
335 Only anonymous and swap cache memory is listed as part of 'rss' stat. 429 Only anonymous and swap cache memory is listed as part of 'rss' stat.
336 This should not be confused with the true 'resident set size' or the 430 This should not be confused with the true 'resident set size' or the
337 amount of physical memory used by the cgroup. Per-cgroup rss 431 amount of physical memory used by the cgroup.
338 accounting is not done yet. 432 'rss + file_mapped" will give you resident set size of cgroup.
433 (Note: file and shmem may be shared among other cgroups. In that case,
434 file_mapped is accounted only when the memory cgroup is owner of page
435 cache.)
339 436
3405.3 swappiness 4375.3 swappiness
341 Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
342 438
343 Following cgroups' swappiness can't be changed. 439Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
344 - root cgroup (uses /proc/sys/vm/swappiness).
345 - a cgroup which uses hierarchy and it has child cgroup.
346 - a cgroup which uses hierarchy and not the root of hierarchy.
347 440
441Following cgroups' swappiness can't be changed.
442- root cgroup (uses /proc/sys/vm/swappiness).
443- a cgroup which uses hierarchy and it has other cgroup(s) below it.
444- a cgroup which uses hierarchy and not the root of hierarchy.
445
4465.4 failcnt
447
448A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
449This failcnt(== failure count) shows the number of times that a usage counter
450hit its limit. When a memory cgroup hits a limit, failcnt increases and
451memory under it will be reclaimed.
452
453You can reset failcnt by writing 0 to failcnt file.
454# echo 0 > .../memory.failcnt
348 455
3496. Hierarchy support 4566. Hierarchy support
350 457
@@ -363,13 +470,13 @@ hierarchy
363 470
364In the diagram above, with hierarchical accounting enabled, all memory 471In the diagram above, with hierarchical accounting enabled, all memory
365usage of e, is accounted to its ancestors up until the root (i.e, c and root), 472usage of e, is accounted to its ancestors up until the root (i.e, c and root),
366that has memory.use_hierarchy enabled. If one of the ancestors goes over its 473that has memory.use_hierarchy enabled. If one of the ancestors goes over its
367limit, the reclaim algorithm reclaims from the tasks in the ancestor and the 474limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
368children of the ancestor. 475children of the ancestor.
369 476
3706.1 Enabling hierarchical accounting and reclaim 4776.1 Enabling hierarchical accounting and reclaim
371 478
372The memory controller by default disables the hierarchy feature. Support 479A memory cgroup by default disables the hierarchy feature. Support
373can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup 480can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup
374 481
375# echo 1 > memory.use_hierarchy 482# echo 1 > memory.use_hierarchy
@@ -379,10 +486,10 @@ The feature can be disabled by
379# echo 0 > memory.use_hierarchy 486# echo 0 > memory.use_hierarchy
380 487
381NOTE1: Enabling/disabling will fail if the cgroup already has other 488NOTE1: Enabling/disabling will fail if the cgroup already has other
382cgroups created below it. 489 cgroups created below it.
383 490
384NOTE2: When panic_on_oom is set to "2", the whole system will panic in 491NOTE2: When panic_on_oom is set to "2", the whole system will panic in
385case of an oom event in any cgroup. 492 case of an OOM event in any cgroup.
386 493
3877. Soft limits 4947. Soft limits
388 495
@@ -392,7 +499,7 @@ is to allow control groups to use as much of the memory as needed, provided
392a. There is no memory contention 499a. There is no memory contention
393b. They do not exceed their hard limit 500b. They do not exceed their hard limit
394 501
395When the system detects memory contention or low memory control groups 502When the system detects memory contention or low memory, control groups
396are pushed back to their soft limits. If the soft limit of each control 503are pushed back to their soft limits. If the soft limit of each control
397group is very high, they are pushed back as much as possible to make 504group is very high, they are pushed back as much as possible to make
398sure that one control group does not starve the others of memory. 505sure that one control group does not starve the others of memory.
@@ -406,7 +513,7 @@ it gets invoked from balance_pgdat (kswapd).
4067.1 Interface 5137.1 Interface
407 514
408Soft limits can be setup by using the following commands (in this example we 515Soft limits can be setup by using the following commands (in this example we
409assume a soft limit of 256 megabytes) 516assume a soft limit of 256 MiB)
410 517
411# echo 256M > memory.soft_limit_in_bytes 518# echo 256M > memory.soft_limit_in_bytes
412 519
@@ -442,7 +549,7 @@ Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread
442Note: If we cannot find enough space for the task in the destination cgroup, we 549Note: If we cannot find enough space for the task in the destination cgroup, we
443 try to make space by reclaiming memory. Task migration may fail if we 550 try to make space by reclaiming memory. Task migration may fail if we
444 cannot make enough space. 551 cannot make enough space.
445Note: It can take several seconds if you move charges in giga bytes order. 552Note: It can take several seconds if you move charges much.
446 553
447And if you want disable it again: 554And if you want disable it again:
448 555
@@ -451,21 +558,27 @@ And if you want disable it again:
4518.2 Type of charges which can be move 5588.2 Type of charges which can be move
452 559
453Each bits of move_charge_at_immigrate has its own meaning about what type of 560Each bits of move_charge_at_immigrate has its own meaning about what type of
454charges should be moved. 561charges should be moved. But in any cases, it must be noted that an account of
562a page or a swap can be moved only when it is charged to the task's current(old)
563memory cgroup.
455 564
456 bit | what type of charges would be moved ? 565 bit | what type of charges would be moved ?
457 -----+------------------------------------------------------------------------ 566 -----+------------------------------------------------------------------------
458 0 | A charge of an anonymous page(or swap of it) used by the target task. 567 0 | A charge of an anonymous page(or swap of it) used by the target task.
459 | Those pages and swaps must be used only by the target task. You must 568 | Those pages and swaps must be used only by the target task. You must
460 | enable Swap Extension(see 2.4) to enable move of swap charges. 569 | enable Swap Extension(see 2.4) to enable move of swap charges.
461 570 -----+------------------------------------------------------------------------
462Note: Those pages and swaps must be charged to the old cgroup. 571 1 | A charge of file pages(normal file, tmpfs file(e.g. ipc shared memory)
463Note: More type of pages(e.g. file cache, shmem,) will be supported by other 572 | and swaps of tmpfs file) mmapped by the target task. Unlike the case of
464 bits in future. 573 | anonymous pages, file pages(and swaps) in the range mmapped by the task
574 | will be moved even if the task hasn't done page fault, i.e. they might
575 | not be the task's "RSS", but other task's "RSS" that maps the same file.
576 | And mapcount of the page is ignored(the page can be moved even if
577 | page_mapcount(page) > 1). You must enable Swap Extension(see 2.4) to
578 | enable move of swap charges.
465 579
4668.3 TODO 5808.3 TODO
467 581
468- Add support for other types of pages(e.g. file cache, shmem, etc.).
469- Implement madvise(2) to let users decide the vma to be moved or not to be 582- Implement madvise(2) to let users decide the vma to be moved or not to be
470 moved. 583 moved.
471- All of moving charge operations are done under cgroup_mutex. It's not good 584- All of moving charge operations are done under cgroup_mutex. It's not good
@@ -473,22 +586,61 @@ Note: More type of pages(e.g. file cache, shmem,) will be supported by other
473 586
4749. Memory thresholds 5879. Memory thresholds
475 588
476Memory controler implements memory thresholds using cgroups notification 589Memory cgroup implements memory thresholds using cgroups notification
477API (see cgroups.txt). It allows to register multiple memory and memsw 590API (see cgroups.txt). It allows to register multiple memory and memsw
478thresholds and gets notifications when it crosses. 591thresholds and gets notifications when it crosses.
479 592
480To register a threshold application need: 593To register a threshold application need:
481 - create an eventfd using eventfd(2); 594- create an eventfd using eventfd(2);
482 - open memory.usage_in_bytes or memory.memsw.usage_in_bytes; 595- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
483 - write string like "<event_fd> <memory.usage_in_bytes> <threshold>" to 596- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
484 cgroup.event_control. 597 cgroup.event_control.
485 598
486Application will be notified through eventfd when memory usage crosses 599Application will be notified through eventfd when memory usage crosses
487threshold in any direction. 600threshold in any direction.
488 601
489It's applicable for root and non-root cgroup. 602It's applicable for root and non-root cgroup.
490 603
49110. TODO 60410. OOM Control
605
606memory.oom_control file is for OOM notification and other controls.
607
608Memory cgroup implements OOM notifier using cgroup notification
609API (See cgroups.txt). It allows to register multiple OOM notification
610delivery and gets notification when OOM happens.
611
612To register a notifier, application need:
613 - create an eventfd using eventfd(2)
614 - open memory.oom_control file
615 - write string like "<event_fd> <fd of memory.oom_control>" to
616 cgroup.event_control
617
618Application will be notified through eventfd when OOM happens.
619OOM notification doesn't work for root cgroup.
620
621You can disable OOM-killer by writing "1" to memory.oom_control file, as:
622
623 #echo 1 > memory.oom_control
624
625This operation is only allowed to the top cgroup of sub-hierarchy.
626If OOM-killer is disabled, tasks under cgroup will hang/sleep
627in memory cgroup's OOM-waitqueue when they request accountable memory.
628
629For running them, you have to relax the memory cgroup's OOM status by
630 * enlarge limit or reduce usage.
631To reduce usage,
632 * kill some tasks.
633 * move some tasks to other group with account migration.
634 * remove some files (on tmpfs?)
635
636Then, stopped tasks will work again.
637
638At reading, current status of OOM is shown.
639 oom_kill_disable 0 or 1 (if 1, oom-killer is disabled)
640 under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may
641 be stopped.)
642
64311. TODO
492 644
4931. Add support for accounting huge pages (as a separate controller) 6451. Add support for accounting huge pages (as a separate controller)
4942. Make per-cgroup scanner reclaim not-shared pages first 6462. Make per-cgroup scanner reclaim not-shared pages first
diff --git a/Documentation/connector/connector.txt b/Documentation/connector/connector.txt
index 78c9466a9aa8..e5c5f5e6ab70 100644
--- a/Documentation/connector/connector.txt
+++ b/Documentation/connector/connector.txt
@@ -88,7 +88,7 @@ int cn_netlink_send(struct cn_msg *msg, u32 __groups, int gfp_mask);
88 int gfp_mask - GFP mask. 88 int gfp_mask - GFP mask.
89 89
90 Note: When registering new callback user, connector core assigns 90 Note: When registering new callback user, connector core assigns
91 netlink group to the user which is equal to it's id.idx. 91 netlink group to the user which is equal to its id.idx.
92 92
93/*****************************************/ 93/*****************************************/
94Protocol description. 94Protocol description.
diff --git a/Documentation/credentials.txt b/Documentation/credentials.txt
index df03169782ea..a2db35287003 100644
--- a/Documentation/credentials.txt
+++ b/Documentation/credentials.txt
@@ -408,9 +408,6 @@ This should be used inside the RCU read lock, as in the following example:
408 ... 408 ...
409 } 409 }
410 410
411A function need not get RCU read lock to use __task_cred() if it is holding a
412spinlock at the time as this implicitly holds the RCU read lock.
413
414Should it be necessary to hold another task's credentials for a long period of 411Should it be necessary to hold another task's credentials for a long period of
415time, and possibly to sleep whilst doing so, then the caller should get a 412time, and possibly to sleep whilst doing so, then the caller should get a
416reference on them using: 413reference on them using:
@@ -426,17 +423,16 @@ credentials, hiding the RCU magic from the caller:
426 uid_t task_uid(task) Task's real UID 423 uid_t task_uid(task) Task's real UID
427 uid_t task_euid(task) Task's effective UID 424 uid_t task_euid(task) Task's effective UID
428 425
429If the caller is holding a spinlock or the RCU read lock at the time anyway, 426If the caller is holding the RCU read lock at the time anyway, then:
430then:
431 427
432 __task_cred(task)->uid 428 __task_cred(task)->uid
433 __task_cred(task)->euid 429 __task_cred(task)->euid
434 430
435should be used instead. Similarly, if multiple aspects of a task's credentials 431should be used instead. Similarly, if multiple aspects of a task's credentials
436need to be accessed, RCU read lock or a spinlock should be used, __task_cred() 432need to be accessed, RCU read lock should be used, __task_cred() called, the
437called, the result stored in a temporary pointer and then the credential 433result stored in a temporary pointer and then the credential aspects called
438aspects called from that before dropping the lock. This prevents the 434from that before dropping the lock. This prevents the potentially expensive
439potentially expensive RCU magic from being invoked multiple times. 435RCU magic from being invoked multiple times.
440 436
441Should some other single aspect of another task's credentials need to be 437Should some other single aspect of another task's credentials need to be
442accessed, then this can be used: 438accessed, then this can be used:
diff --git a/Documentation/development-process/2.Process b/Documentation/development-process/2.Process
index d750321acd5a..97726eba6102 100644
--- a/Documentation/development-process/2.Process
+++ b/Documentation/development-process/2.Process
@@ -151,7 +151,7 @@ The stages that a patch goes through are, generally:
151 well. 151 well.
152 152
153 - Wider review. When the patch is getting close to ready for mainline 153 - Wider review. When the patch is getting close to ready for mainline
154 inclusion, it will be accepted by a relevant subsystem maintainer - 154 inclusion, it should be accepted by a relevant subsystem maintainer -
155 though this acceptance is not a guarantee that the patch will make it 155 though this acceptance is not a guarantee that the patch will make it
156 all the way to the mainline. The patch will show up in the maintainer's 156 all the way to the mainline. The patch will show up in the maintainer's
157 subsystem tree and into the staging trees (described below). When the 157 subsystem tree and into the staging trees (described below). When the
@@ -159,6 +159,15 @@ The stages that a patch goes through are, generally:
159 the discovery of any problems resulting from the integration of this 159 the discovery of any problems resulting from the integration of this
160 patch with work being done by others. 160 patch with work being done by others.
161 161
162- Please note that most maintainers also have day jobs, so merging
163 your patch may not be their highest priority. If your patch is
164 getting feedback about changes that are needed, you should either
165 make those changes or justify why they should not be made. If your
166 patch has no review complaints but is not being merged by its
167 appropriate subsystem or driver maintainer, you should be persistent
168 in updating the patch to the current kernel so that it applies cleanly
169 and keep sending it for review and merging.
170
162 - Merging into the mainline. Eventually, a successful patch will be 171 - Merging into the mainline. Eventually, a successful patch will be
163 merged into the mainline repository managed by Linus Torvalds. More 172 merged into the mainline repository managed by Linus Torvalds. More
164 comments and/or problems may surface at this time; it is important that 173 comments and/or problems may surface at this time; it is important that
@@ -258,12 +267,8 @@ an appropriate subsystem tree or be sent directly to Linus. In a typical
258development cycle, approximately 10% of the patches going into the mainline 267development cycle, approximately 10% of the patches going into the mainline
259get there via -mm. 268get there via -mm.
260 269
261The current -mm patch can always be found from the front page of 270The current -mm patch is available in the "mmotm" (-mm of the moment)
262 271directory at:
263 http://kernel.org/
264
265Those who want to see the current state of -mm can get the "-mm of the
266moment" tree, found at:
267 272
268 http://userweb.kernel.org/~akpm/mmotm/ 273 http://userweb.kernel.org/~akpm/mmotm/
269 274
@@ -298,6 +303,12 @@ volatility of linux-next tends to make it a difficult development target.
298See http://lwn.net/Articles/289013/ for more information on this topic, and 303See http://lwn.net/Articles/289013/ for more information on this topic, and
299stay tuned; much is still in flux where linux-next is involved. 304stay tuned; much is still in flux where linux-next is involved.
300 305
306Besides the mmotm and linux-next trees, the kernel source tree now contains
307the drivers/staging/ directory and many sub-directories for drivers or
308filesystems that are on their way to being added to the kernel tree
309proper, but they remain in drivers/staging/ while they still need more
310work.
311
301 312
3022.5: TOOLS 3132.5: TOOLS
303 314
@@ -319,9 +330,9 @@ developers; even if they do not use it for their own work, they'll need git
319to keep up with what other developers (and the mainline) are doing. 330to keep up with what other developers (and the mainline) are doing.
320 331
321Git is now packaged by almost all Linux distributions. There is a home 332Git is now packaged by almost all Linux distributions. There is a home
322page at 333page at:
323 334
324 http://git.or.cz/ 335 http://git-scm.com/
325 336
326That page has pointers to documentation and tutorials. One should be 337That page has pointers to documentation and tutorials. One should be
327aware, in particular, of the Kernel Hacker's Guide to git, which has 338aware, in particular, of the Kernel Hacker's Guide to git, which has
diff --git a/Documentation/development-process/7.AdvancedTopics b/Documentation/development-process/7.AdvancedTopics
index a2cf74093aa1..837179447e17 100644
--- a/Documentation/development-process/7.AdvancedTopics
+++ b/Documentation/development-process/7.AdvancedTopics
@@ -25,7 +25,7 @@ long document in its own right. Instead, the focus here will be on how git
25fits into the kernel development process in particular. Developers who 25fits into the kernel development process in particular. Developers who
26wish to come up to speed with git will find more information at: 26wish to come up to speed with git will find more information at:
27 27
28 http://git.or.cz/ 28 http://git-scm.com/
29 29
30 http://www.kernel.org/pub/software/scm/git/docs/user-manual.html 30 http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
31 31
diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index 53d64d382343..1d83d124056c 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -443,6 +443,8 @@ Your cooperation is appreciated.
443 231 = /dev/snapshot System memory snapshot device 443 231 = /dev/snapshot System memory snapshot device
444 232 = /dev/kvm Kernel-based virtual machine (hardware virtualization extensions) 444 232 = /dev/kvm Kernel-based virtual machine (hardware virtualization extensions)
445 233 = /dev/kmview View-OS A process with a view 445 233 = /dev/kmview View-OS A process with a view
446 234 = /dev/btrfs-control Btrfs control device
447 235 = /dev/autofs Autofs control device
446 240-254 Reserved for local use 448 240-254 Reserved for local use
447 255 Reserved for MISC_DYNAMIC_MINOR 449 255 Reserved for MISC_DYNAMIC_MINOR
448 450
diff --git a/Documentation/dvb/ci.txt b/Documentation/dvb/ci.txt
index 2ecd834585e6..4a0c2b56e690 100644
--- a/Documentation/dvb/ci.txt
+++ b/Documentation/dvb/ci.txt
@@ -41,7 +41,7 @@ This application requires the following to function properly as of now.
41 41
42* Cards that fall in this category 42* Cards that fall in this category
43~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 43~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
44At present the cards that fall in this category are the Twinhan and it's 44At present the cards that fall in this category are the Twinhan and its
45clones, these cards are available as VVMER, Tomato, Hercules, Orange and 45clones, these cards are available as VVMER, Tomato, Hercules, Orange and
46so on. 46so on.
47 47
diff --git a/Documentation/dvb/contributors.txt b/Documentation/dvb/contributors.txt
index 4865addebe1c..47c30098dab6 100644
--- a/Documentation/dvb/contributors.txt
+++ b/Documentation/dvb/contributors.txt
@@ -1,7 +1,7 @@
1Thanks go to the following people for patches and contributions: 1Thanks go to the following people for patches and contributions:
2 2
3Michael Hunold <m.hunold@gmx.de> 3Michael Hunold <m.hunold@gmx.de>
4 for the initial saa7146 driver and it's recent overhaul 4 for the initial saa7146 driver and its recent overhaul
5 5
6Christian Theiss 6Christian Theiss
7 for his work on the initial Linux DVB driver 7 for his work on the initial Linux DVB driver
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index ed511af0f79a..672be0109d02 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -241,16 +241,6 @@ Who: Thomas Gleixner <tglx@linutronix.de>
241 241
242--------------------------- 242---------------------------
243 243
244What (Why):
245 - xt_recent: the old ipt_recent proc dir
246 (superseded by /proc/net/xt_recent)
247
248When: January 2009 or Linux 2.7.0, whichever comes first
249Why: Superseded by newer revisions or modules
250Who: Jan Engelhardt <jengelh@computergmbh.de>
251
252---------------------------
253
254What: GPIO autorequest on gpio_direction_{input,output}() in gpiolib 244What: GPIO autorequest on gpio_direction_{input,output}() in gpiolib
255When: February 2010 245When: February 2010
256Why: All callers should use explicit gpio_request()/gpio_free(). 246Why: All callers should use explicit gpio_request()/gpio_free().
@@ -520,26 +510,21 @@ Who: Hans de Goede <hdegoede@redhat.com>
520 510
521---------------------------- 511----------------------------
522 512
523What: corgikbd, spitzkbd, tosakbd driver 513What: sysfs-class-rfkill state file
524When: 2.6.35 514When: Feb 2014
525Files: drivers/input/keyboard/{corgi,spitz,tosa}kbd.c 515Files: net/rfkill/core.c
526Why: We now have a generic GPIO based matrix keyboard driver that 516Why: Documented as obsolete since Feb 2010. This file is limited to 3
527 are fully capable of handling all the keys on these devices. 517 states while the rfkill drivers can have 4 states.
528 The original drivers manipulate the GPIO registers directly 518Who: anybody or Florian Mickler <florian@mickler.org>
529 and so are difficult to maintain.
530Who: Eric Miao <eric.y.miao@gmail.com>
531 519
532---------------------------- 520----------------------------
533 521
534What: corgi_ssp and corgi_ts driver 522What: sysfs-class-rfkill claim file
535When: 2.6.35 523When: Feb 2012
536Files: arch/arm/mach-pxa/corgi_ssp.c, drivers/input/touchscreen/corgi_ts.c 524Files: net/rfkill/core.c
537Why: The corgi touchscreen is now deprecated in favour of the generic 525Why: It is not possible to claim an rfkill driver since 2007. This is
538 ads7846.c driver. The noise reduction technique used in corgi_ts.c, 526 Documented as obsolete since Feb 2010.
539 that's to wait till vsync before ADC sampling, is also integrated into 527Who: anybody or Florian Mickler <florian@mickler.org>
540 ads7846 driver now. Provided that the original driver is not generic
541 and is difficult to maintain, it will be removed later.
542Who: Eric Miao <eric.y.miao@gmail.com>
543 528
544---------------------------- 529----------------------------
545 530
@@ -564,6 +549,16 @@ Who: Avi Kivity <avi@redhat.com>
564 549
565---------------------------- 550----------------------------
566 551
552What: xtime, wall_to_monotonic
553When: 2.6.36+
554Files: kernel/time/timekeeping.c include/linux/time.h
555Why: Cleaning up timekeeping internal values. Please use
556 existing timekeeping accessor functions to access
557 the equivalent functionality.
558Who: John Stultz <johnstul@us.ibm.com>
559
560----------------------------
561
567What: KVM kernel-allocated memory slots 562What: KVM kernel-allocated memory slots
568When: July 2010 563When: July 2010
569Why: Since 2.6.25, kvm supports user-allocated memory slots, which are 564Why: Since 2.6.25, kvm supports user-allocated memory slots, which are
@@ -589,3 +584,75 @@ Why: Useful in 2003, implementation is a hack.
589 Generally invoked by accident today. 584 Generally invoked by accident today.
590 Seen as doing more harm than good. 585 Seen as doing more harm than good.
591Who: Len Brown <len.brown@intel.com> 586Who: Len Brown <len.brown@intel.com>
587
588----------------------------
589
590What: iwlwifi 50XX module parameters
591When: 2.6.40
592Why: The "..50" modules parameters were used to configure 5000 series and
593 up devices; different set of module parameters also available for 4965
594 with same functionalities. Consolidate both set into single place
595 in drivers/net/wireless/iwlwifi/iwl-agn.c
596
597Who: Wey-Yi Guy <wey-yi.w.guy@intel.com>
598
599----------------------------
600
601What: iwl4965 alias support
602When: 2.6.40
603Why: Internal alias support has been present in module-init-tools for some
604 time, the MODULE_ALIAS("iwl4965") boilerplate aliases can be removed
605 with no impact.
606
607Who: Wey-Yi Guy <wey-yi.w.guy@intel.com>
608
609---------------------------
610
611What: xt_NOTRACK
612Files: net/netfilter/xt_NOTRACK.c
613When: April 2011
614Why: Superseded by xt_CT
615Who: Netfilter developer team <netfilter-devel@vger.kernel.org>
616
617---------------------------
618
619What: video4linux /dev/vtx teletext API support
620When: 2.6.35
621Files: drivers/media/video/saa5246a.c drivers/media/video/saa5249.c
622 include/linux/videotext.h
623Why: The vtx device nodes have been superseded by vbi device nodes
624 for many years. No applications exist that use the vtx support.
625 Of the two i2c drivers that actually support this API the saa5249
626 has been impossible to use for a year now and no known hardware
627 that supports this device exists. The saa5246a is theoretically
628 supported by the old mxb boards, but it never actually worked.
629
630 In summary: there is no hardware that can use this API and there
631 are no applications actually implementing this API.
632
633 The vtx support still reserves minors 192-223 and we would really
634 like to reuse those for upcoming new functionality. In the unlikely
635 event that new hardware appears that wants to use the functionality
636 provided by the vtx API, then that functionality should be build
637 around the sliced VBI API instead.
638Who: Hans Verkuil <hverkuil@xs4all.nl>
639
640----------------------------
641
642What: IRQF_DISABLED
643When: 2.6.36
644Why: The flag is a NOOP as we run interrupt handlers with interrupts disabled
645Who: Thomas Gleixner <tglx@linutronix.de>
646
647----------------------------
648
649What: old ieee1394 subsystem (CONFIG_IEEE1394)
650When: 2.6.37
651Files: drivers/ieee1394/ except init_ohci1394_dma.c
652Why: superseded by drivers/firewire/ (CONFIG_FIREWIRE) which offers more
653 features, better performance, and better security, all with smaller
654 and more modern code base
655Who: Stefan Richter <stefanr@s5r6.in-berlin.de>
656
657----------------------------
658
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 06bbbed71206..96d4293607ec 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -178,7 +178,7 @@ prototypes:
178locking rules: 178locking rules:
179 All except set_page_dirty may block 179 All except set_page_dirty may block
180 180
181 BKL PageLocked(page) i_sem 181 BKL PageLocked(page) i_mutex
182writepage: no yes, unlocks (see below) 182writepage: no yes, unlocks (see below)
183readpage: no yes, unlocks 183readpage: no yes, unlocks
184sync_page: no maybe 184sync_page: no maybe
@@ -380,7 +380,7 @@ prototypes:
380 int (*open) (struct inode *, struct file *); 380 int (*open) (struct inode *, struct file *);
381 int (*flush) (struct file *); 381 int (*flush) (struct file *);
382 int (*release) (struct inode *, struct file *); 382 int (*release) (struct inode *, struct file *);
383 int (*fsync) (struct file *, struct dentry *, int datasync); 383 int (*fsync) (struct file *, int datasync);
384 int (*aio_fsync) (struct kiocb *, int datasync); 384 int (*aio_fsync) (struct kiocb *, int datasync);
385 int (*fasync) (int, struct file *, int); 385 int (*fasync) (int, struct file *, int);
386 int (*lock) (struct file *, int, struct file_lock *); 386 int (*lock) (struct file *, int, struct file_lock *);
@@ -429,8 +429,9 @@ check_flags: no
429implementations. If your fs is not using generic_file_llseek, you 429implementations. If your fs is not using generic_file_llseek, you
430need to acquire and release the appropriate locks in your ->llseek(). 430need to acquire and release the appropriate locks in your ->llseek().
431For many filesystems, it is probably safe to acquire the inode 431For many filesystems, it is probably safe to acquire the inode
432semaphore. Note some filesystems (i.e. remote ones) provide no 432mutex or just to use i_size_read() instead.
433protection for i_size so you will need to use the BKL. 433Note: this does not protect the file->f_pos against concurrent modifications
434since this is something the userspace has to take care about.
434 435
435Note: ext2_release() was *the* source of contention on fs-intensive 436Note: ext2_release() was *the* source of contention on fs-intensive
436loads and dropping BKL on ->release() helps to get rid of that (we still 437loads and dropping BKL on ->release() helps to get rid of that (we still
diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt
index 8f78ded4b648..51986bf08a4d 100644
--- a/Documentation/filesystems/autofs4-mount-control.txt
+++ b/Documentation/filesystems/autofs4-mount-control.txt
@@ -146,7 +146,7 @@ found to be inadequate, in this case. The Generic Netlink system was
146used for this as raw Netlink would lead to a significant increase in 146used for this as raw Netlink would lead to a significant increase in
147complexity. There's no question that the Generic Netlink system is an 147complexity. There's no question that the Generic Netlink system is an
148elegant solution for common case ioctl functions but it's not a complete 148elegant solution for common case ioctl functions but it's not a complete
149replacement probably because it's primary purpose in life is to be a 149replacement probably because its primary purpose in life is to be a
150message bus implementation rather than specifically an ioctl replacement. 150message bus implementation rather than specifically an ioctl replacement.
151While it would be possible to work around this there is one concern 151While it would be possible to work around this there is one concern
152that lead to the decision to not use it. This is that the autofs 152that lead to the decision to not use it. This is that the autofs
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
index 0660c9f5deef..763d8ebbbebd 100644
--- a/Documentation/filesystems/ceph.txt
+++ b/Documentation/filesystems/ceph.txt
@@ -90,7 +90,7 @@ Mount Options
90 Specify the IP and/or port the client should bind to locally. 90 Specify the IP and/or port the client should bind to locally.
91 There is normally not much reason to do this. If the IP is not 91 There is normally not much reason to do this. If the IP is not
92 specified, the client's IP address is determined by looking at the 92 specified, the client's IP address is determined by looking at the
93 address it's connection to the monitor originates from. 93 address its connection to the monitor originates from.
94 94
95 wsize=X 95 wsize=X
96 Specify the maximum write size in bytes. By default there is no 96 Specify the maximum write size in bytes. By default there is no
diff --git a/Documentation/filesystems/dlmfs.txt b/Documentation/filesystems/dlmfs.txt
index c50bbb2d52b4..1b528b2ad809 100644
--- a/Documentation/filesystems/dlmfs.txt
+++ b/Documentation/filesystems/dlmfs.txt
@@ -47,7 +47,7 @@ You'll want to start heartbeating on a volume which all the nodes in
47your lockspace can access. The easiest way to do this is via 47your lockspace can access. The easiest way to do this is via
48ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires 48ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires
49that an OCFS2 file system be in place so that it can automatically 49that an OCFS2 file system be in place so that it can automatically
50find it's heartbeat area, though it will eventually support heartbeat 50find its heartbeat area, though it will eventually support heartbeat
51against raw disks. 51against raw disks.
52 52
53Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed 53Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 867c5b50cb42..272f80d5f966 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -59,8 +59,19 @@ commit=nrsec (*) Ext3 can be told to sync all its data and metadata
59 Setting it to very large values will improve 59 Setting it to very large values will improve
60 performance. 60 performance.
61 61
62barrier=1 This enables/disables barriers. barrier=0 disables 62barrier=<0(*)|1> This enables/disables the use of write barriers in
63 it, barrier=1 enables it. 63barrier the jbd code. barrier=0 disables, barrier=1 enables.
64nobarrier (*) This also requires an IO stack which can support
65 barriers, and if jbd gets an error on a barrier
66 write, it will disable again with a warning.
67 Write barriers enforce proper on-disk ordering
68 of journal commits, making volatile disk write caches
69 safe to use, at some performance penalty. If
70 your disks are battery-backed in one way or another,
71 disabling barriers may safely improve performance.
72 The mount options "barrier" and "nobarrier" can
73 also be used to enable or disable barriers, for
74 consistency with other ext3 mount options.
64 75
65orlov (*) This enables the new Orlov block allocator. It is 76orlov (*) This enables the new Orlov block allocator. It is
66 enabled by default. 77 enabled by default.
diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt
index 606233cd4618..1b805a0efbb0 100644
--- a/Documentation/filesystems/fiemap.txt
+++ b/Documentation/filesystems/fiemap.txt
@@ -38,7 +38,7 @@ flags, it will return EBADR and the contents of fm_flags will contain
38the set of flags which caused the error. If the kernel is compatible 38the set of flags which caused the error. If the kernel is compatible
39with all flags passed, the contents of fm_flags will be unmodified. 39with all flags passed, the contents of fm_flags will be unmodified.
40It is up to userspace to determine whether rejection of a particular 40It is up to userspace to determine whether rejection of a particular
41flag is fatal to it's operation. This scheme is intended to allow the 41flag is fatal to its operation. This scheme is intended to allow the
42fiemap interface to grow in the future but without losing 42fiemap interface to grow in the future but without losing
43compatibility with old software. 43compatibility with old software.
44 44
@@ -56,7 +56,7 @@ If this flag is set, the kernel will sync the file before mapping extents.
56 56
57* FIEMAP_FLAG_XATTR 57* FIEMAP_FLAG_XATTR
58If this flag is set, the extents returned will describe the inodes 58If this flag is set, the extents returned will describe the inodes
59extended attribute lookup tree, instead of it's data tree. 59extended attribute lookup tree, instead of its data tree.
60 60
61 61
62Extent Mapping 62Extent Mapping
@@ -89,7 +89,7 @@ struct fiemap_extent {
89}; 89};
90 90
91All offsets and lengths are in bytes and mirror those on disk. It is valid 91All offsets and lengths are in bytes and mirror those on disk. It is valid
92for an extents logical offset to start before the request or it's logical 92for an extents logical offset to start before the request or its logical
93length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is 93length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is
94returned, fe_logical, fe_physical, and fe_length will be aligned to the 94returned, fe_logical, fe_physical, and fe_length will be aligned to the
95block size of the file system. With the exception of extents flagged as 95block size of the file system. With the exception of extents flagged as
@@ -125,7 +125,7 @@ been allocated for the file yet.
125 125
126* FIEMAP_EXTENT_DELALLOC 126* FIEMAP_EXTENT_DELALLOC
127 - This will also set FIEMAP_EXTENT_UNKNOWN. 127 - This will also set FIEMAP_EXTENT_UNKNOWN.
128Delayed allocation - while there is data for this extent, it's 128Delayed allocation - while there is data for this extent, its
129physical location has not been allocated yet. 129physical location has not been allocated yet.
130 130
131* FIEMAP_EXTENT_ENCODED 131* FIEMAP_EXTENT_ENCODED
@@ -159,7 +159,7 @@ Data is located within a meta data block.
159Data is packed into a block with data from other files. 159Data is packed into a block with data from other files.
160 160
161* FIEMAP_EXTENT_UNWRITTEN 161* FIEMAP_EXTENT_UNWRITTEN
162Unwritten extent - the extent is allocated but it's data has not been 162Unwritten extent - the extent is allocated but its data has not been
163initialized. This indicates the extent's data will be all zero if read 163initialized. This indicates the extent's data will be all zero if read
164through the filesystem but the contents are undefined if read directly from 164through the filesystem but the contents are undefined if read directly from
165the device. 165the device.
@@ -176,7 +176,7 @@ VFS -> File System Implementation
176 176
177File systems wishing to support fiemap must implement a ->fiemap callback on 177File systems wishing to support fiemap must implement a ->fiemap callback on
178their inode_operations structure. The fs ->fiemap call is responsible for 178their inode_operations structure. The fs ->fiemap call is responsible for
179defining it's set of supported fiemap flags, and calling a helper function on 179defining its set of supported fiemap flags, and calling a helper function on
180each discovered extent: 180each discovered extent:
181 181
182struct inode_operations { 182struct inode_operations {
diff --git a/Documentation/filesystems/fuse.txt b/Documentation/filesystems/fuse.txt
index 397a41adb4c3..13af4a49e7db 100644
--- a/Documentation/filesystems/fuse.txt
+++ b/Documentation/filesystems/fuse.txt
@@ -91,7 +91,7 @@ Mount options
91'default_permissions' 91'default_permissions'
92 92
93 By default FUSE doesn't check file access permissions, the 93 By default FUSE doesn't check file access permissions, the
94 filesystem is free to implement it's access policy or leave it to 94 filesystem is free to implement its access policy or leave it to
95 the underlying file access mechanism (e.g. in case of network 95 the underlying file access mechanism (e.g. in case of network
96 filesystems). This option enables permission checking, restricting 96 filesystems). This option enables permission checking, restricting
97 access based on file mode. It is usually useful together with the 97 access based on file mode. It is usually useful together with the
@@ -171,7 +171,7 @@ or may honor them by sending a reply to the _original_ request, with
171the error set to EINTR. 171the error set to EINTR.
172 172
173It is also possible that there's a race between processing the 173It is also possible that there's a race between processing the
174original request and it's INTERRUPT request. There are two possibilities: 174original request and its INTERRUPT request. There are two possibilities:
175 175
176 1) The INTERRUPT request is processed before the original request is 176 1) The INTERRUPT request is processed before the original request is
177 processed 177 processed
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
index 5e3ab8f3beff..0b59c0200912 100644
--- a/Documentation/filesystems/gfs2.txt
+++ b/Documentation/filesystems/gfs2.txt
@@ -1,7 +1,7 @@
1Global File System 1Global File System
2------------------ 2------------------
3 3
4http://sources.redhat.com/cluster/ 4http://sources.redhat.com/cluster/wiki/
5 5
6GFS is a cluster file system. It allows a cluster of computers to 6GFS is a cluster file system. It allows a cluster of computers to
7simultaneously use a block device that is shared between them (with FC, 7simultaneously use a block device that is shared between them (with FC,
@@ -36,11 +36,11 @@ GFS2 is not on-disk compatible with previous versions of GFS, but it
36is pretty close. 36is pretty close.
37 37
38The following man pages can be found at the URL above: 38The following man pages can be found at the URL above:
39 fsck.gfs2 to repair a filesystem 39 fsck.gfs2 to repair a filesystem
40 gfs2_grow to expand a filesystem online 40 gfs2_grow to expand a filesystem online
41 gfs2_jadd to add journals to a filesystem online 41 gfs2_jadd to add journals to a filesystem online
42 gfs2_tool to manipulate, examine and tune a filesystem 42 gfs2_tool to manipulate, examine and tune a filesystem
43 gfs2_quota to examine and change quota values in a filesystem 43 gfs2_quota to examine and change quota values in a filesystem
44 gfs2_convert to convert a gfs filesystem to gfs2 in-place 44 gfs2_convert to convert a gfs filesystem to gfs2 in-place
45 mount.gfs2 to help mount(8) mount a filesystem 45 mount.gfs2 to help mount(8) mount a filesystem
46 mkfs.gfs2 to make a filesystem 46 mkfs.gfs2 to make a filesystem
diff --git a/Documentation/filesystems/hpfs.txt b/Documentation/filesystems/hpfs.txt
index fa45c3baed98..74630bd504fb 100644
--- a/Documentation/filesystems/hpfs.txt
+++ b/Documentation/filesystems/hpfs.txt
@@ -103,7 +103,7 @@ to analyze or change OS2SYS.INI.
103Codepages 103Codepages
104 104
105HPFS can contain several uppercasing tables for several codepages and each 105HPFS can contain several uppercasing tables for several codepages and each
106file has a pointer to codepage it's name is in. However OS/2 was created in 106file has a pointer to codepage its name is in. However OS/2 was created in
107America where people don't care much about codepages and so multiple codepages 107America where people don't care much about codepages and so multiple codepages
108support is quite buggy. I have Czech OS/2 working in codepage 852 on my disk. 108support is quite buggy. I have Czech OS/2 working in codepage 852 on my disk.
109Once I booted English OS/2 working in cp 850 and I created a file on my 852 109Once I booted English OS/2 working in cp 850 and I created a file on my 852
diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt
index e64c94ba401a..bca42c22a143 100644
--- a/Documentation/filesystems/logfs.txt
+++ b/Documentation/filesystems/logfs.txt
@@ -59,7 +59,7 @@ Levels
59------ 59------
60 60
61Garbage collection (GC) may fail if all data is written 61Garbage collection (GC) may fail if all data is written
62indiscriminately. One requirement of GC is that data is seperated 62indiscriminately. One requirement of GC is that data is separated
63roughly according to the distance between the tree root and the data. 63roughly according to the distance between the tree root and the data.
64Effectively that means all file data is on level 0, indirect blocks 64Effectively that means all file data is on level 0, indirect blocks
65are on levels 1, 2, 3 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks, 65are on levels 1, 2, 3 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks,
@@ -67,7 +67,7 @@ respectively. Inode file data is on level 6 for the inodes and 7-11
67for indirect blocks. 67for indirect blocks.
68 68
69Each segment contains objects of a single level only. As a result, 69Each segment contains objects of a single level only. As a result,
70each level requires its own seperate segment to be open for writing. 70each level requires its own separate segment to be open for writing.
71 71
72Inode File 72Inode File
73---------- 73----------
@@ -106,9 +106,9 @@ Vim
106--- 106---
107 107
108By cleverly predicting the life time of data, it is possible to 108By cleverly predicting the life time of data, it is possible to
109seperate long-living data from short-living data and thereby reduce 109separate long-living data from short-living data and thereby reduce
110the GC overhead later. Each type of distinc life expectency (vim) can 110the GC overhead later. Each type of distinc life expectency (vim) can
111have a seperate segment open for writing. Each (level, vim) tupel can 111have a separate segment open for writing. Each (level, vim) tupel can
112be open just once. If an open segment with unknown vim is encountered 112be open just once. If an open segment with unknown vim is encountered
113at mount time, it is closed and ignored henceforth. 113at mount time, it is closed and ignored henceforth.
114 114
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt
index 6a53a84afc72..04884914a1c8 100644
--- a/Documentation/filesystems/nfs/nfs41-server.txt
+++ b/Documentation/filesystems/nfs/nfs41-server.txt
@@ -137,7 +137,7 @@ NS*| OPENATTR | OPT | | Section 18.17 |
137 | READ | REQ | | Section 18.22 | 137 | READ | REQ | | Section 18.22 |
138 | READDIR | REQ | | Section 18.23 | 138 | READDIR | REQ | | Section 18.23 |
139 | READLINK | OPT | | Section 18.24 | 139 | READLINK | OPT | | Section 18.24 |
140NS | RECLAIM_COMPLETE | REQ | | Section 18.51 | 140 | RECLAIM_COMPLETE | REQ | | Section 18.51 |
141 | RELEASE_LOCKOWNER | MNI | | N/A | 141 | RELEASE_LOCKOWNER | MNI | | N/A |
142 | REMOVE | REQ | | Section 18.25 | 142 | REMOVE | REQ | | Section 18.25 |
143 | RENAME | REQ | | Section 18.26 | 143 | RENAME | REQ | | Section 18.26 |
diff --git a/Documentation/filesystems/nfs/rpc-cache.txt b/Documentation/filesystems/nfs/rpc-cache.txt
index 8a382bea6808..ebcaaee21616 100644
--- a/Documentation/filesystems/nfs/rpc-cache.txt
+++ b/Documentation/filesystems/nfs/rpc-cache.txt
@@ -185,7 +185,7 @@ failed lookup meant a definite 'no'.
185request/response format 185request/response format
186----------------------- 186-----------------------
187 187
188While each cache is free to use it's own format for requests 188While each cache is free to use its own format for requests
189and responses over channel, the following is recommended as 189and responses over channel, the following is recommended as
190appropriate and support routines are available to help: 190appropriate and support routines are available to help:
191Each request or response record should be printable ASCII 191Each request or response record should be printable ASCII
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index cf6d0d85ca82..d3e7673995eb 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -50,8 +50,8 @@ NILFS2 supports the following mount options:
50(*) == default 50(*) == default
51 51
52nobarrier Disables barriers. 52nobarrier Disables barriers.
53errors=continue(*) Keep going on a filesystem error. 53errors=continue Keep going on a filesystem error.
54errors=remount-ro Remount the filesystem read-only on an error. 54errors=remount-ro(*) Remount the filesystem read-only on an error.
55errors=panic Panic and halt the machine if an error occurs. 55errors=panic Panic and halt the machine if an error occurs.
56cp=n Specify the checkpoint-number of the snapshot to be 56cp=n Specify the checkpoint-number of the snapshot to be
57 mounted. Checkpoints and snapshots are listed by lscp 57 mounted. Checkpoints and snapshots are listed by lscp
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
index c58b9f5ba002..1f7ae144f6d8 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -80,3 +80,10 @@ user_xattr (*) Enables Extended User Attributes.
80nouser_xattr Disables Extended User Attributes. 80nouser_xattr Disables Extended User Attributes.
81acl Enables POSIX Access Control Lists support. 81acl Enables POSIX Access Control Lists support.
82noacl (*) Disables POSIX Access Control Lists support. 82noacl (*) Disables POSIX Access Control Lists support.
83resv_level=2 (*) Set how agressive allocation reservations will be.
84 Valid values are between 0 (reservations off) to 8
85 (maximum space for reservations).
86dir_resv_level= (*) By default, directory reservations will scale with file
87 reservations - users should rarely need to change this
88 value. If allocation reservations are turned off, this
89 option will have no effect.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index a4f30faa4f1f..9fb6cbe70bde 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -305,7 +305,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
305 cgtime guest time of the task children in jiffies 305 cgtime guest time of the task children in jiffies
306.............................................................................. 306..............................................................................
307 307
308The /proc/PID/map file containing the currently mapped memory regions and 308The /proc/PID/maps file containing the currently mapped memory regions and
309their access permissions. 309their access permissions.
310 310
311The format is: 311The format is:
@@ -316,7 +316,7 @@ address perms offset dev inode pathname
31608049000-0804a000 rw-p 00001000 03:00 8312 /opt/test 31608049000-0804a000 rw-p 00001000 03:00 8312 /opt/test
3170804a000-0806b000 rw-p 00000000 00:00 0 [heap] 3170804a000-0806b000 rw-p 00000000 00:00 0 [heap]
318a7cb1000-a7cb2000 ---p 00000000 00:00 0 318a7cb1000-a7cb2000 ---p 00000000 00:00 0
319a7cb2000-a7eb2000 rw-p 00000000 00:00 0 [threadstack:001ff4b4] 319a7cb2000-a7eb2000 rw-p 00000000 00:00 0
320a7eb2000-a7eb3000 ---p 00000000 00:00 0 320a7eb2000-a7eb3000 ---p 00000000 00:00 0
321a7eb3000-a7ed5000 rw-p 00000000 00:00 0 321a7eb3000-a7ed5000 rw-p 00000000 00:00 0
322a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 322a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6
@@ -352,7 +352,6 @@ is not associated with a file:
352 [stack] = the stack of the main process 352 [stack] = the stack of the main process
353 [vdso] = the "virtual dynamic shared object", 353 [vdso] = the "virtual dynamic shared object",
354 the kernel system call handler 354 the kernel system call handler
355 [threadstack:xxxxxxxx] = the stack of the thread, xxxxxxxx is the stack size
356 355
357 or if empty, the mapping is anonymous. 356 or if empty, the mapping is anonymous.
358 357
@@ -566,6 +565,10 @@ The default_smp_affinity mask applies to all non-active IRQs, which are the
566IRQs which have not yet been allocated/activated, and hence which lack a 565IRQs which have not yet been allocated/activated, and hence which lack a
567/proc/irq/[0-9]* directory. 566/proc/irq/[0-9]* directory.
568 567
568The node file on an SMP system shows the node to which the device using the IRQ
569reports itself as being attached. This hardware locality information does not
570include information about any possible driver locality preference.
571
569prof_cpu_mask specifies which CPUs are to be profiled by the system wide 572prof_cpu_mask specifies which CPUs are to be profiled by the system wide
570profiler. Default value is ffffffff (all cpus). 573profiler. Default value is ffffffff (all cpus).
571 574
@@ -965,7 +968,7 @@ your system and how much traffic was routed over those devices:
965 ...] 1375103 17405 0 0 0 0 0 0 968 ...] 1375103 17405 0 0 0 0 0 0
966 ...] 1703981 5535 0 0 0 3 0 0 969 ...] 1703981 5535 0 0 0 3 0 0
967 970
968In addition, each Channel Bond interface has it's own directory. For 971In addition, each Channel Bond interface has its own directory. For
969example, the bond0 device will have a directory called /proc/net/bond0/. 972example, the bond0 device will have a directory called /proc/net/bond0/.
970It will contain information that is specific to that bond, such as the 973It will contain information that is specific to that bond, such as the
971current slaves of the bond, the link status of the slaves, and how 974current slaves of the bond, the link status of the slaves, and how
@@ -1362,7 +1365,7 @@ been accounted as having caused 1MB of write.
1362In other words: The number of bytes which this process caused to not happen, 1365In other words: The number of bytes which this process caused to not happen,
1363by truncating pagecache. A task can cause "negative" IO too. If this task 1366by truncating pagecache. A task can cause "negative" IO too. If this task
1364truncates some dirty pagecache, some IO which another task has been accounted 1367truncates some dirty pagecache, some IO which another task has been accounted
1365for (in it's write_bytes) will not be happening. We _could_ just subtract that 1368for (in its write_bytes) will not be happening. We _could_ just subtract that
1366from the truncating task's write_bytes, but there is information loss in doing 1369from the truncating task's write_bytes, but there is information loss in doing
1367that. 1370that.
1368 1371
diff --git a/Documentation/filesystems/smbfs.txt b/Documentation/filesystems/smbfs.txt
index f673ef0de0f7..194fb0decd2c 100644
--- a/Documentation/filesystems/smbfs.txt
+++ b/Documentation/filesystems/smbfs.txt
@@ -3,6 +3,6 @@ protocol used by Windows for Workgroups, Windows 95 and Windows NT.
3Smbfs was inspired by Samba, the program written by Andrew Tridgell 3Smbfs was inspired by Samba, the program written by Andrew Tridgell
4that turns any Unix host into a file server for DOS or Windows clients. 4that turns any Unix host into a file server for DOS or Windows clients.
5 5
6Smbfs is a SMB client, but uses parts of samba for it's operation. For 6Smbfs is a SMB client, but uses parts of samba for its operation. For
7more info on samba, including documentation, please go to 7more info on samba, including documentation, please go to
8http://www.samba.org/ and then on to your nearest mirror. 8http://www.samba.org/ and then on to your nearest mirror.
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt
index b324c033035a..203f7202cc9e 100644
--- a/Documentation/filesystems/squashfs.txt
+++ b/Documentation/filesystems/squashfs.txt
@@ -38,7 +38,8 @@ Hard link support: yes no
38Real inode numbers: yes no 38Real inode numbers: yes no
3932-bit uids/gids: yes no 3932-bit uids/gids: yes no
40File creation time: yes no 40File creation time: yes no
41Xattr and ACL support: no no 41Xattr support: yes no
42ACL support: no no
42 43
43Squashfs compresses data, inodes and directories. In addition, inode and 44Squashfs compresses data, inodes and directories. In addition, inode and
44directory data are highly compacted, and packed on byte boundaries. Each 45directory data are highly compacted, and packed on byte boundaries. Each
@@ -58,7 +59,7 @@ obtained from this site also.
583. SQUASHFS FILESYSTEM DESIGN 593. SQUASHFS FILESYSTEM DESIGN
59----------------------------- 60-----------------------------
60 61
61A squashfs filesystem consists of seven parts, packed together on a byte 62A squashfs filesystem consists of a maximum of eight parts, packed together on a byte
62alignment: 63alignment:
63 64
64 --------------- 65 ---------------
@@ -80,6 +81,9 @@ alignment:
80 |---------------| 81 |---------------|
81 | uid/gid | 82 | uid/gid |
82 | lookup table | 83 | lookup table |
84 |---------------|
85 | xattr |
86 | table |
83 --------------- 87 ---------------
84 88
85Compressed data blocks are written to the filesystem as files are read from 89Compressed data blocks are written to the filesystem as files are read from
@@ -192,6 +196,26 @@ This table is stored compressed into metadata blocks. A second index table is
192used to locate these. This second index table for speed of access (and because 196used to locate these. This second index table for speed of access (and because
193it is small) is read at mount time and cached in memory. 197it is small) is read at mount time and cached in memory.
194 198
1993.7 Xattr table
200---------------
201
202The xattr table contains extended attributes for each inode. The xattrs
203for each inode are stored in a list, each list entry containing a type,
204name and value field. The type field encodes the xattr prefix
205("user.", "trusted." etc) and it also encodes how the name/value fields
206should be interpreted. Currently the type indicates whether the value
207is stored inline (in which case the value field contains the xattr value),
208or if it is stored out of line (in which case the value field stores a
209reference to where the actual value is stored). This allows large values
210to be stored out of line improving scanning and lookup performance and it
211also allows values to be de-duplicated, the value being stored once, and
212all other occurences holding an out of line reference to that value.
213
214The xattr lists are packed into compressed 8K metadata blocks.
215To reduce overhead in inodes, rather than storing the on-disk
216location of the xattr list inside each inode, a 32-bit xattr id
217is stored. This xattr id is mapped into the location of the xattr
218list using a second xattr id lookup table.
195 219
1964. TODOS AND OUTSTANDING ISSUES 2204. TODOS AND OUTSTANDING ISSUES
197------------------------------- 221-------------------------------
@@ -199,9 +223,7 @@ it is small) is read at mount time and cached in memory.
1994.1 Todo list 2234.1 Todo list
200------------- 224-------------
201 225
202Implement Xattr and ACL support. The Squashfs 4.0 filesystem layout has hooks 226Implement ACL support.
203for these but the code has not been written. Once the code has been written
204the existing layout should not require modification.
205 227
2064.2 Squashfs internal cache 2284.2 Squashfs internal cache
207--------------------------- 229---------------------------
diff --git a/Documentation/filesystems/sysfs-tagging.txt b/Documentation/filesystems/sysfs-tagging.txt
new file mode 100644
index 000000000000..caaaf1266d8f
--- /dev/null
+++ b/Documentation/filesystems/sysfs-tagging.txt
@@ -0,0 +1,42 @@
1Sysfs tagging
2-------------
3
4(Taken almost verbatim from Eric Biederman's netns tagging patch
5commit msg)
6
7The problem. Network devices show up in sysfs and with the network
8namespace active multiple devices with the same name can show up in
9the same directory, ouch!
10
11To avoid that problem and allow existing applications in network
12namespaces to see the same interface that is currently presented in
13sysfs, sysfs now has tagging directory support.
14
15By using the network namespace pointers as tags to separate out the
16the sysfs directory entries we ensure that we don't have conflicts
17in the directories and applications only see a limited set of
18the network devices.
19
20Each sysfs directory entry may be tagged with zero or one
21namespaces. A sysfs_dirent is augmented with a void *s_ns. If a
22directory entry is tagged, then sysfs_dirent->s_flags will have a
23flag between KOBJ_NS_TYPE_NONE and KOBJ_NS_TYPES, and s_ns will
24point to the namespace to which it belongs.
25
26Each sysfs superblock's sysfs_super_info contains an array void
27*ns[KOBJ_NS_TYPES]. When a a task in a tagging namespace
28kobj_nstype first mounts sysfs, a new superblock is created. It
29will be differentiated from other sysfs mounts by having its
30s_fs_info->ns[kobj_nstype] set to the new namespace. Note that
31through bind mounting and mounts propagation, a task can easily view
32the contents of other namespaces' sysfs mounts. Therefore, when a
33namespace exits, it will call kobj_ns_exit() to invalidate any
34sysfs_dirent->s_ns pointers pointing to it.
35
36Users of this interface:
37- define a type in the kobj_ns_type enumeration.
38- call kobj_ns_type_register() with its kobj_ns_type_operations which has
39 - current_ns() which returns current's namespace
40 - netlink_ns() which returns a socket's namespace
41 - initial_ns() which returns the initial namesapce
42- call kobj_ns_exit() when an individual tag is no longer valid
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index fe09a2cb1858..98ef55124158 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -94,11 +94,19 @@ NodeList format is a comma-separated list of decimal numbers and ranges,
94a range being two hyphen-separated decimal numbers, the smallest and 94a range being two hyphen-separated decimal numbers, the smallest and
95largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15 95largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15
96 96
97A memory policy with a valid NodeList will be saved, as specified, for
98use at file creation time. When a task allocates a file in the file
99system, the mount option memory policy will be applied with a NodeList,
100if any, modified by the calling task's cpuset constraints
101[See Documentation/cgroups/cpusets.txt] and any optional flags, listed
102below. If the resulting NodeLists is the empty set, the effective memory
103policy for the file will revert to "default" policy.
104
97NUMA memory allocation policies have optional flags that can be used in 105NUMA memory allocation policies have optional flags that can be used in
98conjunction with their modes. These optional flags can be specified 106conjunction with their modes. These optional flags can be specified
99when tmpfs is mounted by appending them to the mode before the NodeList. 107when tmpfs is mounted by appending them to the mode before the NodeList.
100See Documentation/vm/numa_memory_policy.txt for a list of all available 108See Documentation/vm/numa_memory_policy.txt for a list of all available
101memory allocation policy mode flags. 109memory allocation policy mode flags and their effect on memory policy.
102 110
103 =static is equivalent to MPOL_F_STATIC_NODES 111 =static is equivalent to MPOL_F_STATIC_NODES
104 =relative is equivalent to MPOL_F_RELATIVE_NODES 112 =relative is equivalent to MPOL_F_RELATIVE_NODES
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 3de2f32edd90..94677e7dcb13 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -72,7 +72,7 @@ structure (this is the kernel-side implementation of file
72descriptors). The freshly allocated file structure is initialized with 72descriptors). The freshly allocated file structure is initialized with
73a pointer to the dentry and a set of file operation member functions. 73a pointer to the dentry and a set of file operation member functions.
74These are taken from the inode data. The open() file method is then 74These are taken from the inode data. The open() file method is then
75called so the specific filesystem implementation can do it's work. You 75called so the specific filesystem implementation can do its work. You
76can see that this is another switch performed by the VFS. The file 76can see that this is another switch performed by the VFS. The file
77structure is placed into the file descriptor table for the process. 77structure is placed into the file descriptor table for the process.
78 78
@@ -401,11 +401,16 @@ otherwise noted.
401 started might not be in the page cache at the end of the 401 started might not be in the page cache at the end of the
402 walk). 402 walk).
403 403
404 truncate: called by the VFS to change the size of a file. The 404 truncate: Deprecated. This will not be called if ->setsize is defined.
405 Called by the VFS to change the size of a file. The
405 i_size field of the inode is set to the desired size by the 406 i_size field of the inode is set to the desired size by the
406 VFS before this method is called. This method is called by 407 VFS before this method is called. This method is called by
407 the truncate(2) system call and related functionality. 408 the truncate(2) system call and related functionality.
408 409
410 Note: ->truncate and vmtruncate are deprecated. Do not add new
411 instances/calls of these. Filesystems should be converted to do their
412 truncate sequence via ->setattr().
413
409 permission: called by the VFS to check for access rights on a POSIX-like 414 permission: called by the VFS to check for access rights on a POSIX-like
410 filesystem. 415 filesystem.
411 416
@@ -729,7 +734,7 @@ struct file_operations {
729 int (*open) (struct inode *, struct file *); 734 int (*open) (struct inode *, struct file *);
730 int (*flush) (struct file *); 735 int (*flush) (struct file *);
731 int (*release) (struct inode *, struct file *); 736 int (*release) (struct inode *, struct file *);
732 int (*fsync) (struct file *, struct dentry *, int datasync); 737 int (*fsync) (struct file *, int datasync);
733 int (*aio_fsync) (struct kiocb *, int datasync); 738 int (*aio_fsync) (struct kiocb *, int datasync);
734 int (*fasync) (int, struct file *, int); 739 int (*fasync) (int, struct file *, int);
735 int (*lock) (struct file *, int, struct file_lock *); 740 int (*lock) (struct file *, int, struct file_lock *);
diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt
new file mode 100644
index 000000000000..d8119e9d2d60
--- /dev/null
+++ b/Documentation/filesystems/xfs-delayed-logging-design.txt
@@ -0,0 +1,816 @@
1XFS Delayed Logging Design
2--------------------------
3
4Introduction to Re-logging in XFS
5---------------------------------
6
7XFS logging is a combination of logical and physical logging. Some objects,
8such as inodes and dquots, are logged in logical format where the details
9logged are made up of the changes to in-core structures rather than on-disk
10structures. Other objects - typically buffers - have their physical changes
11logged. The reason for these differences is to reduce the amount of log space
12required for objects that are frequently logged. Some parts of inodes are more
13frequently logged than others, and inodes are typically more frequently logged
14than any other object (except maybe the superblock buffer) so keeping the
15amount of metadata logged low is of prime importance.
16
17The reason that this is such a concern is that XFS allows multiple separate
18modifications to a single object to be carried in the log at any given time.
19This allows the log to avoid needing to flush each change to disk before
20recording a new change to the object. XFS does this via a method called
21"re-logging". Conceptually, this is quite simple - all it requires is that any
22new change to the object is recorded with a *new copy* of all the existing
23changes in the new transaction that is written to the log.
24
25That is, if we have a sequence of changes A through to F, and the object was
26written to disk after change D, we would see in the log the following series
27of transactions, their contents and the log sequence number (LSN) of the
28transaction:
29
30 Transaction Contents LSN
31 A A X
32 B A+B X+n
33 C A+B+C X+n+m
34 D A+B+C+D X+n+m+o
35 <object written to disk>
36 E E Y (> X+n+m+o)
37 F E+F YÙ+p
38
39In other words, each time an object is relogged, the new transaction contains
40the aggregation of all the previous changes currently held only in the log.
41
42This relogging technique also allows objects to be moved forward in the log so
43that an object being relogged does not prevent the tail of the log from ever
44moving forward. This can be seen in the table above by the changing
45(increasing) LSN of each subsquent transaction - the LSN is effectively a
46direct encoding of the location in the log of the transaction.
47
48This relogging is also used to implement long-running, multiple-commit
49transactions. These transaction are known as rolling transactions, and require
50a special log reservation known as a permanent transaction reservation. A
51typical example of a rolling transaction is the removal of extents from an
52inode which can only be done at a rate of two extents per transaction because
53of reservation size limitations. Hence a rolling extent removal transaction
54keeps relogging the inode and btree buffers as they get modified in each
55removal operation. This keeps them moving forward in the log as the operation
56progresses, ensuring that current operation never gets blocked by itself if the
57log wraps around.
58
59Hence it can be seen that the relogging operation is fundamental to the correct
60working of the XFS journalling subsystem. From the above description, most
61people should be able to see why the XFS metadata operations writes so much to
62the log - repeated operations to the same objects write the same changes to
63the log over and over again. Worse is the fact that objects tend to get
64dirtier as they get relogged, so each subsequent transaction is writing more
65metadata into the log.
66
67Another feature of the XFS transaction subsystem is that most transactions are
68asynchronous. That is, they don't commit to disk until either a log buffer is
69filled (a log buffer can hold multiple transactions) or a synchronous operation
70forces the log buffers holding the transactions to disk. This means that XFS is
71doing aggregation of transactions in memory - batching them, if you like - to
72minimise the impact of the log IO on transaction throughput.
73
74The limitation on asynchronous transaction throughput is the number and size of
75log buffers made available by the log manager. By default there are 8 log
76buffers available and the size of each is 32kB - the size can be increased up
77to 256kB by use of a mount option.
78
79Effectively, this gives us the maximum bound of outstanding metadata changes
80that can be made to the filesystem at any point in time - if all the log
81buffers are full and under IO, then no more transactions can be committed until
82the current batch completes. It is now common for a single current CPU core to
83be to able to issue enough transactions to keep the log buffers full and under
84IO permanently. Hence the XFS journalling subsystem can be considered to be IO
85bound.
86
87Delayed Logging: Concepts
88-------------------------
89
90The key thing to note about the asynchronous logging combined with the
91relogging technique XFS uses is that we can be relogging changed objects
92multiple times before they are committed to disk in the log buffers. If we
93return to the previous relogging example, it is entirely possible that
94transactions A through D are committed to disk in the same log buffer.
95
96That is, a single log buffer may contain multiple copies of the same object,
97but only one of those copies needs to be there - the last one "D", as it
98contains all the changes from the previous changes. In other words, we have one
99necessary copy in the log buffer, and three stale copies that are simply
100wasting space. When we are doing repeated operations on the same set of
101objects, these "stale objects" can be over 90% of the space used in the log
102buffers. It is clear that reducing the number of stale objects written to the
103log would greatly reduce the amount of metadata we write to the log, and this
104is the fundamental goal of delayed logging.
105
106From a conceptual point of view, XFS is already doing relogging in memory (where
107memory == log buffer), only it is doing it extremely inefficiently. It is using
108logical to physical formatting to do the relogging because there is no
109infrastructure to keep track of logical changes in memory prior to physically
110formatting the changes in a transaction to the log buffer. Hence we cannot avoid
111accumulating stale objects in the log buffers.
112
113Delayed logging is the name we've given to keeping and tracking transactional
114changes to objects in memory outside the log buffer infrastructure. Because of
115the relogging concept fundamental to the XFS journalling subsystem, this is
116actually relatively easy to do - all the changes to logged items are already
117tracked in the current infrastructure. The big problem is how to accumulate
118them and get them to the log in a consistent, recoverable manner.
119Describing the problems and how they have been solved is the focus of this
120document.
121
122One of the key changes that delayed logging makes to the operation of the
123journalling subsystem is that it disassociates the amount of outstanding
124metadata changes from the size and number of log buffers available. In other
125words, instead of there only being a maximum of 2MB of transaction changes not
126written to the log at any point in time, there may be a much greater amount
127being accumulated in memory. Hence the potential for loss of metadata on a
128crash is much greater than for the existing logging mechanism.
129
130It should be noted that this does not change the guarantee that log recovery
131will result in a consistent filesystem. What it does mean is that as far as the
132recovered filesystem is concerned, there may be many thousands of transactions
133that simply did not occur as a result of the crash. This makes it even more
134important that applications that care about their data use fsync() where they
135need to ensure application level data integrity is maintained.
136
137It should be noted that delayed logging is not an innovative new concept that
138warrants rigorous proofs to determine whether it is correct or not. The method
139of accumulating changes in memory for some period before writing them to the
140log is used effectively in many filesystems including ext3 and ext4. Hence
141no time is spent in this document trying to convince the reader that the
142concept is sound. Instead it is simply considered a "solved problem" and as
143such implementing it in XFS is purely an exercise in software engineering.
144
145The fundamental requirements for delayed logging in XFS are simple:
146
147 1. Reduce the amount of metadata written to the log by at least
148 an order of magnitude.
149 2. Supply sufficient statistics to validate Requirement #1.
150 3. Supply sufficient new tracing infrastructure to be able to debug
151 problems with the new code.
152 4. No on-disk format change (metadata or log format).
153 5. Enable and disable with a mount option.
154 6. No performance regressions for synchronous transaction workloads.
155
156Delayed Logging: Design
157-----------------------
158
159Storing Changes
160
161The problem with accumulating changes at a logical level (i.e. just using the
162existing log item dirty region tracking) is that when it comes to writing the
163changes to the log buffers, we need to ensure that the object we are formatting
164is not changing while we do this. This requires locking the object to prevent
165concurrent modification. Hence flushing the logical changes to the log would
166require us to lock every object, format them, and then unlock them again.
167
168This introduces lots of scope for deadlocks with transactions that are already
169running. For example, a transaction has object A locked and modified, but needs
170the delayed logging tracking lock to commit the transaction. However, the
171flushing thread has the delayed logging tracking lock already held, and is
172trying to get the lock on object A to flush it to the log buffer. This appears
173to be an unsolvable deadlock condition, and it was solving this problem that
174was the barrier to implementing delayed logging for so long.
175
176The solution is relatively simple - it just took a long time to recognise it.
177Put simply, the current logging code formats the changes to each item into an
178vector array that points to the changed regions in the item. The log write code
179simply copies the memory these vectors point to into the log buffer during
180transaction commit while the item is locked in the transaction. Instead of
181using the log buffer as the destination of the formatting code, we can use an
182allocated memory buffer big enough to fit the formatted vector.
183
184If we then copy the vector into the memory buffer and rewrite the vector to
185point to the memory buffer rather than the object itself, we now have a copy of
186the changes in a format that is compatible with the log buffer writing code.
187that does not require us to lock the item to access. This formatting and
188rewriting can all be done while the object is locked during transaction commit,
189resulting in a vector that is transactionally consistent and can be accessed
190without needing to lock the owning item.
191
192Hence we avoid the need to lock items when we need to flush outstanding
193asynchronous transactions to the log. The differences between the existing
194formatting method and the delayed logging formatting can be seen in the
195diagram below.
196
197Current format log vector:
198
199Object +---------------------------------------------+
200Vector 1 +----+
201Vector 2 +----+
202Vector 3 +----------+
203
204After formatting:
205
206Log Buffer +-V1-+-V2-+----V3----+
207
208Delayed logging vector:
209
210Object +---------------------------------------------+
211Vector 1 +----+
212Vector 2 +----+
213Vector 3 +----------+
214
215After formatting:
216
217Memory Buffer +-V1-+-V2-+----V3----+
218Vector 1 +----+
219Vector 2 +----+
220Vector 3 +----------+
221
222The memory buffer and associated vector need to be passed as a single object,
223but still need to be associated with the parent object so if the object is
224relogged we can replace the current memory buffer with a new memory buffer that
225contains the latest changes.
226
227The reason for keeping the vector around after we've formatted the memory
228buffer is to support splitting vectors across log buffer boundaries correctly.
229If we don't keep the vector around, we do not know where the region boundaries
230are in the item, so we'd need a new encapsulation method for regions in the log
231buffer writing (i.e. double encapsulation). This would be an on-disk format
232change and as such is not desirable. It also means we'd have to write the log
233region headers in the formatting stage, which is problematic as there is per
234region state that needs to be placed into the headers during the log write.
235
236Hence we need to keep the vector, but by attaching the memory buffer to it and
237rewriting the vector addresses to point at the memory buffer we end up with a
238self-describing object that can be passed to the log buffer write code to be
239handled in exactly the same manner as the existing log vectors are handled.
240Hence we avoid needing a new on-disk format to handle items that have been
241relogged in memory.
242
243
244Tracking Changes
245
246Now that we can record transactional changes in memory in a form that allows
247them to be used without limitations, we need to be able to track and accumulate
248them so that they can be written to the log at some later point in time. The
249log item is the natural place to store this vector and buffer, and also makes sense
250to be the object that is used to track committed objects as it will always
251exist once the object has been included in a transaction.
252
253The log item is already used to track the log items that have been written to
254the log but not yet written to disk. Such log items are considered "active"
255and as such are stored in the Active Item List (AIL) which is a LSN-ordered
256double linked list. Items are inserted into this list during log buffer IO
257completion, after which they are unpinned and can be written to disk. An object
258that is in the AIL can be relogged, which causes the object to be pinned again
259and then moved forward in the AIL when the log buffer IO completes for that
260transaction.
261
262Essentially, this shows that an item that is in the AIL can still be modified
263and relogged, so any tracking must be separate to the AIL infrastructure. As
264such, we cannot reuse the AIL list pointers for tracking committed items, nor
265can we store state in any field that is protected by the AIL lock. Hence the
266committed item tracking needs it's own locks, lists and state fields in the log
267item.
268
269Similar to the AIL, tracking of committed items is done through a new list
270called the Committed Item List (CIL). The list tracks log items that have been
271committed and have formatted memory buffers attached to them. It tracks objects
272in transaction commit order, so when an object is relogged it is removed from
273it's place in the list and re-inserted at the tail. This is entirely arbitrary
274and done to make it easy for debugging - the last items in the list are the
275ones that are most recently modified. Ordering of the CIL is not necessary for
276transactional integrity (as discussed in the next section) so the ordering is
277done for convenience/sanity of the developers.
278
279
280Delayed Logging: Checkpoints
281
282When we have a log synchronisation event, commonly known as a "log force",
283all the items in the CIL must be written into the log via the log buffers.
284We need to write these items in the order that they exist in the CIL, and they
285need to be written as an atomic transaction. The need for all the objects to be
286written as an atomic transaction comes from the requirements of relogging and
287log replay - all the changes in all the objects in a given transaction must
288either be completely replayed during log recovery, or not replayed at all. If
289a transaction is not replayed because it is not complete in the log, then
290no later transactions should be replayed, either.
291
292To fulfill this requirement, we need to write the entire CIL in a single log
293transaction. Fortunately, the XFS log code has no fixed limit on the size of a
294transaction, nor does the log replay code. The only fundamental limit is that
295the transaction cannot be larger than just under half the size of the log. The
296reason for this limit is that to find the head and tail of the log, there must
297be at least one complete transaction in the log at any given time. If a
298transaction is larger than half the log, then there is the possibility that a
299crash during the write of a such a transaction could partially overwrite the
300only complete previous transaction in the log. This will result in a recovery
301failure and an inconsistent filesystem and hence we must enforce the maximum
302size of a checkpoint to be slightly less than a half the log.
303
304Apart from this size requirement, a checkpoint transaction looks no different
305to any other transaction - it contains a transaction header, a series of
306formatted log items and a commit record at the tail. From a recovery
307perspective, the checkpoint transaction is also no different - just a lot
308bigger with a lot more items in it. The worst case effect of this is that we
309might need to tune the recovery transaction object hash size.
310
311Because the checkpoint is just another transaction and all the changes to log
312items are stored as log vectors, we can use the existing log buffer writing
313code to write the changes into the log. To do this efficiently, we need to
314minimise the time we hold the CIL locked while writing the checkpoint
315transaction. The current log write code enables us to do this easily with the
316way it separates the writing of the transaction contents (the log vectors) from
317the transaction commit record, but tracking this requires us to have a
318per-checkpoint context that travels through the log write process through to
319checkpoint completion.
320
321Hence a checkpoint has a context that tracks the state of the current
322checkpoint from initiation to checkpoint completion. A new context is initiated
323at the same time a checkpoint transaction is started. That is, when we remove
324all the current items from the CIL during a checkpoint operation, we move all
325those changes into the current checkpoint context. We then initialise a new
326context and attach that to the CIL for aggregation of new transactions.
327
328This allows us to unlock the CIL immediately after transfer of all the
329committed items and effectively allow new transactions to be issued while we
330are formatting the checkpoint into the log. It also allows concurrent
331checkpoints to be written into the log buffers in the case of log force heavy
332workloads, just like the existing transaction commit code does. This, however,
333requires that we strictly order the commit records in the log so that
334checkpoint sequence order is maintained during log replay.
335
336To ensure that we can be writing an item into a checkpoint transaction at
337the same time another transaction modifies the item and inserts the log item
338into the new CIL, then checkpoint transaction commit code cannot use log items
339to store the list of log vectors that need to be written into the transaction.
340Hence log vectors need to be able to be chained together to allow them to be
341detatched from the log items. That is, when the CIL is flushed the memory
342buffer and log vector attached to each log item needs to be attached to the
343checkpoint context so that the log item can be released. In diagrammatic form,
344the CIL would look like this before the flush:
345
346 CIL Head
347 |
348 V
349 Log Item <-> log vector 1 -> memory buffer
350 | -> vector array
351 V
352 Log Item <-> log vector 2 -> memory buffer
353 | -> vector array
354 V
355 ......
356 |
357 V
358 Log Item <-> log vector N-1 -> memory buffer
359 | -> vector array
360 V
361 Log Item <-> log vector N -> memory buffer
362 -> vector array
363
364And after the flush the CIL head is empty, and the checkpoint context log
365vector list would look like:
366
367 Checkpoint Context
368 |
369 V
370 log vector 1 -> memory buffer
371 | -> vector array
372 | -> Log Item
373 V
374 log vector 2 -> memory buffer
375 | -> vector array
376 | -> Log Item
377 V
378 ......
379 |
380 V
381 log vector N-1 -> memory buffer
382 | -> vector array
383 | -> Log Item
384 V
385 log vector N -> memory buffer
386 -> vector array
387 -> Log Item
388
389Once this transfer is done, the CIL can be unlocked and new transactions can
390start, while the checkpoint flush code works over the log vector chain to
391commit the checkpoint.
392
393Once the checkpoint is written into the log buffers, the checkpoint context is
394attached to the log buffer that the commit record was written to along with a
395completion callback. Log IO completion will call that callback, which can then
396run transaction committed processing for the log items (i.e. insert into AIL
397and unpin) in the log vector chain and then free the log vector chain and
398checkpoint context.
399
400Discussion Point: I am uncertain as to whether the log item is the most
401efficient way to track vectors, even though it seems like the natural way to do
402it. The fact that we walk the log items (in the CIL) just to chain the log
403vectors and break the link between the log item and the log vector means that
404we take a cache line hit for the log item list modification, then another for
405the log vector chaining. If we track by the log vectors, then we only need to
406break the link between the log item and the log vector, which means we should
407dirty only the log item cachelines. Normally I wouldn't be concerned about one
408vs two dirty cachelines except for the fact I've seen upwards of 80,000 log
409vectors in one checkpoint transaction. I'd guess this is a "measure and
410compare" situation that can be done after a working and reviewed implementation
411is in the dev tree....
412
413Delayed Logging: Checkpoint Sequencing
414
415One of the key aspects of the XFS transaction subsystem is that it tags
416committed transactions with the log sequence number of the transaction commit.
417This allows transactions to be issued asynchronously even though there may be
418future operations that cannot be completed until that transaction is fully
419committed to the log. In the rare case that a dependent operation occurs (e.g.
420re-using a freed metadata extent for a data extent), a special, optimised log
421force can be issued to force the dependent transaction to disk immediately.
422
423To do this, transactions need to record the LSN of the commit record of the
424transaction. This LSN comes directly from the log buffer the transaction is
425written into. While this works just fine for the existing transaction
426mechanism, it does not work for delayed logging because transactions are not
427written directly into the log buffers. Hence some other method of sequencing
428transactions is required.
429
430As discussed in the checkpoint section, delayed logging uses per-checkpoint
431contexts, and as such it is simple to assign a sequence number to each
432checkpoint. Because the switching of checkpoint contexts must be done
433atomically, it is simple to ensure that each new context has a monotonically
434increasing sequence number assigned to it without the need for an external
435atomic counter - we can just take the current context sequence number and add
436one to it for the new context.
437
438Then, instead of assigning a log buffer LSN to the transaction commit LSN
439during the commit, we can assign the current checkpoint sequence. This allows
440operations that track transactions that have not yet completed know what
441checkpoint sequence needs to be committed before they can continue. As a
442result, the code that forces the log to a specific LSN now needs to ensure that
443the log forces to a specific checkpoint.
444
445To ensure that we can do this, we need to track all the checkpoint contexts
446that are currently committing to the log. When we flush a checkpoint, the
447context gets added to a "committing" list which can be searched. When a
448checkpoint commit completes, it is removed from the committing list. Because
449the checkpoint context records the LSN of the commit record for the checkpoint,
450we can also wait on the log buffer that contains the commit record, thereby
451using the existing log force mechanisms to execute synchronous forces.
452
453It should be noted that the synchronous forces may need to be extended with
454mitigation algorithms similar to the current log buffer code to allow
455aggregation of multiple synchronous transactions if there are already
456synchronous transactions being flushed. Investigation of the performance of the
457current design is needed before making any decisions here.
458
459The main concern with log forces is to ensure that all the previous checkpoints
460are also committed to disk before the one we need to wait for. Therefore we
461need to check that all the prior contexts in the committing list are also
462complete before waiting on the one we need to complete. We do this
463synchronisation in the log force code so that we don't need to wait anywhere
464else for such serialisation - it only matters when we do a log force.
465
466The only remaining complexity is that a log force now also has to handle the
467case where the forcing sequence number is the same as the current context. That
468is, we need to flush the CIL and potentially wait for it to complete. This is a
469simple addition to the existing log forcing code to check the sequence numbers
470and push if required. Indeed, placing the current sequence checkpoint flush in
471the log force code enables the current mechanism for issuing synchronous
472transactions to remain untouched (i.e. commit an asynchronous transaction, then
473force the log at the LSN of that transaction) and so the higher level code
474behaves the same regardless of whether delayed logging is being used or not.
475
476Delayed Logging: Checkpoint Log Space Accounting
477
478The big issue for a checkpoint transaction is the log space reservation for the
479transaction. We don't know how big a checkpoint transaction is going to be
480ahead of time, nor how many log buffers it will take to write out, nor the
481number of split log vector regions are going to be used. We can track the
482amount of log space required as we add items to the commit item list, but we
483still need to reserve the space in the log for the checkpoint.
484
485A typical transaction reserves enough space in the log for the worst case space
486usage of the transaction. The reservation accounts for log record headers,
487transaction and region headers, headers for split regions, buffer tail padding,
488etc. as well as the actual space for all the changed metadata in the
489transaction. While some of this is fixed overhead, much of it is dependent on
490the size of the transaction and the number of regions being logged (the number
491of log vectors in the transaction).
492
493An example of the differences would be logging directory changes versus logging
494inode changes. If you modify lots of inode cores (e.g. chmod -R g+w *), then
495there are lots of transactions that only contain an inode core and an inode log
496format structure. That is, two vectors totaling roughly 150 bytes. If we modify
49710,000 inodes, we have about 1.5MB of metadata to write in 20,000 vectors. Each
498vector is 12 bytes, so the total to be logged is approximately 1.75MB. In
499comparison, if we are logging full directory buffers, they are typically 4KB
500each, so we in 1.5MB of directory buffers we'd have roughly 400 buffers and a
501buffer format structure for each buffer - roughly 800 vectors or 1.51MB total
502space. From this, it should be obvious that a static log space reservation is
503not particularly flexible and is difficult to select the "optimal value" for
504all workloads.
505
506Further, if we are going to use a static reservation, which bit of the entire
507reservation does it cover? We account for space used by the transaction
508reservation by tracking the space currently used by the object in the CIL and
509then calculating the increase or decrease in space used as the object is
510relogged. This allows for a checkpoint reservation to only have to account for
511log buffer metadata used such as log header records.
512
513However, even using a static reservation for just the log metadata is
514problematic. Typically log record headers use at least 16KB of log space per
5151MB of log space consumed (512 bytes per 32k) and the reservation needs to be
516large enough to handle arbitrary sized checkpoint transactions. This
517reservation needs to be made before the checkpoint is started, and we need to
518be able to reserve the space without sleeping. For a 8MB checkpoint, we need a
519reservation of around 150KB, which is a non-trivial amount of space.
520
521A static reservation needs to manipulate the log grant counters - we can take a
522permanent reservation on the space, but we still need to make sure we refresh
523the write reservation (the actual space available to the transaction) after
524every checkpoint transaction completion. Unfortunately, if this space is not
525available when required, then the regrant code will sleep waiting for it.
526
527The problem with this is that it can lead to deadlocks as we may need to commit
528checkpoints to be able to free up log space (refer back to the description of
529rolling transactions for an example of this). Hence we *must* always have
530space available in the log if we are to use static reservations, and that is
531very difficult and complex to arrange. It is possible to do, but there is a
532simpler way.
533
534The simpler way of doing this is tracking the entire log space used by the
535items in the CIL and using this to dynamically calculate the amount of log
536space required by the log metadata. If this log metadata space changes as a
537result of a transaction commit inserting a new memory buffer into the CIL, then
538the difference in space required is removed from the transaction that causes
539the change. Transactions at this level will *always* have enough space
540available in their reservation for this as they have already reserved the
541maximal amount of log metadata space they require, and such a delta reservation
542will always be less than or equal to the maximal amount in the reservation.
543
544Hence we can grow the checkpoint transaction reservation dynamically as items
545are added to the CIL and avoid the need for reserving and regranting log space
546up front. This avoids deadlocks and removes a blocking point from the
547checkpoint flush code.
548
549As mentioned early, transactions can't grow to more than half the size of the
550log. Hence as part of the reservation growing, we need to also check the size
551of the reservation against the maximum allowed transaction size. If we reach
552the maximum threshold, we need to push the CIL to the log. This is effectively
553a "background flush" and is done on demand. This is identical to
554a CIL push triggered by a log force, only that there is no waiting for the
555checkpoint commit to complete. This background push is checked and executed by
556transaction commit code.
557
558If the transaction subsystem goes idle while we still have items in the CIL,
559they will be flushed by the periodic log force issued by the xfssyncd. This log
560force will push the CIL to disk, and if the transaction subsystem stays idle,
561allow the idle log to be covered (effectively marked clean) in exactly the same
562manner that is done for the existing logging method. A discussion point is
563whether this log force needs to be done more frequently than the current rate
564which is once every 30s.
565
566
567Delayed Logging: Log Item Pinning
568
569Currently log items are pinned during transaction commit while the items are
570still locked. This happens just after the items are formatted, though it could
571be done any time before the items are unlocked. The result of this mechanism is
572that items get pinned once for every transaction that is committed to the log
573buffers. Hence items that are relogged in the log buffers will have a pin count
574for every outstanding transaction they were dirtied in. When each of these
575transactions is completed, they will unpin the item once. As a result, the item
576only becomes unpinned when all the transactions complete and there are no
577pending transactions. Thus the pinning and unpinning of a log item is symmetric
578as there is a 1:1 relationship with transaction commit and log item completion.
579
580For delayed logging, however, we have an assymetric transaction commit to
581completion relationship. Every time an object is relogged in the CIL it goes
582through the commit process without a corresponding completion being registered.
583That is, we now have a many-to-one relationship between transaction commit and
584log item completion. The result of this is that pinning and unpinning of the
585log items becomes unbalanced if we retain the "pin on transaction commit, unpin
586on transaction completion" model.
587
588To keep pin/unpin symmetry, the algorithm needs to change to a "pin on
589insertion into the CIL, unpin on checkpoint completion". In other words, the
590pinning and unpinning becomes symmetric around a checkpoint context. We have to
591pin the object the first time it is inserted into the CIL - if it is already in
592the CIL during a transaction commit, then we do not pin it again. Because there
593can be multiple outstanding checkpoint contexts, we can still see elevated pin
594counts, but as each checkpoint completes the pin count will retain the correct
595value according to it's context.
596
597Just to make matters more slightly more complex, this checkpoint level context
598for the pin count means that the pinning of an item must take place under the
599CIL commit/flush lock. If we pin the object outside this lock, we cannot
600guarantee which context the pin count is associated with. This is because of
601the fact pinning the item is dependent on whether the item is present in the
602current CIL or not. If we don't pin the CIL first before we check and pin the
603object, we have a race with CIL being flushed between the check and the pin
604(or not pinning, as the case may be). Hence we must hold the CIL flush/commit
605lock to guarantee that we pin the items correctly.
606
607Delayed Logging: Concurrent Scalability
608
609A fundamental requirement for the CIL is that accesses through transaction
610commits must scale to many concurrent commits. The current transaction commit
611code does not break down even when there are transactions coming from 2048
612processors at once. The current transaction code does not go any faster than if
613there was only one CPU using it, but it does not slow down either.
614
615As a result, the delayed logging transaction commit code needs to be designed
616for concurrency from the ground up. It is obvious that there are serialisation
617points in the design - the three important ones are:
618
619 1. Locking out new transaction commits while flushing the CIL
620 2. Adding items to the CIL and updating item space accounting
621 3. Checkpoint commit ordering
622
623Looking at the transaction commit and CIL flushing interactions, it is clear
624that we have a many-to-one interaction here. That is, the only restriction on
625the number of concurrent transactions that can be trying to commit at once is
626the amount of space available in the log for their reservations. The practical
627limit here is in the order of several hundred concurrent transactions for a
628128MB log, which means that it is generally one per CPU in a machine.
629
630The amount of time a transaction commit needs to hold out a flush is a
631relatively long period of time - the pinning of log items needs to be done
632while we are holding out a CIL flush, so at the moment that means it is held
633across the formatting of the objects into memory buffers (i.e. while memcpy()s
634are in progress). Ultimately a two pass algorithm where the formatting is done
635separately to the pinning of objects could be used to reduce the hold time of
636the transaction commit side.
637
638Because of the number of potential transaction commit side holders, the lock
639really needs to be a sleeping lock - if the CIL flush takes the lock, we do not
640want every other CPU in the machine spinning on the CIL lock. Given that
641flushing the CIL could involve walking a list of tens of thousands of log
642items, it will get held for a significant time and so spin contention is a
643significant concern. Preventing lots of CPUs spinning doing nothing is the
644main reason for choosing a sleeping lock even though nothing in either the
645transaction commit or CIL flush side sleeps with the lock held.
646
647It should also be noted that CIL flushing is also a relatively rare operation
648compared to transaction commit for asynchronous transaction workloads - only
649time will tell if using a read-write semaphore for exclusion will limit
650transaction commit concurrency due to cache line bouncing of the lock on the
651read side.
652
653The second serialisation point is on the transaction commit side where items
654are inserted into the CIL. Because transactions can enter this code
655concurrently, the CIL needs to be protected separately from the above
656commit/flush exclusion. It also needs to be an exclusive lock but it is only
657held for a very short time and so a spin lock is appropriate here. It is
658possible that this lock will become a contention point, but given the short
659hold time once per transaction I think that contention is unlikely.
660
661The final serialisation point is the checkpoint commit record ordering code
662that is run as part of the checkpoint commit and log force sequencing. The code
663path that triggers a CIL flush (i.e. whatever triggers the log force) will enter
664an ordering loop after writing all the log vectors into the log buffers but
665before writing the commit record. This loop walks the list of committing
666checkpoints and needs to block waiting for checkpoints to complete their commit
667record write. As a result it needs a lock and a wait variable. Log force
668sequencing also requires the same lock, list walk, and blocking mechanism to
669ensure completion of checkpoints.
670
671These two sequencing operations can use the mechanism even though the
672events they are waiting for are different. The checkpoint commit record
673sequencing needs to wait until checkpoint contexts contain a commit LSN
674(obtained through completion of a commit record write) while log force
675sequencing needs to wait until previous checkpoint contexts are removed from
676the committing list (i.e. they've completed). A simple wait variable and
677broadcast wakeups (thundering herds) has been used to implement these two
678serialisation queues. They use the same lock as the CIL, too. If we see too
679much contention on the CIL lock, or too many context switches as a result of
680the broadcast wakeups these operations can be put under a new spinlock and
681given separate wait lists to reduce lock contention and the number of processes
682woken by the wrong event.
683
684
685Lifecycle Changes
686
687The existing log item life cycle is as follows:
688
689 1. Transaction allocate
690 2. Transaction reserve
691 3. Lock item
692 4. Join item to transaction
693 If not already attached,
694 Allocate log item
695 Attach log item to owner item
696 Attach log item to transaction
697 5. Modify item
698 Record modifications in log item
699 6. Transaction commit
700 Pin item in memory
701 Format item into log buffer
702 Write commit LSN into transaction
703 Unlock item
704 Attach transaction to log buffer
705
706 <log buffer IO dispatched>
707 <log buffer IO completes>
708
709 7. Transaction completion
710 Mark log item committed
711 Insert log item into AIL
712 Write commit LSN into log item
713 Unpin log item
714 8. AIL traversal
715 Lock item
716 Mark log item clean
717 Flush item to disk
718
719 <item IO completion>
720
721 9. Log item removed from AIL
722 Moves log tail
723 Item unlocked
724
725Essentially, steps 1-6 operate independently from step 7, which is also
726independent of steps 8-9. An item can be locked in steps 1-6 or steps 8-9
727at the same time step 7 is occurring, but only steps 1-6 or 8-9 can occur
728at the same time. If the log item is in the AIL or between steps 6 and 7
729and steps 1-6 are re-entered, then the item is relogged. Only when steps 8-9
730are entered and completed is the object considered clean.
731
732With delayed logging, there are new steps inserted into the life cycle:
733
734 1. Transaction allocate
735 2. Transaction reserve
736 3. Lock item
737 4. Join item to transaction
738 If not already attached,
739 Allocate log item
740 Attach log item to owner item
741 Attach log item to transaction
742 5. Modify item
743 Record modifications in log item
744 6. Transaction commit
745 Pin item in memory if not pinned in CIL
746 Format item into log vector + buffer
747 Attach log vector and buffer to log item
748 Insert log item into CIL
749 Write CIL context sequence into transaction
750 Unlock item
751
752 <next log force>
753
754 7. CIL push
755 lock CIL flush
756 Chain log vectors and buffers together
757 Remove items from CIL
758 unlock CIL flush
759 write log vectors into log
760 sequence commit records
761 attach checkpoint context to log buffer
762
763 <log buffer IO dispatched>
764 <log buffer IO completes>
765
766 8. Checkpoint completion
767 Mark log item committed
768 Insert item into AIL
769 Write commit LSN into log item
770 Unpin log item
771 9. AIL traversal
772 Lock item
773 Mark log item clean
774 Flush item to disk
775 <item IO completion>
776 10. Log item removed from AIL
777 Moves log tail
778 Item unlocked
779
780From this, it can be seen that the only life cycle differences between the two
781logging methods are in the middle of the life cycle - they still have the same
782beginning and end and execution constraints. The only differences are in the
783commiting of the log items to the log itself and the completion processing.
784Hence delayed logging should not introduce any constraints on log item
785behaviour, allocation or freeing that don't already exist.
786
787As a result of this zero-impact "insertion" of delayed logging infrastructure
788and the design of the internal structures to avoid on disk format changes, we
789can basically switch between delayed logging and the existing mechanism with a
790mount option. Fundamentally, there is no reason why the log manager would not
791be able to swap methods automatically and transparently depending on load
792characteristics, but this should not be necessary if delayed logging works as
793designed.
794
795Roadmap:
796
7972.6.35 Inclusion in mainline as an experimental mount option
798 => approximately 2-3 months to merge window
799 => needs to be in xfs-dev tree in 4-6 weeks
800 => code is nearing readiness for review
801
8022.6.37 Remove experimental tag from mount option
803 => should be roughly 6 months after initial merge
804 => enough time to:
805 => gain confidence and fix problems reported by early
806 adopters (a.k.a. guinea pigs)
807 => address worst performance regressions and undesired
808 behaviours
809 => start tuning/optimising code for parallelism
810 => start tuning/optimising algorithms consuming
811 excessive CPU time
812
8132.6.39 Switch default mount option to use delayed logging
814 => should be roughly 12 months after initial merge
815 => enough time to shake out remaining problems before next round of
816 enterprise distro kernel rebases
diff --git a/Documentation/hwmon/dme1737 b/Documentation/hwmon/dme1737
index 001d2e70bc11..fc5df7654d63 100644
--- a/Documentation/hwmon/dme1737
+++ b/Documentation/hwmon/dme1737
@@ -9,11 +9,15 @@ Supported chips:
9 * SMSC SCH3112, SCH3114, SCH3116 9 * SMSC SCH3112, SCH3114, SCH3116
10 Prefix: 'sch311x' 10 Prefix: 'sch311x'
11 Addresses scanned: none, address read from Super-I/O config space 11 Addresses scanned: none, address read from Super-I/O config space
12 Datasheet: http://www.nuhorizons.com/FeaturedProducts/Volume1/SMSC/311x.pdf 12 Datasheet: Available on the Internet
13 * SMSC SCH5027 13 * SMSC SCH5027
14 Prefix: 'sch5027' 14 Prefix: 'sch5027'
15 Addresses scanned: I2C 0x2c, 0x2d, 0x2e 15 Addresses scanned: I2C 0x2c, 0x2d, 0x2e
16 Datasheet: Provided by SMSC upon request and under NDA 16 Datasheet: Provided by SMSC upon request and under NDA
17 * SMSC SCH5127
18 Prefix: 'sch5127'
19 Addresses scanned: none, address read from Super-I/O config space
20 Datasheet: Provided by SMSC upon request and under NDA
17 21
18Authors: 22Authors:
19 Juerg Haefliger <juergh@gmail.com> 23 Juerg Haefliger <juergh@gmail.com>
@@ -36,8 +40,8 @@ Description
36----------- 40-----------
37 41
38This driver implements support for the hardware monitoring capabilities of the 42This driver implements support for the hardware monitoring capabilities of the
39SMSC DME1737 and Asus A8000 (which are the same), SMSC SCH5027, and SMSC 43SMSC DME1737 and Asus A8000 (which are the same), SMSC SCH5027, SCH311x,
40SCH311x Super-I/O chips. These chips feature monitoring of 3 temp sensors 44and SCH5127 Super-I/O chips. These chips feature monitoring of 3 temp sensors
41temp[1-3] (2 remote diodes and 1 internal), 7 voltages in[0-6] (6 external and 45temp[1-3] (2 remote diodes and 1 internal), 7 voltages in[0-6] (6 external and
421 internal) and up to 6 fan speeds fan[1-6]. Additionally, the chips implement 461 internal) and up to 6 fan speeds fan[1-6]. Additionally, the chips implement
43up to 5 PWM outputs pwm[1-3,5-6] for controlling fan speeds both manually and 47up to 5 PWM outputs pwm[1-3,5-6] for controlling fan speeds both manually and
@@ -48,14 +52,14 @@ Fan[3-6] and pwm[3,5-6] are optional features and their availability depends on
48the configuration of the chip. The driver will detect which features are 52the configuration of the chip. The driver will detect which features are
49present during initialization and create the sysfs attributes accordingly. 53present during initialization and create the sysfs attributes accordingly.
50 54
51For the SCH311x, fan[1-3] and pwm[1-3] are always present and fan[4-6] and 55For the SCH311x and SCH5127, fan[1-3] and pwm[1-3] are always present and
52pwm[5-6] don't exist. 56fan[4-6] and pwm[5-6] don't exist.
53 57
54The hardware monitoring features of the DME1737, A8000, and SCH5027 are only 58The hardware monitoring features of the DME1737, A8000, and SCH5027 are only
55accessible via SMBus, while the SCH311x only provides access via the ISA bus. 59accessible via SMBus, while the SCH311x and SCH5127 only provide access via
56The driver will therefore register itself as an I2C client driver if it detects 60the ISA bus. The driver will therefore register itself as an I2C client driver
57a DME1737, A8000, or SCH5027 and as a platform driver if it detects a SCH311x 61if it detects a DME1737, A8000, or SCH5027 and as a platform driver if it
58chip. 62detects a SCH311x or SCH5127 chip.
59 63
60 64
61Voltage Monitoring 65Voltage Monitoring
@@ -76,7 +80,7 @@ DME1737, A8000:
76 in6: Vbat (+3.0V) 0V - 4.38V 80 in6: Vbat (+3.0V) 0V - 4.38V
77 81
78SCH311x: 82SCH311x:
79 in0: +2.5V 0V - 6.64V 83 in0: +2.5V 0V - 3.32V
80 in1: Vccp (processor core) 0V - 2V 84 in1: Vccp (processor core) 0V - 2V
81 in2: VCC (internal +3.3V) 0V - 4.38V 85 in2: VCC (internal +3.3V) 0V - 4.38V
82 in3: +5V 0V - 6.64V 86 in3: +5V 0V - 6.64V
@@ -93,6 +97,15 @@ SCH5027:
93 in5: VTR (+3.3V standby) 0V - 4.38V 97 in5: VTR (+3.3V standby) 0V - 4.38V
94 in6: Vbat (+3.0V) 0V - 4.38V 98 in6: Vbat (+3.0V) 0V - 4.38V
95 99
100SCH5127:
101 in0: +2.5 0V - 3.32V
102 in1: Vccp (processor core) 0V - 3V
103 in2: VCC (internal +3.3V) 0V - 4.38V
104 in3: V2_IN 0V - 1.5V
105 in4: V1_IN 0V - 1.5V
106 in5: VTR (+3.3V standby) 0V - 4.38V
107 in6: Vbat (+3.0V) 0V - 4.38V
108
96Each voltage input has associated min and max limits which trigger an alarm 109Each voltage input has associated min and max limits which trigger an alarm
97when crossed. 110when crossed.
98 111
@@ -293,3 +306,21 @@ pwm[1-3]_auto_point1_pwm RW Auto PWM pwm point. Auto_point1 is the
293pwm[1-3]_auto_point2_pwm RO Auto PWM pwm point. Auto_point2 is the 306pwm[1-3]_auto_point2_pwm RO Auto PWM pwm point. Auto_point2 is the
294 full-speed duty-cycle which is hard- 307 full-speed duty-cycle which is hard-
295 wired to 255 (100% duty-cycle). 308 wired to 255 (100% duty-cycle).
309
310Chip Differences
311----------------
312
313Feature dme1737 sch311x sch5027 sch5127
314-------------------------------------------------------
315temp[1-3]_offset yes yes
316vid yes
317zone3 yes yes yes
318zone[1-3]_hyst yes yes
319pwm min/off yes yes
320fan3 opt yes opt yes
321pwm3 opt yes opt yes
322fan4 opt opt
323fan5 opt opt
324pwm5 opt opt
325fan6 opt opt
326pwm6 opt opt
diff --git a/Documentation/hwmon/lm63 b/Documentation/hwmon/lm63
index 31660bf97979..b9843eab1afb 100644
--- a/Documentation/hwmon/lm63
+++ b/Documentation/hwmon/lm63
@@ -7,6 +7,11 @@ Supported chips:
7 Addresses scanned: I2C 0x4c 7 Addresses scanned: I2C 0x4c
8 Datasheet: Publicly available at the National Semiconductor website 8 Datasheet: Publicly available at the National Semiconductor website
9 http://www.national.com/pf/LM/LM63.html 9 http://www.national.com/pf/LM/LM63.html
10 * National Semiconductor LM64
11 Prefix: 'lm64'
12 Addresses scanned: I2C 0x18 and 0x4e
13 Datasheet: Publicly available at the National Semiconductor website
14 http://www.national.com/pf/LM/LM64.html
10 15
11Author: Jean Delvare <khali@linux-fr.org> 16Author: Jean Delvare <khali@linux-fr.org>
12 17
@@ -55,3 +60,5 @@ The lm63 driver will not update its values more frequently than every
55second; reading them more often will do no harm, but will return 'old' 60second; reading them more often will do no harm, but will return 'old'
56values. 61values.
57 62
63The LM64 is effectively an LM63 with GPIO lines. The driver does not
64support these GPIO lines at present.
diff --git a/Documentation/hwmon/lm85 b/Documentation/hwmon/lm85
index a13680871bc7..a76aefeeb68a 100644
--- a/Documentation/hwmon/lm85
+++ b/Documentation/hwmon/lm85
@@ -157,7 +157,7 @@ temperature configuration points:
157 157
158There are three PWM outputs. The LM85 datasheet suggests that the 158There are three PWM outputs. The LM85 datasheet suggests that the
159pwm3 output control both fan3 and fan4. Each PWM can be individually 159pwm3 output control both fan3 and fan4. Each PWM can be individually
160configured and assigned to a zone for it's control value. Each PWM can be 160configured and assigned to a zone for its control value. Each PWM can be
161configured individually according to the following options. 161configured individually according to the following options.
162 162
163* pwm#_auto_pwm_min - this specifies the PWM value for temp#_auto_temp_off 163* pwm#_auto_pwm_min - this specifies the PWM value for temp#_auto_temp_off
diff --git a/Documentation/hwmon/ltc4245 b/Documentation/hwmon/ltc4245
index 02838a47d862..86b5880d8502 100644
--- a/Documentation/hwmon/ltc4245
+++ b/Documentation/hwmon/ltc4245
@@ -72,9 +72,7 @@ in6_min_alarm 5v output undervoltage alarm
72in7_min_alarm 3v output undervoltage alarm 72in7_min_alarm 3v output undervoltage alarm
73in8_min_alarm Vee (-12v) output undervoltage alarm 73in8_min_alarm Vee (-12v) output undervoltage alarm
74 74
75in9_input GPIO #1 voltage data 75in9_input GPIO voltage data
76in10_input GPIO #2 voltage data
77in11_input GPIO #3 voltage data
78 76
79power1_input 12v power usage (mW) 77power1_input 12v power usage (mW)
80power2_input 5v power usage (mW) 78power2_input 5v power usage (mW)
diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface
index 3de6b0bcb147..d4e2917c6f18 100644
--- a/Documentation/hwmon/sysfs-interface
+++ b/Documentation/hwmon/sysfs-interface
@@ -80,9 +80,9 @@ All entries (except name) are optional, and should only be created in a
80given driver if the chip has the feature. 80given driver if the chip has the feature.
81 81
82 82
83******** 83*********************
84* Name * 84* Global attributes *
85******** 85*********************
86 86
87name The chip name. 87name The chip name.
88 This should be a short, lowercase string, not containing 88 This should be a short, lowercase string, not containing
@@ -91,6 +91,13 @@ name The chip name.
91 I2C devices get this attribute created automatically. 91 I2C devices get this attribute created automatically.
92 RO 92 RO
93 93
94update_rate The rate at which the chip will update readings.
95 Unit: millisecond
96 RW
97 Some devices have a variable update rate. This attribute
98 can be used to change the update rate to the desired
99 frequency.
100
94 101
95************ 102************
96* Voltages * 103* Voltages *
diff --git a/Documentation/hwmon/tmp102 b/Documentation/hwmon/tmp102
new file mode 100644
index 000000000000..8454a7763122
--- /dev/null
+++ b/Documentation/hwmon/tmp102
@@ -0,0 +1,26 @@
1Kernel driver tmp102
2====================
3
4Supported chips:
5 * Texas Instruments TMP102
6 Prefix: 'tmp102'
7 Addresses scanned: none
8 Datasheet: http://focus.ti.com/docs/prod/folders/print/tmp102.html
9
10Author:
11 Steven King <sfking@fdwdc.com>
12
13Description
14-----------
15
16The Texas Instruments TMP102 implements one temperature sensor. Limits can be
17set through the Overtemperature Shutdown register and Hysteresis register. The
18sensor is accurate to 0.5 degree over the range of -25 to +85 C, and to 1.0
19degree from -40 to +125 C. Resolution of the sensor is 0.0625 degree. The
20operating temperature has a minimum of -55 C and a maximum of +150 C.
21
22The TMP102 has a programmable update rate that can select between 8, 4, 1, and
230.5 Hz. (Currently the driver only supports the default of 4 Hz).
24
25The driver provides the common sysfs-interface for temperatures (see
26Documentation/hwmon/sysfs-interface under Temperatures).
diff --git a/Documentation/i2c/busses/i2c-i801 b/Documentation/i2c/busses/i2c-i801
index e1bb5b261693..e307914a3eda 100644
--- a/Documentation/i2c/busses/i2c-i801
+++ b/Documentation/i2c/busses/i2c-i801
@@ -27,7 +27,13 @@ Authors:
27Module Parameters 27Module Parameters
28----------------- 28-----------------
29 29
30None. 30* disable_features (bit vector)
31Disable selected features normally supported by the device. This makes it
32possible to work around possible driver or hardware bugs if the feature in
33question doesn't work as intended for whatever reason. Bit values:
34 1 disable SMBus PEC
35 2 disable the block buffer
36 8 disable the I2C block read functionality
31 37
32 38
33Description 39Description
diff --git a/Documentation/input/joystick.txt b/Documentation/input/joystick.txt
index 154d767b2acb..8007b7ca87bf 100644
--- a/Documentation/input/joystick.txt
+++ b/Documentation/input/joystick.txt
@@ -402,7 +402,7 @@ for the port of the SoundFusion is supported by the cs461x.c module.
402~~~~~~~~~~~~~~~~~~~~~~~~ 402~~~~~~~~~~~~~~~~~~~~~~~~
403 The Live! has a special PCI gameport, which, although it doesn't provide 403 The Live! has a special PCI gameport, which, although it doesn't provide
404any "Enhanced" stuff like 4DWave and friends, is quite a bit faster than 404any "Enhanced" stuff like 4DWave and friends, is quite a bit faster than
405it's ISA counterparts. It also requires special support, hence the 405its ISA counterparts. It also requires special support, hence the
406emu10k1-gp.c module for it instead of the normal ns558.c one. 406emu10k1-gp.c module for it instead of the normal ns558.c one.
407 407
4083.15 SoundBlaster 64 and 128 - ES1370 and ES1371, ESS Solo1 and S3 SonicVibes 4083.15 SoundBlaster 64 and 128 - ES1370 and ES1371, ESS Solo1 and S3 SonicVibes
diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt
index f40a1f030019..5dc59b04a71f 100644
--- a/Documentation/intel_txt.txt
+++ b/Documentation/intel_txt.txt
@@ -126,7 +126,7 @@ o Tboot then applies an (optional) user-defined launch policy to
126o Tboot adjusts the e820 table provided by the bootloader to reserve 126o Tboot adjusts the e820 table provided by the bootloader to reserve
127 its own location in memory as well as to reserve certain other 127 its own location in memory as well as to reserve certain other
128 TXT-related regions. 128 TXT-related regions.
129o As part of it's launch, tboot DMA protects all of RAM (using the 129o As part of its launch, tboot DMA protects all of RAM (using the
130 VT-d PMRs). Thus, the kernel must be booted with 'intel_iommu=on' 130 VT-d PMRs). Thus, the kernel must be booted with 'intel_iommu=on'
131 in order to remove this blanket protection and use VT-d's 131 in order to remove this blanket protection and use VT-d's
132 page-level protection. 132 page-level protection.
@@ -161,13 +161,15 @@ o In order to put a system into any of the sleep states after a TXT
161 has been restored, it will restore the TPM PCRs and then 161 has been restored, it will restore the TPM PCRs and then
162 transfer control back to the kernel's S3 resume vector. 162 transfer control back to the kernel's S3 resume vector.
163 In order to preserve system integrity across S3, the kernel 163 In order to preserve system integrity across S3, the kernel
164 provides tboot with a set of memory ranges (kernel 164 provides tboot with a set of memory ranges (RAM and RESERVED_KERN
165 code/data/bss, S3 resume code, and AP trampoline) that tboot 165 in the e820 table, but not any memory that BIOS might alter over
166 will calculate a MAC (message authentication code) over and then 166 the S3 transition) that tboot will calculate a MAC (message
167 seal with the TPM. On resume and once the measured environment 167 authentication code) over and then seal with the TPM. On resume
168 has been re-established, tboot will re-calculate the MAC and 168 and once the measured environment has been re-established, tboot
169 verify it against the sealed value. Tboot's policy determines 169 will re-calculate the MAC and verify it against the sealed value.
170 what happens if the verification fails. 170 Tboot's policy determines what happens if the verification fails.
171 Note that the c/s 194 of tboot which has the new MAC code supports
172 this.
171 173
172That's pretty much it for TXT support. 174That's pretty much it for TXT support.
173 175
diff --git a/Documentation/kbuild/kbuild.txt b/Documentation/kbuild/kbuild.txt
index 6f8c1cabbc5d..634c625da8ce 100644
--- a/Documentation/kbuild/kbuild.txt
+++ b/Documentation/kbuild/kbuild.txt
@@ -65,7 +65,7 @@ CROSS_COMPILE
65Specify an optional fixed part of the binutils filename. 65Specify an optional fixed part of the binutils filename.
66CROSS_COMPILE can be a part of the filename or the full path. 66CROSS_COMPILE can be a part of the filename or the full path.
67 67
68CROSS_COMPILE is also used for ccache is some setups. 68CROSS_COMPILE is also used for ccache in some setups.
69 69
70CF 70CF
71-------------------------------------------------- 71--------------------------------------------------
@@ -162,3 +162,7 @@ For tags/TAGS/cscope targets, you can specify more than one arch
162to be included in the databases, separated by blank space. E.g.: 162to be included in the databases, separated by blank space. E.g.:
163 163
164 $ make ALLSOURCE_ARCHS="x86 mips arm" tags 164 $ make ALLSOURCE_ARCHS="x86 mips arm" tags
165
166To get all available archs you can also specify all. E.g.:
167
168 $ make ALLSOURCE_ARCHS=all tags
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt
index c412c245848f..b472e4e0ba67 100644
--- a/Documentation/kbuild/kconfig-language.txt
+++ b/Documentation/kbuild/kconfig-language.txt
@@ -181,7 +181,7 @@ Expressions are listed in decreasing order of precedence.
181(7) Returns the result of max(/expr/, /expr/). 181(7) Returns the result of max(/expr/, /expr/).
182 182
183An expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2 183An expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2
184respectively for calculations). A menu entry becomes visible when it's 184respectively for calculations). A menu entry becomes visible when its
185expression evaluates to 'm' or 'y'. 185expression evaluates to 'm' or 'y'.
186 186
187There are two types of symbols: constant and non-constant symbols. 187There are two types of symbols: constant and non-constant symbols.
diff --git a/Documentation/kbuild/kconfig.txt b/Documentation/kbuild/kconfig.txt
index 49efae703979..b2cb16ebcb16 100644
--- a/Documentation/kbuild/kconfig.txt
+++ b/Documentation/kbuild/kconfig.txt
@@ -96,7 +96,7 @@ Environment variables for 'silentoldconfig'
96KCONFIG_NOSILENTUPDATE 96KCONFIG_NOSILENTUPDATE
97-------------------------------------------------- 97--------------------------------------------------
98If this variable has a non-blank value, it prevents silent kernel 98If this variable has a non-blank value, it prevents silent kernel
99config udpates (requires explicit updates). 99config updates (requires explicit updates).
100 100
101KCONFIG_AUTOCONFIG 101KCONFIG_AUTOCONFIG
102-------------------------------------------------- 102--------------------------------------------------
diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt
index 28cdc2af2131..ec8d31ee12e0 100644
--- a/Documentation/kernel-docs.txt
+++ b/Documentation/kernel-docs.txt
@@ -116,7 +116,7 @@
116 Author: Ingo Molnar, Gadi Oxman and Miguel de Icaza. 116 Author: Ingo Molnar, Gadi Oxman and Miguel de Icaza.
117 URL: http://www.linuxjournal.com/article.php?sid=2391 117 URL: http://www.linuxjournal.com/article.php?sid=2391
118 Keywords: RAID, MD driver. 118 Keywords: RAID, MD driver.
119 Description: Linux Journal Kernel Korner article. Here is it's 119 Description: Linux Journal Kernel Korner article. Here is its
120 abstract: "A description of the implementation of the RAID-1, 120 abstract: "A description of the implementation of the RAID-1,
121 RAID-4 and RAID-5 personalities of the MD device driver in the 121 RAID-4 and RAID-5 personalities of the MD device driver in the
122 Linux kernel, providing users with high performance and reliable, 122 Linux kernel, providing users with high performance and reliable,
@@ -127,7 +127,7 @@
127 URL: http://www.linuxjournal.com/article.php?sid=1219 127 URL: http://www.linuxjournal.com/article.php?sid=1219
128 Keywords: device driver, module, loading/unloading modules, 128 Keywords: device driver, module, loading/unloading modules,
129 allocating resources. 129 allocating resources.
130 Description: Linux Journal Kernel Korner article. Here is it's 130 Description: Linux Journal Kernel Korner article. Here is its
131 abstract: "This is the first of a series of four articles 131 abstract: "This is the first of a series of four articles
132 co-authored by Alessandro Rubini and Georg Zezchwitz which present 132 co-authored by Alessandro Rubini and Georg Zezchwitz which present
133 a practical approach to writing Linux device drivers as kernel 133 a practical approach to writing Linux device drivers as kernel
@@ -141,7 +141,7 @@
141 Keywords: character driver, init_module, clean_up module, 141 Keywords: character driver, init_module, clean_up module,
142 autodetection, mayor number, minor number, file operations, 142 autodetection, mayor number, minor number, file operations,
143 open(), close(). 143 open(), close().
144 Description: Linux Journal Kernel Korner article. Here is it's 144 Description: Linux Journal Kernel Korner article. Here is its
145 abstract: "This article, the second of four, introduces part of 145 abstract: "This article, the second of four, introduces part of
146 the actual code to create custom module implementing a character 146 the actual code to create custom module implementing a character
147 device driver. It describes the code for module initialization and 147 device driver. It describes the code for module initialization and
@@ -152,7 +152,7 @@
152 URL: http://www.linuxjournal.com/article.php?sid=1221 152 URL: http://www.linuxjournal.com/article.php?sid=1221
153 Keywords: read(), write(), select(), ioctl(), blocking/non 153 Keywords: read(), write(), select(), ioctl(), blocking/non
154 blocking mode, interrupt handler. 154 blocking mode, interrupt handler.
155 Description: Linux Journal Kernel Korner article. Here is it's 155 Description: Linux Journal Kernel Korner article. Here is its
156 abstract: "This article, the third of four on writing character 156 abstract: "This article, the third of four on writing character
157 device drivers, introduces concepts of reading, writing, and using 157 device drivers, introduces concepts of reading, writing, and using
158 ioctl-calls". 158 ioctl-calls".
@@ -161,7 +161,7 @@
161 Author: Alessandro Rubini and Georg v. Zezschwitz. 161 Author: Alessandro Rubini and Georg v. Zezschwitz.
162 URL: http://www.linuxjournal.com/article.php?sid=1222 162 URL: http://www.linuxjournal.com/article.php?sid=1222
163 Keywords: interrupts, irqs, DMA, bottom halves, task queues. 163 Keywords: interrupts, irqs, DMA, bottom halves, task queues.
164 Description: Linux Journal Kernel Korner article. Here is it's 164 Description: Linux Journal Kernel Korner article. Here is its
165 abstract: "This is the fourth in a series of articles about 165 abstract: "This is the fourth in a series of articles about
166 writing character device drivers as loadable kernel modules. This 166 writing character device drivers as loadable kernel modules. This
167 month, we further investigate the field of interrupt handling. 167 month, we further investigate the field of interrupt handling.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 839b21b0699a..1808f1157f30 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -58,6 +58,7 @@ parameter is applicable:
58 ISAPNP ISA PnP code is enabled. 58 ISAPNP ISA PnP code is enabled.
59 ISDN Appropriate ISDN support is enabled. 59 ISDN Appropriate ISDN support is enabled.
60 JOY Appropriate joystick support is enabled. 60 JOY Appropriate joystick support is enabled.
61 KGDB Kernel debugger support is enabled.
61 KVM Kernel Virtual Machine support is enabled. 62 KVM Kernel Virtual Machine support is enabled.
62 LIBATA Libata driver is enabled 63 LIBATA Libata driver is enabled
63 LP Printer support is enabled. 64 LP Printer support is enabled.
@@ -99,6 +100,7 @@ parameter is applicable:
99 SWSUSP Software suspend (hibernation) is enabled. 100 SWSUSP Software suspend (hibernation) is enabled.
100 SUSPEND System suspend states are enabled. 101 SUSPEND System suspend states are enabled.
101 FTRACE Function tracing enabled. 102 FTRACE Function tracing enabled.
103 TPM TPM drivers are enabled.
102 TS Appropriate touchscreen support is enabled. 104 TS Appropriate touchscreen support is enabled.
103 UMS USB Mass Storage support is enabled. 105 UMS USB Mass Storage support is enabled.
104 USB USB support is enabled. 106 USB USB support is enabled.
@@ -143,14 +145,14 @@ and is between 256 and 4096 characters. It is defined in the file
143 145
144 acpi= [HW,ACPI,X86] 146 acpi= [HW,ACPI,X86]
145 Advanced Configuration and Power Interface 147 Advanced Configuration and Power Interface
146 Format: { force | off | ht | strict | noirq | rsdt } 148 Format: { force | off | strict | noirq | rsdt }
147 force -- enable ACPI if default was off 149 force -- enable ACPI if default was off
148 off -- disable ACPI if default was on 150 off -- disable ACPI if default was on
149 noirq -- do not use ACPI for IRQ routing 151 noirq -- do not use ACPI for IRQ routing
150 ht -- run only enough ACPI to enable Hyper Threading
151 strict -- Be less tolerant of platforms that are not 152 strict -- Be less tolerant of platforms that are not
152 strictly ACPI specification compliant. 153 strictly ACPI specification compliant.
153 rsdt -- prefer RSDT over (default) XSDT 154 rsdt -- prefer RSDT over (default) XSDT
155 copy_dsdt -- copy DSDT to memory
154 156
155 See also Documentation/power/pm.txt, pci=noacpi 157 See also Documentation/power/pm.txt, pci=noacpi
156 158
@@ -287,9 +289,6 @@ and is between 256 and 4096 characters. It is defined in the file
287 advansys= [HW,SCSI] 289 advansys= [HW,SCSI]
288 See header of drivers/scsi/advansys.c. 290 See header of drivers/scsi/advansys.c.
289 291
290 advwdt= [HW,WDT] Advantech WDT
291 Format: <iostart>,<iostop>
292
293 aedsp16= [HW,OSS] Audio Excel DSP 16 292 aedsp16= [HW,OSS] Audio Excel DSP 16
294 Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq> 293 Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq>
295 See also header of sound/oss/aedsp16.c. 294 See also header of sound/oss/aedsp16.c.
@@ -324,6 +323,8 @@ and is between 256 and 4096 characters. It is defined in the file
324 they are unmapped. Otherwise they are 323 they are unmapped. Otherwise they are
325 flushed before they will be reused, which 324 flushed before they will be reused, which
326 is a lot of faster 325 is a lot of faster
326 off - do not initialize any AMD IOMMU found in
327 the system
327 328
328 amijoy.map= [HW,JOY] Amiga joystick support 329 amijoy.map= [HW,JOY] Amiga joystick support
329 Map of devices attached to JOY0DAT and JOY1DAT 330 Map of devices attached to JOY0DAT and JOY1DAT
@@ -708,6 +709,12 @@ and is between 256 and 4096 characters. It is defined in the file
708 The VGA output is eventually overwritten by the real 709 The VGA output is eventually overwritten by the real
709 console. 710 console.
710 711
712 ekgdboc= [X86,KGDB] Allow early kernel console debugging
713 ekgdboc=kbd
714
715 This is desgined to be used in conjunction with
716 the boot argument: earlyprintk=vga
717
711 eata= [HW,SCSI] 718 eata= [HW,SCSI]
712 719
713 edd= [EDD] 720 edd= [EDD]
@@ -750,13 +757,14 @@ and is between 256 and 4096 characters. It is defined in the file
750 Default value is 0. 757 Default value is 0.
751 Value can be changed at runtime via /selinux/enforce. 758 Value can be changed at runtime via /selinux/enforce.
752 759
760 erst_disable [ACPI]
761 Disable Error Record Serialization Table (ERST)
762 support.
763
753 ether= [HW,NET] Ethernet cards parameters 764 ether= [HW,NET] Ethernet cards parameters
754 This option is obsoleted by the "netdev=" option, which 765 This option is obsoleted by the "netdev=" option, which
755 has equivalent usage. See its documentation for details. 766 has equivalent usage. See its documentation for details.
756 767
757 eurwdt= [HW,WDT] Eurotech CPU-1220/1410 onboard watchdog.
758 Format: <io>[,<irq>]
759
760 failslab= 768 failslab=
761 fail_page_alloc= 769 fail_page_alloc=
762 fail_make_request=[KNL] 770 fail_make_request=[KNL]
@@ -784,8 +792,12 @@ and is between 256 and 4096 characters. It is defined in the file
784 as early as possible in order to facilitate early 792 as early as possible in order to facilitate early
785 boot debugging. 793 boot debugging.
786 794
787 ftrace_dump_on_oops 795 ftrace_dump_on_oops[=orig_cpu]
788 [FTRACE] will dump the trace buffers on oops. 796 [FTRACE] will dump the trace buffers on oops.
797 If no parameter is passed, ftrace will dump
798 buffers of all CPUs, but if you pass orig_cpu, it will
799 dump only the buffer of the CPU that triggered the
800 oops.
789 801
790 ftrace_filter=[function-list] 802 ftrace_filter=[function-list]
791 [FTRACE] Limit the functions traced by the function 803 [FTRACE] Limit the functions traced by the function
@@ -843,6 +855,11 @@ and is between 256 and 4096 characters. It is defined in the file
843 hd= [EIDE] (E)IDE hard drive subsystem geometry 855 hd= [EIDE] (E)IDE hard drive subsystem geometry
844 Format: <cyl>,<head>,<sect> 856 Format: <cyl>,<head>,<sect>
845 857
858 hest_disable [ACPI]
859 Disable Hardware Error Source Table (HEST) support;
860 corresponding firmware-first mode error processing
861 logic will be disabled.
862
846 highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact 863 highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact
847 size of <nn>. This works even on boxes that have no 864 size of <nn>. This works even on boxes that have no
848 highmem otherwise. This also works to reduce highmem 865 highmem otherwise. This also works to reduce highmem
@@ -1112,10 +1129,26 @@ and is between 256 and 4096 characters. It is defined in the file
1112 use the HighMem zone if it exists, and the Normal 1129 use the HighMem zone if it exists, and the Normal
1113 zone if it does not. 1130 zone if it does not.
1114 1131
1115 kgdboc= [HW] kgdb over consoles. 1132 kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port.
1116 Requires a tty driver that supports console polling. 1133 Format: <Controller#>[,poll interval]
1117 (only serial supported for now) 1134 The controller # is the number of the ehci usb debug
1118 Format: <serial_device>[,baud] 1135 port as it is probed via PCI. The poll interval is
1136 optional and is the number seconds in between
1137 each poll cycle to the debug port in case you need
1138 the functionality for interrupting the kernel with
1139 gdb or control-c on the dbgp connection. When
1140 not using this parameter you use sysrq-g to break into
1141 the kernel debugger.
1142
1143 kgdboc= [KGDB,HW] kgdb over consoles.
1144 Requires a tty driver that supports console polling,
1145 or a supported polling keyboard driver (non-usb).
1146 Serial only format: <serial_device>[,baud]
1147 keyboard only format: kbd
1148 keyboard and serial format: kbd,<serial_device>[,baud]
1149
1150 kgdbwait [KGDB] Stop kernel execution and enter the
1151 kernel debugger at the earliest opportunity.
1119 1152
1120 kmac= [MIPS] korina ethernet MAC address. 1153 kmac= [MIPS] korina ethernet MAC address.
1121 Configure the RouterBoard 532 series on-chip 1154 Configure the RouterBoard 532 series on-chip
@@ -1227,6 +1260,8 @@ and is between 256 and 4096 characters. It is defined in the file
1227 * nohrst, nosrst, norst: suppress hard, soft 1260 * nohrst, nosrst, norst: suppress hard, soft
1228 and both resets. 1261 and both resets.
1229 1262
1263 * dump_id: dump IDENTIFY data.
1264
1230 If there are multiple matching configurations changing 1265 If there are multiple matching configurations changing
1231 the same attribute, the last one is used. 1266 the same attribute, the last one is used.
1232 1267
@@ -2236,9 +2271,6 @@ and is between 256 and 4096 characters. It is defined in the file
2236 2271
2237 sched_debug [KNL] Enables verbose scheduler debug messages. 2272 sched_debug [KNL] Enables verbose scheduler debug messages.
2238 2273
2239 sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver
2240 Format: <io>[,<timeout>[,<isapnp>]]
2241
2242 scsi_debug_*= [SCSI] 2274 scsi_debug_*= [SCSI]
2243 See drivers/scsi/scsi_debug.c. 2275 See drivers/scsi/scsi_debug.c.
2244 2276
@@ -2610,6 +2642,15 @@ and is between 256 and 4096 characters. It is defined in the file
2610 2642
2611 tp720= [HW,PS2] 2643 tp720= [HW,PS2]
2612 2644
2645 tpm_suspend_pcr=[HW,TPM]
2646 Format: integer pcr id
2647 Specify that at suspend time, the tpm driver
2648 should extend the specified pcr with zeros,
2649 as a workaround for some chips which fail to
2650 flush the last written pcr on TPM_SaveState.
2651 This will guarantee that all the other pcrs
2652 are saved.
2653
2613 trace_buf_size=nn[KMG] 2654 trace_buf_size=nn[KMG]
2614 [FTRACE] will set tracing buffer size. 2655 [FTRACE] will set tracing buffer size.
2615 2656
@@ -2818,8 +2859,10 @@ and is between 256 and 4096 characters. It is defined in the file
2818 wd7000= [HW,SCSI] 2859 wd7000= [HW,SCSI]
2819 See header of drivers/scsi/wd7000.c. 2860 See header of drivers/scsi/wd7000.c.
2820 2861
2821 wdt= [WDT] Watchdog 2862 watchdog timers [HW,WDT] For information on watchdog timers,
2822 See Documentation/watchdog/wdt.txt. 2863 see Documentation/watchdog/watchdog-parameters.txt
2864 or other driver-specific files in the
2865 Documentation/watchdog/ directory.
2823 2866
2824 x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of 2867 x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
2825 default x2apic cluster mode on platforms 2868 default x2apic cluster mode on platforms
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 2f9115c0ae62..6653017680dd 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -165,8 +165,8 @@ the user entry_handler invocation is also skipped.
165 165
1661.4 How Does Jump Optimization Work? 1661.4 How Does Jump Optimization Work?
167 167
168If you configured your kernel with CONFIG_OPTPROBES=y (currently 168If your kernel is built with CONFIG_OPTPROBES=y (currently this flag
169this option is supported on x86/x86-64, non-preemptive kernel) and 169is automatically set 'y' on x86/x86-64, non-preemptive kernel) and
170the "debug.kprobes_optimization" kernel parameter is set to 1 (see 170the "debug.kprobes_optimization" kernel parameter is set to 1 (see
171sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump 171sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump
172instruction instead of a breakpoint instruction at each probepoint. 172instruction instead of a breakpoint instruction at each probepoint.
@@ -271,8 +271,6 @@ tweak the kernel's execution path, you need to suppress optimization,
271using one of the following techniques: 271using one of the following techniques:
272- Specify an empty function for the kprobe's post_handler or break_handler. 272- Specify an empty function for the kprobe's post_handler or break_handler.
273 or 273 or
274- Config CONFIG_OPTPROBES=n.
275 or
276- Execute 'sysctl -w debug.kprobes_optimization=n' 274- Execute 'sysctl -w debug.kprobes_optimization=n'
277 275
2782. Architectures Supported 2762. Architectures Supported
@@ -307,10 +305,6 @@ it useful to "Compile the kernel with debug info" (CONFIG_DEBUG_INFO),
307so you can use "objdump -d -l vmlinux" to see the source-to-object 305so you can use "objdump -d -l vmlinux" to see the source-to-object
308code mapping. 306code mapping.
309 307
310If you want to reduce probing overhead, set "Kprobes jump optimization
311support" (CONFIG_OPTPROBES) to "y". You can find this option under the
312"Kprobes" line.
313
3144. API Reference 3084. API Reference
315 309
316The Kprobes API includes a "register" function and an "unregister" 310The Kprobes API includes a "register" function and an "unregister"
@@ -332,7 +326,7 @@ occurs during execution of kp->pre_handler or kp->post_handler,
332or during single-stepping of the probed instruction, Kprobes calls 326or during single-stepping of the probed instruction, Kprobes calls
333kp->fault_handler. Any or all handlers can be NULL. If kp->flags 327kp->fault_handler. Any or all handlers can be NULL. If kp->flags
334is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled, 328is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled,
335so, it's handlers aren't hit until calling enable_kprobe(kp). 329so, its handlers aren't hit until calling enable_kprobe(kp).
336 330
337NOTE: 331NOTE:
3381. With the introduction of the "symbol_name" field to struct kprobe, 3321. With the introduction of the "symbol_name" field to struct kprobe,
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index c6416a398163..a237518e51b9 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -656,6 +656,7 @@ struct kvm_clock_data {
6564.29 KVM_GET_VCPU_EVENTS 6564.29 KVM_GET_VCPU_EVENTS
657 657
658Capability: KVM_CAP_VCPU_EVENTS 658Capability: KVM_CAP_VCPU_EVENTS
659Extended by: KVM_CAP_INTR_SHADOW
659Architectures: x86 660Architectures: x86
660Type: vm ioctl 661Type: vm ioctl
661Parameters: struct kvm_vcpu_event (out) 662Parameters: struct kvm_vcpu_event (out)
@@ -676,7 +677,7 @@ struct kvm_vcpu_events {
676 __u8 injected; 677 __u8 injected;
677 __u8 nr; 678 __u8 nr;
678 __u8 soft; 679 __u8 soft;
679 __u8 pad; 680 __u8 shadow;
680 } interrupt; 681 } interrupt;
681 struct { 682 struct {
682 __u8 injected; 683 __u8 injected;
@@ -688,9 +689,13 @@ struct kvm_vcpu_events {
688 __u32 flags; 689 __u32 flags;
689}; 690};
690 691
692KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
693interrupt.shadow contains a valid state. Otherwise, this field is undefined.
694
6914.30 KVM_SET_VCPU_EVENTS 6954.30 KVM_SET_VCPU_EVENTS
692 696
693Capability: KVM_CAP_VCPU_EVENTS 697Capability: KVM_CAP_VCPU_EVENTS
698Extended by: KVM_CAP_INTR_SHADOW
694Architectures: x86 699Architectures: x86
695Type: vm ioctl 700Type: vm ioctl
696Parameters: struct kvm_vcpu_event (in) 701Parameters: struct kvm_vcpu_event (in)
@@ -709,6 +714,183 @@ current in-kernel state. The bits are:
709KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel 714KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel
710KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector 715KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector
711 716
717If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in
718the flags field to signal that interrupt.shadow contains a valid state and
719shall be written into the VCPU.
720
7214.32 KVM_GET_DEBUGREGS
722
723Capability: KVM_CAP_DEBUGREGS
724Architectures: x86
725Type: vm ioctl
726Parameters: struct kvm_debugregs (out)
727Returns: 0 on success, -1 on error
728
729Reads debug registers from the vcpu.
730
731struct kvm_debugregs {
732 __u64 db[4];
733 __u64 dr6;
734 __u64 dr7;
735 __u64 flags;
736 __u64 reserved[9];
737};
738
7394.33 KVM_SET_DEBUGREGS
740
741Capability: KVM_CAP_DEBUGREGS
742Architectures: x86
743Type: vm ioctl
744Parameters: struct kvm_debugregs (in)
745Returns: 0 on success, -1 on error
746
747Writes debug registers into the vcpu.
748
749See KVM_GET_DEBUGREGS for the data structure. The flags field is unused
750yet and must be cleared on entry.
751
7524.34 KVM_SET_USER_MEMORY_REGION
753
754Capability: KVM_CAP_USER_MEM
755Architectures: all
756Type: vm ioctl
757Parameters: struct kvm_userspace_memory_region (in)
758Returns: 0 on success, -1 on error
759
760struct kvm_userspace_memory_region {
761 __u32 slot;
762 __u32 flags;
763 __u64 guest_phys_addr;
764 __u64 memory_size; /* bytes */
765 __u64 userspace_addr; /* start of the userspace allocated memory */
766};
767
768/* for kvm_memory_region::flags */
769#define KVM_MEM_LOG_DIRTY_PAGES 1UL
770
771This ioctl allows the user to create or modify a guest physical memory
772slot. When changing an existing slot, it may be moved in the guest
773physical memory space, or its flags may be modified. It may not be
774resized. Slots may not overlap in guest physical address space.
775
776Memory for the region is taken starting at the address denoted by the
777field userspace_addr, which must point at user addressable memory for
778the entire memory slot size. Any object may back this memory, including
779anonymous memory, ordinary files, and hugetlbfs.
780
781It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
782be identical. This allows large pages in the guest to be backed by large
783pages in the host.
784
785The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
786instructs kvm to keep track of writes to memory within the slot. See
787the KVM_GET_DIRTY_LOG ioctl.
788
789When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory
790region are automatically reflected into the guest. For example, an mmap()
791that affects the region will be made visible immediately. Another example
792is madvise(MADV_DROP).
793
794It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl.
795The KVM_SET_MEMORY_REGION does not allow fine grained control over memory
796allocation and is deprecated.
797
7984.35 KVM_SET_TSS_ADDR
799
800Capability: KVM_CAP_SET_TSS_ADDR
801Architectures: x86
802Type: vm ioctl
803Parameters: unsigned long tss_address (in)
804Returns: 0 on success, -1 on error
805
806This ioctl defines the physical address of a three-page region in the guest
807physical address space. The region must be within the first 4GB of the
808guest physical address space and must not conflict with any memory slot
809or any mmio address. The guest may malfunction if it accesses this memory
810region.
811
812This ioctl is required on Intel-based hosts. This is needed on Intel hardware
813because of a quirk in the virtualization implementation (see the internals
814documentation when it pops into existence).
815
8164.36 KVM_ENABLE_CAP
817
818Capability: KVM_CAP_ENABLE_CAP
819Architectures: ppc
820Type: vcpu ioctl
821Parameters: struct kvm_enable_cap (in)
822Returns: 0 on success; -1 on error
823
824+Not all extensions are enabled by default. Using this ioctl the application
825can enable an extension, making it available to the guest.
826
827On systems that do not support this ioctl, it always fails. On systems that
828do support it, it only works for extensions that are supported for enablement.
829
830To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should
831be used.
832
833struct kvm_enable_cap {
834 /* in */
835 __u32 cap;
836
837The capability that is supposed to get enabled.
838
839 __u32 flags;
840
841A bitfield indicating future enhancements. Has to be 0 for now.
842
843 __u64 args[4];
844
845Arguments for enabling a feature. If a feature needs initial values to
846function properly, this is the place to put them.
847
848 __u8 pad[64];
849};
850
8514.37 KVM_GET_MP_STATE
852
853Capability: KVM_CAP_MP_STATE
854Architectures: x86, ia64
855Type: vcpu ioctl
856Parameters: struct kvm_mp_state (out)
857Returns: 0 on success; -1 on error
858
859struct kvm_mp_state {
860 __u32 mp_state;
861};
862
863Returns the vcpu's current "multiprocessing state" (though also valid on
864uniprocessor guests).
865
866Possible values are:
867
868 - KVM_MP_STATE_RUNNABLE: the vcpu is currently running
869 - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP)
870 which has not yet received an INIT signal
871 - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is
872 now ready for a SIPI
873 - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and
874 is waiting for an interrupt
875 - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector
876 accesible via KVM_GET_VCPU_EVENTS)
877
878This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel
879irqchip, the multiprocessing state must be maintained by userspace.
880
8814.38 KVM_SET_MP_STATE
882
883Capability: KVM_CAP_MP_STATE
884Architectures: x86, ia64
885Type: vcpu ioctl
886Parameters: struct kvm_mp_state (in)
887Returns: 0 on success; -1 on error
888
889Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for
890arguments.
891
892This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel
893irqchip, the multiprocessing state must be maintained by userspace.
712 894
7135. The kvm_run structure 8955. The kvm_run structure
714 896
@@ -820,6 +1002,13 @@ executed a memory-mapped I/O instruction which could not be satisfied
820by kvm. The 'data' member contains the written data if 'is_write' is 1002by kvm. The 'data' member contains the written data if 'is_write' is
821true, and should be filled by application code otherwise. 1003true, and should be filled by application code otherwise.
822 1004
1005NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding
1006operations are complete (and guest state is consistent) only after userspace
1007has re-entered the kernel with KVM_RUN. The kernel side will first finish
1008incomplete operations and then check for pending signals. Userspace
1009can re-enter the guest with an unmasked signal pending to complete
1010pending operations.
1011
823 /* KVM_EXIT_HYPERCALL */ 1012 /* KVM_EXIT_HYPERCALL */
824 struct { 1013 struct {
825 __u64 nr; 1014 __u64 nr;
@@ -829,7 +1018,9 @@ true, and should be filled by application code otherwise.
829 __u32 pad; 1018 __u32 pad;
830 } hypercall; 1019 } hypercall;
831 1020
832Unused. 1021Unused. This was once used for 'hypercall to userspace'. To implement
1022such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390).
1023Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO.
833 1024
834 /* KVM_EXIT_TPR_ACCESS */ 1025 /* KVM_EXIT_TPR_ACCESS */
835 struct { 1026 struct {
@@ -870,6 +1061,19 @@ s390 specific.
870 1061
871powerpc specific. 1062powerpc specific.
872 1063
1064 /* KVM_EXIT_OSI */
1065 struct {
1066 __u64 gprs[32];
1067 } osi;
1068
1069MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch
1070hypercalls and exit with this exit struct that contains all the guest gprs.
1071
1072If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall.
1073Userspace can now handle the hypercall and when it's done modify the gprs as
1074necessary. Upon guest entry all guest GPRs will then be replaced by the values
1075in this struct.
1076
873 /* Fix the size of the union. */ 1077 /* Fix the size of the union. */
874 char padding[256]; 1078 char padding[256];
875 }; 1079 };
diff --git a/Documentation/kvm/cpuid.txt b/Documentation/kvm/cpuid.txt
new file mode 100644
index 000000000000..14a12ea92b7f
--- /dev/null
+++ b/Documentation/kvm/cpuid.txt
@@ -0,0 +1,42 @@
1KVM CPUID bits
2Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010
3=====================================================
4
5A guest running on a kvm host, can check some of its features using
6cpuid. This is not always guaranteed to work, since userspace can
7mask-out some, or even all KVM-related cpuid features before launching
8a guest.
9
10KVM cpuid functions are:
11
12function: KVM_CPUID_SIGNATURE (0x40000000)
13returns : eax = 0,
14 ebx = 0x4b4d564b,
15 ecx = 0x564b4d56,
16 edx = 0x4d.
17Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM".
18This function queries the presence of KVM cpuid leafs.
19
20
21function: define KVM_CPUID_FEATURES (0x40000001)
22returns : ebx, ecx, edx = 0
23 eax = and OR'ed group of (1 << flag), where each flags is:
24
25
26flag || value || meaning
27=============================================================================
28KVM_FEATURE_CLOCKSOURCE || 0 || kvmclock available at msrs
29 || || 0x11 and 0x12.
30------------------------------------------------------------------------------
31KVM_FEATURE_NOP_IO_DELAY || 1 || not necessary to perform delays
32 || || on PIO operations.
33------------------------------------------------------------------------------
34KVM_FEATURE_MMU_OP || 2 || deprecated.
35------------------------------------------------------------------------------
36KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs
37 || || 0x4b564d00 and 0x4b564d01
38------------------------------------------------------------------------------
39KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side
40 || || per-cpu warps are expected in
41 || || kvmclock.
42------------------------------------------------------------------------------
diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt
new file mode 100644
index 000000000000..aaed6ab9d7ab
--- /dev/null
+++ b/Documentation/kvm/mmu.txt
@@ -0,0 +1,304 @@
1The x86 kvm shadow mmu
2======================
3
4The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible
5for presenting a standard x86 mmu to the guest, while translating guest
6physical addresses to host physical addresses.
7
8The mmu code attempts to satisfy the following requirements:
9
10- correctness: the guest should not be able to determine that it is running
11 on an emulated mmu except for timing (we attempt to comply
12 with the specification, not emulate the characteristics of
13 a particular implementation such as tlb size)
14- security: the guest must not be able to touch host memory not assigned
15 to it
16- performance: minimize the performance penalty imposed by the mmu
17- scaling: need to scale to large memory and large vcpu guests
18- hardware: support the full range of x86 virtualization hardware
19- integration: Linux memory management code must be in control of guest memory
20 so that swapping, page migration, page merging, transparent
21 hugepages, and similar features work without change
22- dirty tracking: report writes to guest memory to enable live migration
23 and framebuffer-based displays
24- footprint: keep the amount of pinned kernel memory low (most memory
25 should be shrinkable)
26- reliablity: avoid multipage or GFP_ATOMIC allocations
27
28Acronyms
29========
30
31pfn host page frame number
32hpa host physical address
33hva host virtual address
34gfn guest frame number
35gpa guest physical address
36gva guest virtual address
37ngpa nested guest physical address
38ngva nested guest virtual address
39pte page table entry (used also to refer generically to paging structure
40 entries)
41gpte guest pte (referring to gfns)
42spte shadow pte (referring to pfns)
43tdp two dimensional paging (vendor neutral term for NPT and EPT)
44
45Virtual and real hardware supported
46===================================
47
48The mmu supports first-generation mmu hardware, which allows an atomic switch
49of the current paging mode and cr3 during guest entry, as well as
50two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware
51it exposes is the traditional 2/3/4 level x86 mmu, with support for global
52pages, pae, pse, pse36, cr0.wp, and 1GB pages. Work is in progress to support
53exposing NPT capable hardware on NPT capable hosts.
54
55Translation
56===========
57
58The primary job of the mmu is to program the processor's mmu to translate
59addresses for the guest. Different translations are required at different
60times:
61
62- when guest paging is disabled, we translate guest physical addresses to
63 host physical addresses (gpa->hpa)
64- when guest paging is enabled, we translate guest virtual addresses, to
65 guest physical addresses, to host physical addresses (gva->gpa->hpa)
66- when the guest launches a guest of its own, we translate nested guest
67 virtual addresses, to nested guest physical addresses, to guest physical
68 addresses, to host physical addresses (ngva->ngpa->gpa->hpa)
69
70The primary challenge is to encode between 1 and 3 translations into hardware
71that support only 1 (traditional) and 2 (tdp) translations. When the
72number of required translations matches the hardware, the mmu operates in
73direct mode; otherwise it operates in shadow mode (see below).
74
75Memory
76======
77
78Guest memory (gpa) is part of the user address space of the process that is
79using kvm. Userspace defines the translation between guest addresses and user
80addresses (gpa->hva); note that two gpas may alias to the same gva, but not
81vice versa.
82
83These gvas may be backed using any method available to the host: anonymous
84memory, file backed memory, and device memory. Memory might be paged by the
85host at any time.
86
87Events
88======
89
90The mmu is driven by events, some from the guest, some from the host.
91
92Guest generated events:
93- writes to control registers (especially cr3)
94- invlpg/invlpga instruction execution
95- access to missing or protected translations
96
97Host generated events:
98- changes in the gpa->hpa translation (either through gpa->hva changes or
99 through hva->hpa changes)
100- memory pressure (the shrinker)
101
102Shadow pages
103============
104
105The principal data structure is the shadow page, 'struct kvm_mmu_page'. A
106shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A
107shadow page may contain a mix of leaf and nonleaf sptes.
108
109A nonleaf spte allows the hardware mmu to reach the leaf pages and
110is not related to a translation directly. It points to other shadow pages.
111
112A leaf spte corresponds to either one or two translations encoded into
113one paging structure entry. These are always the lowest level of the
114translation stack, with optional higher level translations left to NPT/EPT.
115Leaf ptes point at guest pages.
116
117The following table shows translations encoded by leaf ptes, with higher-level
118translations in parentheses:
119
120 Non-nested guests:
121 nonpaging: gpa->hpa
122 paging: gva->gpa->hpa
123 paging, tdp: (gva->)gpa->hpa
124 Nested guests:
125 non-tdp: ngva->gpa->hpa (*)
126 tdp: (ngva->)ngpa->gpa->hpa
127
128(*) the guest hypervisor will encode the ngva->gpa translation into its page
129 tables if npt is not present
130
131Shadow pages contain the following information:
132 role.level:
133 The level in the shadow paging hierarchy that this shadow page belongs to.
134 1=4k sptes, 2=2M sptes, 3=1G sptes, etc.
135 role.direct:
136 If set, leaf sptes reachable from this page are for a linear range.
137 Examples include real mode translation, large guest pages backed by small
138 host pages, and gpa->hpa translations when NPT or EPT is active.
139 The linear range starts at (gfn << PAGE_SHIFT) and its size is determined
140 by role.level (2MB for first level, 1GB for second level, 0.5TB for third
141 level, 256TB for fourth level)
142 If clear, this page corresponds to a guest page table denoted by the gfn
143 field.
144 role.quadrant:
145 When role.cr4_pae=0, the guest uses 32-bit gptes while the host uses 64-bit
146 sptes. That means a guest page table contains more ptes than the host,
147 so multiple shadow pages are needed to shadow one guest page.
148 For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the
149 first or second 512-gpte block in the guest page table. For second-level
150 page tables, each 32-bit gpte is converted to two 64-bit sptes
151 (since each first-level guest page is shadowed by two first-level
152 shadow pages) so role.quadrant takes values in the range 0..3. Each
153 quadrant maps 1GB virtual address space.
154 role.access:
155 Inherited guest access permissions in the form uwx. Note execute
156 permission is positive, not negative.
157 role.invalid:
158 The page is invalid and should not be used. It is a root page that is
159 currently pinned (by a cpu hardware register pointing to it); once it is
160 unpinned it will be destroyed.
161 role.cr4_pae:
162 Contains the value of cr4.pae for which the page is valid (e.g. whether
163 32-bit or 64-bit gptes are in use).
164 role.cr4_nxe:
165 Contains the value of efer.nxe for which the page is valid.
166 role.cr0_wp:
167 Contains the value of cr0.wp for which the page is valid.
168 gfn:
169 Either the guest page table containing the translations shadowed by this
170 page, or the base page frame for linear translations. See role.direct.
171 spt:
172 A pageful of 64-bit sptes containing the translations for this page.
173 Accessed by both kvm and hardware.
174 The page pointed to by spt will have its page->private pointing back
175 at the shadow page structure.
176 sptes in spt point either at guest pages, or at lower-level shadow pages.
177 Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point
178 at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte.
179 The spt array forms a DAG structure with the shadow page as a node, and
180 guest pages as leaves.
181 gfns:
182 An array of 512 guest frame numbers, one for each present pte. Used to
183 perform a reverse map from a pte to a gfn.
184 slot_bitmap:
185 A bitmap containing one bit per memory slot. If the page contains a pte
186 mapping a page from memory slot n, then bit n of slot_bitmap will be set
187 (if a page is aliased among several slots, then it is not guaranteed that
188 all slots will be marked).
189 Used during dirty logging to avoid scanning a shadow page if none if its
190 pages need tracking.
191 root_count:
192 A counter keeping track of how many hardware registers (guest cr3 or
193 pdptrs) are now pointing at the page. While this counter is nonzero, the
194 page cannot be destroyed. See role.invalid.
195 multimapped:
196 Whether there exist multiple sptes pointing at this page.
197 parent_pte/parent_ptes:
198 If multimapped is zero, parent_pte points at the single spte that points at
199 this page's spt. Otherwise, parent_ptes points at a data structure
200 with a list of parent_ptes.
201 unsync:
202 If true, then the translations in this page may not match the guest's
203 translation. This is equivalent to the state of the tlb when a pte is
204 changed but before the tlb entry is flushed. Accordingly, unsync ptes
205 are synchronized when the guest executes invlpg or flushes its tlb by
206 other means. Valid for leaf pages.
207 unsync_children:
208 How many sptes in the page point at pages that are unsync (or have
209 unsynchronized children).
210 unsync_child_bitmap:
211 A bitmap indicating which sptes in spt point (directly or indirectly) at
212 pages that may be unsynchronized. Used to quickly locate all unsychronized
213 pages reachable from a given page.
214
215Reverse map
216===========
217
218The mmu maintains a reverse mapping whereby all ptes mapping a page can be
219reached given its gfn. This is used, for example, when swapping out a page.
220
221Synchronized and unsynchronized pages
222=====================================
223
224The guest uses two events to synchronize its tlb and page tables: tlb flushes
225and page invalidations (invlpg).
226
227A tlb flush means that we need to synchronize all sptes reachable from the
228guest's cr3. This is expensive, so we keep all guest page tables write
229protected, and synchronize sptes to gptes when a gpte is written.
230
231A special case is when a guest page table is reachable from the current
232guest cr3. In this case, the guest is obliged to issue an invlpg instruction
233before using the translation. We take advantage of that by removing write
234protection from the guest page, and allowing the guest to modify it freely.
235We synchronize modified gptes when the guest invokes invlpg. This reduces
236the amount of emulation we have to do when the guest modifies multiple gptes,
237or when the a guest page is no longer used as a page table and is used for
238random guest data.
239
240As a side effect we have to resynchronize all reachable unsynchronized shadow
241pages on a tlb flush.
242
243
244Reaction to events
245==================
246
247- guest page fault (or npt page fault, or ept violation)
248
249This is the most complicated event. The cause of a page fault can be:
250
251 - a true guest fault (the guest translation won't allow the access) (*)
252 - access to a missing translation
253 - access to a protected translation
254 - when logging dirty pages, memory is write protected
255 - synchronized shadow pages are write protected (*)
256 - access to untranslatable memory (mmio)
257
258 (*) not applicable in direct mode
259
260Handling a page fault is performed as follows:
261
262 - if needed, walk the guest page tables to determine the guest translation
263 (gva->gpa or ngpa->gpa)
264 - if permissions are insufficient, reflect the fault back to the guest
265 - determine the host page
266 - if this is an mmio request, there is no host page; call the emulator
267 to emulate the instruction instead
268 - walk the shadow page table to find the spte for the translation,
269 instantiating missing intermediate page tables as necessary
270 - try to unsynchronize the page
271 - if successful, we can let the guest continue and modify the gpte
272 - emulate the instruction
273 - if failed, unshadow the page and let the guest continue
274 - update any translations that were modified by the instruction
275
276invlpg handling:
277
278 - walk the shadow page hierarchy and drop affected translations
279 - try to reinstantiate the indicated translation in the hope that the
280 guest will use it in the near future
281
282Guest control register updates:
283
284- mov to cr3
285 - look up new shadow roots
286 - synchronize newly reachable shadow pages
287
288- mov to cr0/cr4/efer
289 - set up mmu context for new paging mode
290 - look up new shadow roots
291 - synchronize newly reachable shadow pages
292
293Host translation updates:
294
295 - mmu notifier called with updated hva
296 - look up affected sptes through reverse map
297 - drop (or update) translations
298
299Further reading
300===============
301
302- NPT presentation from KVM Forum 2008
303 http://www.linux-kvm.org/wiki/images/c/c8/KvmForum2008%24kdf2008_21.pdf
304
diff --git a/Documentation/laptops/laptop-mode.txt b/Documentation/laptops/laptop-mode.txt
index 2c3c35093023..0bf25eebce94 100644
--- a/Documentation/laptops/laptop-mode.txt
+++ b/Documentation/laptops/laptop-mode.txt
@@ -207,7 +207,7 @@ Tips & Tricks
207* Drew Scott Daniels observed: "I don't know why, but when I decrease the number 207* Drew Scott Daniels observed: "I don't know why, but when I decrease the number
208 of colours that my display uses it consumes less battery power. I've seen 208 of colours that my display uses it consumes less battery power. I've seen
209 this on powerbooks too. I hope that this is a piece of information that 209 this on powerbooks too. I hope that this is a piece of information that
210 might be useful to the Laptop Mode patch or it's users." 210 might be useful to the Laptop Mode patch or its users."
211 211
212* In syslog.conf, you can prefix entries with a dash ``-'' to omit syncing the 212* In syslog.conf, you can prefix entries with a dash ``-'' to omit syncing the
213 file after every logging. When you're using laptop-mode and your disk doesn't 213 file after every logging. When you're using laptop-mode and your disk doesn't
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt
index 39c0a09d0105..fc15538d8b46 100644
--- a/Documentation/laptops/thinkpad-acpi.txt
+++ b/Documentation/laptops/thinkpad-acpi.txt
@@ -292,13 +292,13 @@ sysfs notes:
292 292
293 Warning: when in NVRAM mode, the volume up/down/mute 293 Warning: when in NVRAM mode, the volume up/down/mute
294 keys are synthesized according to changes in the mixer, 294 keys are synthesized according to changes in the mixer,
295 so you have to use volume up or volume down to unmute, 295 which uses a single volume up or volume down hotkey
296 as per the ThinkPad volume mixer user interface. When 296 press to unmute, as per the ThinkPad volume mixer user
297 in ACPI event mode, volume up/down/mute are reported as 297 interface. When in ACPI event mode, volume up/down/mute
298 separate events, but this behaviour may be corrected in 298 events are reported by the firmware and can behave
299 future releases of this driver, in which case the 299 differently (and that behaviour changes with firmware
300 ThinkPad volume mixer user interface semantics will be 300 version -- not just with firmware models -- as well as
301 enforced. 301 OSI(Linux) state).
302 302
303 hotkey_poll_freq: 303 hotkey_poll_freq:
304 frequency in Hz for hot key polling. It must be between 304 frequency in Hz for hot key polling. It must be between
@@ -309,7 +309,7 @@ sysfs notes:
309 will cause hot key presses that require NVRAM polling 309 will cause hot key presses that require NVRAM polling
310 to never be reported. 310 to never be reported.
311 311
312 Setting hotkey_poll_freq too low will cause repeated 312 Setting hotkey_poll_freq too low may cause repeated
313 pressings of the same hot key to be misreported as a 313 pressings of the same hot key to be misreported as a
314 single key press, or to not even be detected at all. 314 single key press, or to not even be detected at all.
315 The recommended polling frequency is 10Hz. 315 The recommended polling frequency is 10Hz.
@@ -397,6 +397,7 @@ ACPI Scan
397event code Key Notes 397event code Key Notes
398 398
3990x1001 0x00 FN+F1 - 3990x1001 0x00 FN+F1 -
400
4000x1002 0x01 FN+F2 IBM: battery (rare) 4010x1002 0x01 FN+F2 IBM: battery (rare)
401 Lenovo: Screen lock 402 Lenovo: Screen lock
402 403
@@ -404,7 +405,8 @@ event code Key Notes
404 this hot key, even with hot keys 405 this hot key, even with hot keys
405 disabled or with Fn+F3 masked 406 disabled or with Fn+F3 masked
406 off 407 off
407 IBM: screen lock 408 IBM: screen lock, often turns
409 off the ThinkLight as side-effect
408 Lenovo: battery 410 Lenovo: battery
409 411
4100x1004 0x03 FN+F4 Sleep button (ACPI sleep button 4120x1004 0x03 FN+F4 Sleep button (ACPI sleep button
@@ -433,7 +435,8 @@ event code Key Notes
433 Do you feel lucky today? 435 Do you feel lucky today?
434 436
4350x1008 0x07 FN+F8 IBM: toggle screen expand 4370x1008 0x07 FN+F8 IBM: toggle screen expand
436 Lenovo: configure UltraNav 438 Lenovo: configure UltraNav,
439 or toggle screen expand
437 440
4380x1009 0x08 FN+F9 - 4410x1009 0x08 FN+F9 -
439 .. .. .. 442 .. .. ..
@@ -444,7 +447,7 @@ event code Key Notes
444 either through the ACPI event, 447 either through the ACPI event,
445 or through a hotkey event. 448 or through a hotkey event.
446 The firmware may refuse to 449 The firmware may refuse to
447 generate further FN+F4 key 450 generate further FN+F12 key
448 press events until a S3 or S4 451 press events until a S3 or S4
449 ACPI sleep cycle is performed, 452 ACPI sleep cycle is performed,
450 or some time passes. 453 or some time passes.
@@ -512,15 +515,19 @@ events for switches:
512SW_RFKILL_ALL T60 and later hardware rfkill rocker switch 515SW_RFKILL_ALL T60 and later hardware rfkill rocker switch
513SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A 516SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A
514 517
515Non hot-key ACPI HKEY event map: 518Non hotkey ACPI HKEY event map:
519-------------------------------
520
521Events that are not propagated by the driver, except for legacy
522compatibility purposes when hotkey_report_mode is set to 1:
523
5160x5001 Lid closed 5240x5001 Lid closed
5170x5002 Lid opened 5250x5002 Lid opened
5180x5009 Tablet swivel: switched to tablet mode 5260x5009 Tablet swivel: switched to tablet mode
5190x500A Tablet swivel: switched to normal mode 5270x500A Tablet swivel: switched to normal mode
5200x7000 Radio Switch may have changed state 5280x7000 Radio Switch may have changed state
521 529
522The above events are not propagated by the driver, except for legacy 530Events that are never propagated by the driver:
523compatibility purposes when hotkey_report_mode is set to 1.
524 531
5250x2304 System is waking up from suspend to undock 5320x2304 System is waking up from suspend to undock
5260x2305 System is waking up from suspend to eject bay 5330x2305 System is waking up from suspend to eject bay
@@ -528,14 +535,39 @@ compatibility purposes when hotkey_report_mode is set to 1.
5280x2405 System is waking up from hibernation to eject bay 5350x2405 System is waking up from hibernation to eject bay
5290x5010 Brightness level changed/control event 5360x5010 Brightness level changed/control event
530 537
531The above events are never propagated by the driver. 538Events that are propagated by the driver to userspace:
532 539
5400x2313 ALARM: System is waking up from suspend because
541 the battery is nearly empty
5420x2413 ALARM: System is waking up from hibernation because
543 the battery is nearly empty
5330x3003 Bay ejection (see 0x2x05) complete, can sleep again 5440x3003 Bay ejection (see 0x2x05) complete, can sleep again
5450x3006 Bay hotplug request (hint to power up SATA link when
546 the optical drive tray is ejected)
5340x4003 Undocked (see 0x2x04), can sleep again 5470x4003 Undocked (see 0x2x04), can sleep again
5350x500B Tablet pen inserted into its storage bay 5480x500B Tablet pen inserted into its storage bay
5360x500C Tablet pen removed from its storage bay 5490x500C Tablet pen removed from its storage bay
537 5500x6011 ALARM: battery is too hot
538The above events are propagated by the driver. 5510x6012 ALARM: battery is extremely hot
5520x6021 ALARM: a sensor is too hot
5530x6022 ALARM: a sensor is extremely hot
5540x6030 System thermal table changed
555
556Battery nearly empty alarms are a last resort attempt to get the
557operating system to hibernate or shutdown cleanly (0x2313), or shutdown
558cleanly (0x2413) before power is lost. They must be acted upon, as the
559wake up caused by the firmware will have negated most safety nets...
560
561When any of the "too hot" alarms happen, according to Lenovo the user
562should suspend or hibernate the laptop (and in the case of battery
563alarms, unplug the AC adapter) to let it cool down. These alarms do
564signal that something is wrong, they should never happen on normal
565operating conditions.
566
567The "extremely hot" alarms are emergencies. According to Lenovo, the
568operating system is to force either an immediate suspend or hibernate
569cycle, or a system shutdown. Obviously, something is very wrong if this
570happens.
539 571
540Compatibility notes: 572Compatibility notes:
541 573
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 3119f5db75bd..e9ce3c554514 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -263,7 +263,7 @@ static u8 *get_feature_bits(struct device *dev)
263 * Launcher virtual with an offset. 263 * Launcher virtual with an offset.
264 * 264 *
265 * This can be tough to get your head around, but usually it just means that we 265 * This can be tough to get your head around, but usually it just means that we
266 * use these trivial conversion functions when the Guest gives us it's 266 * use these trivial conversion functions when the Guest gives us its
267 * "physical" addresses: 267 * "physical" addresses:
268 */ 268 */
269static void *from_guest_phys(unsigned long addr) 269static void *from_guest_phys(unsigned long addr)
diff --git a/Documentation/md.txt b/Documentation/md.txt
index 188f4768f1d5..e4e893ef3e01 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -136,7 +136,7 @@ raid_disks != 0.
136 136
137Then uninitialized devices can be added with ADD_NEW_DISK. The 137Then uninitialized devices can be added with ADD_NEW_DISK. The
138structure passed to ADD_NEW_DISK must specify the state of the device 138structure passed to ADD_NEW_DISK must specify the state of the device
139and it's role in the array. 139and its role in the array.
140 140
141Once started with RUN_ARRAY, uninitialized spares can be added with 141Once started with RUN_ARRAY, uninitialized spares can be added with
142HOT_ADD_DISK. 142HOT_ADD_DISK.
diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt
index aa60d1f627e5..c91ccc0720fa 100644
--- a/Documentation/mutex-design.txt
+++ b/Documentation/mutex-design.txt
@@ -66,14 +66,14 @@ of advantages of mutexes:
66 66
67 c0377ccb <mutex_lock>: 67 c0377ccb <mutex_lock>:
68 c0377ccb: f0 ff 08 lock decl (%eax) 68 c0377ccb: f0 ff 08 lock decl (%eax)
69 c0377cce: 78 0e js c0377cde <.text.lock.mutex> 69 c0377cce: 78 0e js c0377cde <.text..lock.mutex>
70 c0377cd0: c3 ret 70 c0377cd0: c3 ret
71 71
72 the unlocking fastpath is equally tight: 72 the unlocking fastpath is equally tight:
73 73
74 c0377cd1 <mutex_unlock>: 74 c0377cd1 <mutex_unlock>:
75 c0377cd1: f0 ff 00 lock incl (%eax) 75 c0377cd1: f0 ff 00 lock incl (%eax)
76 c0377cd4: 7e 0f jle c0377ce5 <.text.lock.mutex+0x7> 76 c0377cd4: 7e 0f jle c0377ce5 <.text..lock.mutex+0x7>
77 c0377cd6: c3 ret 77 c0377cd6: c3 ret
78 78
79 - 'struct mutex' semantics are well-defined and are enforced if 79 - 'struct mutex' semantics are well-defined and are enforced if
diff --git a/Documentation/netlabel/lsm_interface.txt b/Documentation/netlabel/lsm_interface.txt
index 98dd9f7430f2..638c74f7de7f 100644
--- a/Documentation/netlabel/lsm_interface.txt
+++ b/Documentation/netlabel/lsm_interface.txt
@@ -38,7 +38,7 @@ Depending on the exact configuration, translation between the network packet
38label and the internal LSM security identifier can be time consuming. The 38label and the internal LSM security identifier can be time consuming. The
39NetLabel label mapping cache is a caching mechanism which can be used to 39NetLabel label mapping cache is a caching mechanism which can be used to
40sidestep much of this overhead once a mapping has been established. Once the 40sidestep much of this overhead once a mapping has been established. Once the
41LSM has received a packet, used NetLabel to decode it's security attributes, 41LSM has received a packet, used NetLabel to decode its security attributes,
42and translated the security attributes into a LSM internal identifier the LSM 42and translated the security attributes into a LSM internal identifier the LSM
43can use the NetLabel caching functions to associate the LSM internal 43can use the NetLabel caching functions to associate the LSM internal
44identifier with the network packet's label. This means that in the future 44identifier with the network packet's label. This means that in the future
diff --git a/Documentation/networking/caif/Linux-CAIF.txt b/Documentation/networking/caif/Linux-CAIF.txt
new file mode 100644
index 000000000000..7fe7a9a33a4f
--- /dev/null
+++ b/Documentation/networking/caif/Linux-CAIF.txt
@@ -0,0 +1,212 @@
1Linux CAIF
2===========
3copyright (C) ST-Ericsson AB 2010
4Author: Sjur Brendeland/ sjur.brandeland@stericsson.com
5License terms: GNU General Public License (GPL) version 2
6
7
8Introduction
9------------
10CAIF is a MUX protocol used by ST-Ericsson cellular modems for
11communication between Modem and host. The host processes can open virtual AT
12channels, initiate GPRS Data connections, Video channels and Utility Channels.
13The Utility Channels are general purpose pipes between modem and host.
14
15ST-Ericsson modems support a number of transports between modem
16and host. Currently, UART and Loopback are available for Linux.
17
18
19Architecture:
20------------
21The implementation of CAIF is divided into:
22* CAIF Socket Layer, Kernel API, and Net Device.
23* CAIF Core Protocol Implementation
24* CAIF Link Layer, implemented as NET devices.
25
26
27 RTNL
28 !
29 ! +------+ +------+ +------+
30 ! +------+! +------+! +------+!
31 ! ! Sock !! !Kernel!! ! Net !!
32 ! ! API !+ ! API !+ ! Dev !+ <- CAIF Client APIs
33 ! +------+ +------! +------+
34 ! ! ! !
35 ! +----------!----------+
36 ! +------+ <- CAIF Protocol Implementation
37 +-------> ! CAIF !
38 ! Core !
39 +------+
40 +--------!--------+
41 ! !
42 +------+ +-----+
43 ! ! ! TTY ! <- Link Layer (Net Devices)
44 +------+ +-----+
45
46
47Using the Kernel API
48----------------------
49The Kernel API is used for accessing CAIF channels from the
50kernel.
51The user of the API has to implement two callbacks for receive
52and control.
53The receive callback gives a CAIF packet as a SKB. The control
54callback will
55notify of channel initialization complete, and flow-on/flow-
56off.
57
58
59 struct caif_device caif_dev = {
60 .caif_config = {
61 .name = "MYDEV"
62 .type = CAIF_CHTY_AT
63 }
64 .receive_cb = my_receive,
65 .control_cb = my_control,
66 };
67 caif_add_device(&caif_dev);
68 caif_transmit(&caif_dev, skb);
69
70See the caif_kernel.h for details about the CAIF kernel API.
71
72
73I M P L E M E N T A T I O N
74===========================
75===========================
76
77CAIF Core Protocol Layer
78=========================================
79
80CAIF Core layer implements the CAIF protocol as defined by ST-Ericsson.
81It implements the CAIF protocol stack in a layered approach, where
82each layer described in the specification is implemented as a separate layer.
83The architecture is inspired by the design patterns "Protocol Layer" and
84"Protocol Packet".
85
86== CAIF structure ==
87The Core CAIF implementation contains:
88 - Simple implementation of CAIF.
89 - Layered architecture (a la Streams), each layer in the CAIF
90 specification is implemented in a separate c-file.
91 - Clients must implement PHY layer to access physical HW
92 with receive and transmit functions.
93 - Clients must call configuration function to add PHY layer.
94 - Clients must implement CAIF layer to consume/produce
95 CAIF payload with receive and transmit functions.
96 - Clients must call configuration function to add and connect the
97 Client layer.
98 - When receiving / transmitting CAIF Packets (cfpkt), ownership is passed
99 to the called function (except for framing layers' receive functions
100 or if a transmit function returns an error, in which case the caller
101 must free the packet).
102
103Layered Architecture
104--------------------
105The CAIF protocol can be divided into two parts: Support functions and Protocol
106Implementation. The support functions include:
107
108 - CFPKT CAIF Packet. Implementation of CAIF Protocol Packet. The
109 CAIF Packet has functions for creating, destroying and adding content
110 and for adding/extracting header and trailers to protocol packets.
111
112 - CFLST CAIF list implementation.
113
114 - CFGLUE CAIF Glue. Contains OS Specifics, such as memory
115 allocation, endianness, etc.
116
117The CAIF Protocol implementation contains:
118
119 - CFCNFG CAIF Configuration layer. Configures the CAIF Protocol
120 Stack and provides a Client interface for adding Link-Layer and
121 Driver interfaces on top of the CAIF Stack.
122
123 - CFCTRL CAIF Control layer. Encodes and Decodes control messages
124 such as enumeration and channel setup. Also matches request and
125 response messages.
126
127 - CFSERVL General CAIF Service Layer functionality; handles flow
128 control and remote shutdown requests.
129
130 - CFVEI CAIF VEI layer. Handles CAIF AT Channels on VEI (Virtual
131 External Interface). This layer encodes/decodes VEI frames.
132
133 - CFDGML CAIF Datagram layer. Handles CAIF Datagram layer (IP
134 traffic), encodes/decodes Datagram frames.
135
136 - CFMUX CAIF Mux layer. Handles multiplexing between multiple
137 physical bearers and multiple channels such as VEI, Datagram, etc.
138 The MUX keeps track of the existing CAIF Channels and
139 Physical Instances and selects the apropriate instance based
140 on Channel-Id and Physical-ID.
141
142 - CFFRML CAIF Framing layer. Handles Framing i.e. Frame length
143 and frame checksum.
144
145 - CFSERL CAIF Serial layer. Handles concatenation/split of frames
146 into CAIF Frames with correct length.
147
148
149
150 +---------+
151 | Config |
152 | CFCNFG |
153 +---------+
154 !
155 +---------+ +---------+ +---------+
156 | AT | | Control | | Datagram|
157 | CFVEIL | | CFCTRL | | CFDGML |
158 +---------+ +---------+ +---------+
159 \_____________!______________/
160 !
161 +---------+
162 | MUX |
163 | |
164 +---------+
165 _____!_____
166 / \
167 +---------+ +---------+
168 | CFFRML | | CFFRML |
169 | Framing | | Framing |
170 +---------+ +---------+
171 ! !
172 +---------+ +---------+
173 | | | Serial |
174 | | | CFSERL |
175 +---------+ +---------+
176
177
178In this layered approach the following "rules" apply.
179 - All layers embed the same structure "struct cflayer"
180 - A layer does not depend on any other layer's private data.
181 - Layers are stacked by setting the pointers
182 layer->up , layer->dn
183 - In order to send data upwards, each layer should do
184 layer->up->receive(layer->up, packet);
185 - In order to send data downwards, each layer should do
186 layer->dn->transmit(layer->dn, packet);
187
188
189Linux Driver Implementation
190===========================
191
192Linux GPRS Net Device and CAIF socket are implemented on top of the
193CAIF Core protocol. The Net device and CAIF socket have an instance of
194'struct cflayer', just like the CAIF Core protocol stack.
195Net device and Socket implement the 'receive()' function defined by
196'struct cflayer', just like the rest of the CAIF stack. In this way, transmit and
197receive of packets is handled as by the rest of the layers: the 'dn->transmit()'
198function is called in order to transmit data.
199
200The layer on top of the CAIF Core implementation is
201sometimes referred to as the "Client layer".
202
203
204Configuration of Link Layer
205---------------------------
206The Link Layer is implemented as Linux net devices (struct net_device).
207Payload handling and registration is done using standard Linux mechanisms.
208
209The CAIF Protocol relies on a loss-less link layer without implementing
210retransmission. This implies that packet drops must not happen.
211Therefore a flow-control mechanism is implemented where the physical
212interface can initiate flow stop for all CAIF Channels.
diff --git a/Documentation/networking/caif/README b/Documentation/networking/caif/README
new file mode 100644
index 000000000000..757ccfaa1385
--- /dev/null
+++ b/Documentation/networking/caif/README
@@ -0,0 +1,109 @@
1Copyright (C) ST-Ericsson AB 2010
2Author: Sjur Brendeland/ sjur.brandeland@stericsson.com
3License terms: GNU General Public License (GPL) version 2
4---------------------------------------------------------
5
6=== Start ===
7If you have compiled CAIF for modules do:
8
9$modprobe crc_ccitt
10$modprobe caif
11$modprobe caif_socket
12$modprobe chnl_net
13
14
15=== Preparing the setup with a STE modem ===
16
17If you are working on integration of CAIF you should make sure
18that the kernel is built with module support.
19
20There are some things that need to be tweaked to get the host TTY correctly
21set up to talk to the modem.
22Since the CAIF stack is running in the kernel and we want to use the existing
23TTY, we are installing our physical serial driver as a line discipline above
24the TTY device.
25
26To achieve this we need to install the N_CAIF ldisc from user space.
27The benefit is that we can hook up to any TTY.
28
29The use of Start-of-frame-extension (STX) must also be set as
30module parameter "ser_use_stx".
31
32Normally Frame Checksum is always used on UART, but this is also provided as a
33module parameter "ser_use_fcs".
34
35$ modprobe caif_serial ser_ttyname=/dev/ttyS0 ser_use_stx=yes
36$ ifconfig caif_ttyS0 up
37
38PLEASE NOTE: There is a limitation in Android shell.
39 It only accepts one argument to insmod/modprobe!
40
41=== Trouble shooting ===
42
43There are debugfs parameters provided for serial communication.
44/sys/kernel/debug/caif_serial/<tty-name>/
45
46* ser_state: Prints the bit-mask status where
47 - 0x02 means SENDING, this is a transient state.
48 - 0x10 means FLOW_OFF_SENT, i.e. the previous frame has not been sent
49 and is blocking further send operation. Flow OFF has been propagated
50 to all CAIF Channels using this TTY.
51
52* tty_status: Prints the bit-mask tty status information
53 - 0x01 - tty->warned is on.
54 - 0x02 - tty->low_latency is on.
55 - 0x04 - tty->packed is on.
56 - 0x08 - tty->flow_stopped is on.
57 - 0x10 - tty->hw_stopped is on.
58 - 0x20 - tty->stopped is on.
59
60* last_tx_msg: Binary blob Prints the last transmitted frame.
61 This can be printed with
62 $od --format=x1 /sys/kernel/debug/caif_serial/<tty>/last_rx_msg.
63 The first two tx messages sent look like this. Note: The initial
64 byte 02 is start of frame extension (STX) used for re-syncing
65 upon errors.
66
67 - Enumeration:
68 0000000 02 05 00 00 03 01 d2 02
69 | | | | | |
70 STX(1) | | | |
71 Length(2)| | |
72 Control Channel(1)
73 Command:Enumeration(1)
74 Link-ID(1)
75 Checksum(2)
76 - Channel Setup:
77 0000000 02 07 00 00 00 21 a1 00 48 df
78 | | | | | | | |
79 STX(1) | | | | | |
80 Length(2)| | | | |
81 Control Channel(1)
82 Command:Channel Setup(1)
83 Channel Type(1)
84 Priority and Link-ID(1)
85 Endpoint(1)
86 Checksum(2)
87
88* last_rx_msg: Prints the last transmitted frame.
89 The RX messages for LinkSetup look almost identical but they have the
90 bit 0x20 set in the command bit, and Channel Setup has added one byte
91 before Checksum containing Channel ID.
92 NOTE: Several CAIF Messages might be concatenated. The maximum debug
93 buffer size is 128 bytes.
94
95== Error Scenarios:
96- last_tx_msg contains channel setup message and last_rx_msg is empty ->
97 The host seems to be able to send over the UART, at least the CAIF ldisc get
98 notified that sending is completed.
99
100- last_tx_msg contains enumeration message and last_rx_msg is empty ->
101 The host is not able to send the message from UART, the tty has not been
102 able to complete the transmit operation.
103
104- if /sys/kernel/debug/caif_serial/<tty>/tty_status is non-zero there
105 might be problems transmitting over UART.
106 E.g. host and modem wiring is not correct you will typically see
107 tty_status = 0x10 (hw_stopped) and ser_state = 0x10 (FLOW_OFF_SENT).
108 You will probably see the enumeration message in last_tx_message
109 and empty last_rx_message.
diff --git a/Documentation/networking/ifenslave.c b/Documentation/networking/ifenslave.c
index 1b96ccda3836..2bac9618c345 100644
--- a/Documentation/networking/ifenslave.c
+++ b/Documentation/networking/ifenslave.c
@@ -756,7 +756,7 @@ static int enslave(char *master_ifname, char *slave_ifname)
756 */ 756 */
757 if (abi_ver < 1) { 757 if (abi_ver < 1) {
758 /* For old ABI, the master needs to be 758 /* For old ABI, the master needs to be
759 * down before setting it's hwaddr 759 * down before setting its hwaddr
760 */ 760 */
761 res = set_if_down(master_ifname, master_flags.ifr_flags); 761 res = set_if_down(master_ifname, master_flags.ifr_flags);
762 if (res) { 762 if (res) {
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 8b72c88ba213..d0536b5a4e01 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -588,6 +588,37 @@ ip_local_port_range - 2 INTEGERS
588 (i.e. by default) range 1024-4999 is enough to issue up to 588 (i.e. by default) range 1024-4999 is enough to issue up to
589 2000 connections per second to systems supporting timestamps. 589 2000 connections per second to systems supporting timestamps.
590 590
591ip_local_reserved_ports - list of comma separated ranges
592 Specify the ports which are reserved for known third-party
593 applications. These ports will not be used by automatic port
594 assignments (e.g. when calling connect() or bind() with port
595 number 0). Explicit port allocation behavior is unchanged.
596
597 The format used for both input and output is a comma separated
598 list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and
599 10). Writing to the file will clear all previously reserved
600 ports and update the current list with the one given in the
601 input.
602
603 Note that ip_local_port_range and ip_local_reserved_ports
604 settings are independent and both are considered by the kernel
605 when determining which ports are available for automatic port
606 assignments.
607
608 You can reserve ports which are not in the current
609 ip_local_port_range, e.g.:
610
611 $ cat /proc/sys/net/ipv4/ip_local_port_range
612 32000 61000
613 $ cat /proc/sys/net/ipv4/ip_local_reserved_ports
614 8080,9148
615
616 although this is redundant. However such a setting is useful
617 if later the port range is changed to a value that will
618 include the reserved ports.
619
620 Default: Empty
621
591ip_nonlocal_bind - BOOLEAN 622ip_nonlocal_bind - BOOLEAN
592 If set, allows processes to bind() to non-local IP addresses, 623 If set, allows processes to bind() to non-local IP addresses,
593 which can be quite useful - but may break some applications. 624 which can be quite useful - but may break some applications.
diff --git a/Documentation/networking/l2tp.txt b/Documentation/networking/l2tp.txt
index 63214b280e00..e7bf3979facb 100644
--- a/Documentation/networking/l2tp.txt
+++ b/Documentation/networking/l2tp.txt
@@ -1,44 +1,95 @@
1This brief document describes how to use the kernel's PPPoL2TP driver 1This document describes how to use the kernel's L2TP drivers to
2to provide L2TP functionality. L2TP is a protocol that tunnels one or 2provide L2TP functionality. L2TP is a protocol that tunnels one or
3more PPP sessions over a UDP tunnel. It is commonly used for VPNs 3more sessions over an IP tunnel. It is commonly used for VPNs
4(L2TP/IPSec) and by ISPs to tunnel subscriber PPP sessions over an IP 4(L2TP/IPSec) and by ISPs to tunnel subscriber PPP sessions over an IP
5network infrastructure. 5network infrastructure. With L2TPv3, it is also useful as a Layer-2
6tunneling infrastructure.
7
8Features
9========
10
11L2TPv2 (PPP over L2TP (UDP tunnels)).
12L2TPv3 ethernet pseudowires.
13L2TPv3 PPP pseudowires.
14L2TPv3 IP encapsulation.
15Netlink sockets for L2TPv3 configuration management.
16
17History
18=======
19
20The original pppol2tp driver was introduced in 2.6.23 and provided
21L2TPv2 functionality (rfc2661). L2TPv2 is used to tunnel one or more PPP
22sessions over a UDP tunnel.
23
24L2TPv3 (rfc3931) changes the protocol to allow different frame types
25to be passed over an L2TP tunnel by moving the PPP-specific parts of
26the protocol out of the core L2TP packet headers. Each frame type is
27known as a pseudowire type. Ethernet, PPP, HDLC, Frame Relay and ATM
28pseudowires for L2TP are defined in separate RFC standards. Another
29change for L2TPv3 is that it can be carried directly over IP with no
30UDP header (UDP is optional). It is also possible to create static
31unmanaged L2TPv3 tunnels manually without a control protocol
32(userspace daemon) to manage them.
33
34To support L2TPv3, the original pppol2tp driver was split up to
35separate the L2TP and PPP functionality. Existing L2TPv2 userspace
36apps should be unaffected as the original pppol2tp sockets API is
37retained. L2TPv3, however, uses netlink to manage L2TPv3 tunnels and
38sessions.
6 39
7Design 40Design
8====== 41======
9 42
10The PPPoL2TP driver, drivers/net/pppol2tp.c, provides a mechanism by 43The L2TP protocol separates control and data frames. The L2TP kernel
11which PPP frames carried through an L2TP session are passed through 44drivers handle only L2TP data frames; control frames are always
12the kernel's PPP subsystem. The standard PPP daemon, pppd, handles all 45handled by userspace. L2TP control frames carry messages between L2TP
13PPP interaction with the peer. PPP network interfaces are created for 46clients/servers and are used to setup / teardown tunnels and
14each local PPP endpoint. 47sessions. An L2TP client or server is implemented in userspace.
15 48
16The L2TP protocol http://www.faqs.org/rfcs/rfc2661.html defines L2TP 49Each L2TP tunnel is implemented using a UDP or L2TPIP socket; L2TPIP
17control and data frames. L2TP control frames carry messages between 50provides L2TPv3 IP encapsulation (no UDP) and is implemented using a
18L2TP clients/servers and are used to setup / teardown tunnels and 51new l2tpip socket family. The tunnel socket is typically created by
19sessions. An L2TP client or server is implemented in userspace and 52userspace, though for unmanaged L2TPv3 tunnels, the socket can also be
20will use a regular UDP socket per tunnel. L2TP data frames carry PPP 53created by the kernel. Each L2TP session (pseudowire) gets a network
21frames, which may be PPP control or PPP data. The kernel's PPP 54interface instance. In the case of PPP, these interfaces are created
55indirectly by pppd using a pppol2tp socket. In the case of ethernet,
56the netdevice is created upon a netlink request to create an L2TPv3
57ethernet pseudowire.
58
59For PPP, the PPPoL2TP driver, net/l2tp/l2tp_ppp.c, provides a
60mechanism by which PPP frames carried through an L2TP session are
61passed through the kernel's PPP subsystem. The standard PPP daemon,
62pppd, handles all PPP interaction with the peer. PPP network
63interfaces are created for each local PPP endpoint. The kernel's PPP
22subsystem arranges for PPP control frames to be delivered to pppd, 64subsystem arranges for PPP control frames to be delivered to pppd,
23while data frames are forwarded as usual. 65while data frames are forwarded as usual.
24 66
67For ethernet, the L2TPETH driver, net/l2tp/l2tp_eth.c, implements a
68netdevice driver, managing virtual ethernet devices, one per
69pseudowire. These interfaces can be managed using standard Linux tools
70such as "ip" and "ifconfig". If only IP frames are passed over the
71tunnel, the interface can be given an IP addresses of itself and its
72peer. If non-IP frames are to be passed over the tunnel, the interface
73can be added to a bridge using brctl. All L2TP datapath protocol
74functions are handled by the L2TP core driver.
75
25Each tunnel and session within a tunnel is assigned a unique tunnel_id 76Each tunnel and session within a tunnel is assigned a unique tunnel_id
26and session_id. These ids are carried in the L2TP header of every 77and session_id. These ids are carried in the L2TP header of every
27control and data packet. The pppol2tp driver uses them to lookup 78control and data packet. (Actually, in L2TPv3, the tunnel_id isn't
28internal tunnel and/or session contexts. Zero tunnel / session ids are 79present in data frames - it is inferred from the IP connection on
29treated specially - zero ids are never assigned to tunnels or sessions 80which the packet was received.) The L2TP driver uses the ids to lookup
30in the network. In the driver, the tunnel context keeps a pointer to 81internal tunnel and/or session contexts to determine how to handle the
31the tunnel UDP socket. The session context keeps a pointer to the 82packet. Zero tunnel / session ids are treated specially - zero ids are
32PPPoL2TP socket, as well as other data that lets the driver interface 83never assigned to tunnels or sessions in the network. In the driver,
33to the kernel PPP subsystem. 84the tunnel context keeps a reference to the tunnel UDP or L2TPIP
34 85socket. The session context holds data that lets the driver interface
35Note that the pppol2tp kernel driver handles only L2TP data frames; 86to the kernel's network frame type subsystems, i.e. PPP, ethernet.
36L2TP control frames are simply passed up to userspace in the UDP 87
37tunnel socket. The kernel handles all datapath aspects of the 88Userspace Programming
38protocol, including data packet resequencing (if enabled). 89=====================
39 90
40There are a number of requirements on the userspace L2TP daemon in 91For L2TPv2, there are a number of requirements on the userspace L2TP
41order to use the pppol2tp driver. 92daemon in order to use the pppol2tp driver.
42 93
431. Use a UDP socket per tunnel. 941. Use a UDP socket per tunnel.
44 95
@@ -86,6 +137,35 @@ In addition to the standard PPP ioctls, a PPPIOCGL2TPSTATS is provided
86to retrieve tunnel and session statistics from the kernel using the 137to retrieve tunnel and session statistics from the kernel using the
87PPPoX socket of the appropriate tunnel or session. 138PPPoX socket of the appropriate tunnel or session.
88 139
140For L2TPv3, userspace must use the netlink API defined in
141include/linux/l2tp.h to manage tunnel and session contexts. The
142general procedure to create a new L2TP tunnel with one session is:-
143
1441. Open a GENL socket using L2TP_GENL_NAME for configuring the kernel
145 using netlink.
146
1472. Create a UDP or L2TPIP socket for the tunnel.
148
1493. Create a new L2TP tunnel using a L2TP_CMD_TUNNEL_CREATE
150 request. Set attributes according to desired tunnel parameters,
151 referencing the UDP or L2TPIP socket created in the previous step.
152
1534. Create a new L2TP session in the tunnel using a
154 L2TP_CMD_SESSION_CREATE request.
155
156The tunnel and all of its sessions are closed when the tunnel socket
157is closed. The netlink API may also be used to delete sessions and
158tunnels. Configuration and status info may be set or read using netlink.
159
160The L2TP driver also supports static (unmanaged) L2TPv3 tunnels. These
161are where there is no L2TP control message exchange with the peer to
162setup the tunnel; the tunnel is configured manually at each end of the
163tunnel. There is no need for an L2TP userspace application in this
164case -- the tunnel socket is created by the kernel and configured
165using parameters sent in the L2TP_CMD_TUNNEL_CREATE netlink
166request. The "ip" utility of iproute2 has commands for managing static
167L2TPv3 tunnels; do "ip l2tp help" for more information.
168
89Debugging 169Debugging
90========= 170=========
91 171
@@ -102,6 +182,69 @@ PPPOL2TP_MSG_CONTROL userspace - kernel interface
102PPPOL2TP_MSG_SEQ sequence numbers handling 182PPPOL2TP_MSG_SEQ sequence numbers handling
103PPPOL2TP_MSG_DATA data packets 183PPPOL2TP_MSG_DATA data packets
104 184
185If enabled, files under a l2tp debugfs directory can be used to dump
186kernel state about L2TP tunnels and sessions. To access it, the
187debugfs filesystem must first be mounted.
188
189# mount -t debugfs debugfs /debug
190
191Files under the l2tp directory can then be accessed.
192
193# cat /debug/l2tp/tunnels
194
195The debugfs files should not be used by applications to obtain L2TP
196state information because the file format is subject to change. It is
197implemented to provide extra debug information to help diagnose
198problems.) Users should use the netlink API.
199
200/proc/net/pppol2tp is also provided for backwards compaibility with
201the original pppol2tp driver. It lists information about L2TPv2
202tunnels and sessions only. Its use is discouraged.
203
204Unmanaged L2TPv3 Tunnels
205========================
206
207Some commercial L2TP products support unmanaged L2TPv3 ethernet
208tunnels, where there is no L2TP control protocol; tunnels are
209configured at each side manually. New commands are available in
210iproute2's ip utility to support this.
211
212To create an L2TPv3 ethernet pseudowire between local host 192.168.1.1
213and peer 192.168.1.2, using IP addresses 10.5.1.1 and 10.5.1.2 for the
214tunnel endpoints:-
215
216# modprobe l2tp_eth
217# modprobe l2tp_netlink
218
219# ip l2tp add tunnel tunnel_id 1 peer_tunnel_id 1 udp_sport 5000 \
220 udp_dport 5000 encap udp local 192.168.1.1 remote 192.168.1.2
221# ip l2tp add session tunnel_id 1 session_id 1 peer_session_id 1
222# ifconfig -a
223# ip addr add 10.5.1.2/32 peer 10.5.1.1/32 dev l2tpeth0
224# ifconfig l2tpeth0 up
225
226Choose IP addresses to be the address of a local IP interface and that
227of the remote system. The IP addresses of the l2tpeth0 interface can be
228anything suitable.
229
230Repeat the above at the peer, with ports, tunnel/session ids and IP
231addresses reversed. The tunnel and session IDs can be any non-zero
23232-bit number, but the values must be reversed at the peer.
233
234Host 1 Host2
235udp_sport=5000 udp_sport=5001
236udp_dport=5001 udp_dport=5000
237tunnel_id=42 tunnel_id=45
238peer_tunnel_id=45 peer_tunnel_id=42
239session_id=128 session_id=5196755
240peer_session_id=5196755 peer_session_id=128
241
242When done at both ends of the tunnel, it should be possible to send
243data over the network. e.g.
244
245# ping 10.5.1.1
246
247
105Sample Userspace Code 248Sample Userspace Code
106===================== 249=====================
107 250
@@ -158,12 +301,48 @@ Sample Userspace Code
158 } 301 }
159 return 0; 302 return 0;
160 303
304Internal Implementation
305=======================
306
307The driver keeps a struct l2tp_tunnel context per L2TP tunnel and a
308struct l2tp_session context for each session. The l2tp_tunnel is
309always associated with a UDP or L2TP/IP socket and keeps a list of
310sessions in the tunnel. The l2tp_session context keeps kernel state
311about the session. It has private data which is used for data specific
312to the session type. With L2TPv2, the session always carried PPP
313traffic. With L2TPv3, the session can also carry ethernet frames
314(ethernet pseudowire) or other data types such as ATM, HDLC or Frame
315Relay.
316
317When a tunnel is first opened, the reference count on the socket is
318increased using sock_hold(). This ensures that the kernel socket
319cannot be removed while L2TP's data structures reference it.
320
321Some L2TP sessions also have a socket (PPP pseudowires) while others
322do not (ethernet pseudowires). We can't use the socket reference count
323as the reference count for session contexts. The L2TP implementation
324therefore has its own internal reference counts on the session
325contexts.
326
327To Do
328=====
329
330Add L2TP tunnel switching support. This would route tunneled traffic
331from one L2TP tunnel into another. Specified in
332http://tools.ietf.org/html/draft-ietf-l2tpext-tunnel-switching-08
333
334Add L2TPv3 VLAN pseudowire support.
335
336Add L2TPv3 IP pseudowire support.
337
338Add L2TPv3 ATM pseudowire support.
339
161Miscellaneous 340Miscellaneous
162============ 341=============
163 342
164The PPPoL2TP driver was developed as part of the OpenL2TP project by 343The L2TP drivers were developed as part of the OpenL2TP project by
165Katalix Systems Ltd. OpenL2TP is a full-featured L2TP client / server, 344Katalix Systems Ltd. OpenL2TP is a full-featured L2TP client / server,
166designed from the ground up to have the L2TP datapath in the 345designed from the ground up to have the L2TP datapath in the
167kernel. The project also implemented the pppol2tp plugin for pppd 346kernel. The project also implemented the pppol2tp plugin for pppd
168which allows pppd to use the kernel driver. Details can be found at 347which allows pppd to use the kernel driver. Details can be found at
169http://openl2tp.sourceforge.net. 348http://www.openl2tp.org.
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index 09ab0d290326..98f71a5cef00 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -100,7 +100,7 @@ by the kernel.
100The destruction of the socket and all associated resources 100The destruction of the socket and all associated resources
101is done by a simple call to close(fd). 101is done by a simple call to close(fd).
102 102
103Next I will describe PACKET_MMAP settings and it's constraints, 103Next I will describe PACKET_MMAP settings and its constraints,
104also the mapping of the circular buffer in the user process and 104also the mapping of the circular buffer in the user process and
105the use of this buffer. 105the use of this buffer.
106 106
@@ -432,7 +432,7 @@ TP_STATUS_LOSING : indicates there were packet drops from last time
432 the PACKET_STATISTICS option. 432 the PACKET_STATISTICS option.
433 433
434TP_STATUS_CSUMNOTREADY: currently it's used for outgoing IP packets which 434TP_STATUS_CSUMNOTREADY: currently it's used for outgoing IP packets which
435 it's checksum will be done in hardware. So while 435 its checksum will be done in hardware. So while
436 reading the packet we should not try to check the 436 reading the packet we should not try to check the
437 checksum. 437 checksum.
438 438
diff --git a/Documentation/networking/x25-iface.txt b/Documentation/networking/x25-iface.txt
index 975cc87ebdd1..78f662ee0622 100644
--- a/Documentation/networking/x25-iface.txt
+++ b/Documentation/networking/x25-iface.txt
@@ -20,23 +20,23 @@ the rest of the skbuff, if any more information does exist.
20Packet Layer to Device Driver 20Packet Layer to Device Driver
21----------------------------- 21-----------------------------
22 22
23First Byte = 0x00 23First Byte = 0x00 (X25_IFACE_DATA)
24 24
25This indicates that the rest of the skbuff contains data to be transmitted 25This indicates that the rest of the skbuff contains data to be transmitted
26over the LAPB link. The LAPB link should already exist before any data is 26over the LAPB link. The LAPB link should already exist before any data is
27passed down. 27passed down.
28 28
29First Byte = 0x01 29First Byte = 0x01 (X25_IFACE_CONNECT)
30 30
31Establish the LAPB link. If the link is already established then the connect 31Establish the LAPB link. If the link is already established then the connect
32confirmation message should be returned as soon as possible. 32confirmation message should be returned as soon as possible.
33 33
34First Byte = 0x02 34First Byte = 0x02 (X25_IFACE_DISCONNECT)
35 35
36Terminate the LAPB link. If it is already disconnected then the disconnect 36Terminate the LAPB link. If it is already disconnected then the disconnect
37confirmation message should be returned as soon as possible. 37confirmation message should be returned as soon as possible.
38 38
39First Byte = 0x03 39First Byte = 0x03 (X25_IFACE_PARAMS)
40 40
41LAPB parameters. To be defined. 41LAPB parameters. To be defined.
42 42
@@ -44,22 +44,22 @@ LAPB parameters. To be defined.
44Device Driver to Packet Layer 44Device Driver to Packet Layer
45----------------------------- 45-----------------------------
46 46
47First Byte = 0x00 47First Byte = 0x00 (X25_IFACE_DATA)
48 48
49This indicates that the rest of the skbuff contains data that has been 49This indicates that the rest of the skbuff contains data that has been
50received over the LAPB link. 50received over the LAPB link.
51 51
52First Byte = 0x01 52First Byte = 0x01 (X25_IFACE_CONNECT)
53 53
54LAPB link has been established. The same message is used for both a LAPB 54LAPB link has been established. The same message is used for both a LAPB
55link connect_confirmation and a connect_indication. 55link connect_confirmation and a connect_indication.
56 56
57First Byte = 0x02 57First Byte = 0x02 (X25_IFACE_DISCONNECT)
58 58
59LAPB link has been terminated. This same message is used for both a LAPB 59LAPB link has been terminated. This same message is used for both a LAPB
60link disconnect_confirmation and a disconnect_indication. 60link disconnect_confirmation and a disconnect_indication.
61 61
62First Byte = 0x03 62First Byte = 0x03 (X25_IFACE_PARAMS)
63 63
64LAPB parameters. To be defined. 64LAPB parameters. To be defined.
65 65
diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt
index c10c022b911c..6fe9001b9263 100644
--- a/Documentation/oops-tracing.txt
+++ b/Documentation/oops-tracing.txt
@@ -256,9 +256,13 @@ characters, each representing a particular tainted value.
256 9: 'A' if the ACPI table has been overridden. 256 9: 'A' if the ACPI table has been overridden.
257 257
258 10: 'W' if a warning has previously been issued by the kernel. 258 10: 'W' if a warning has previously been issued by the kernel.
259 (Though some warnings may set more specific taint flags.)
259 260
260 11: 'C' if a staging driver has been loaded. 261 11: 'C' if a staging driver has been loaded.
261 262
263 12: 'I' if the kernel is working around a severe bug in the platform
264 firmware (BIOS or similar).
265
262The primary reason for the 'Tainted: ' string is to tell kernel 266The primary reason for the 'Tainted: ' string is to tell kernel
263debuggers if this is a clean kernel or if anything unusual has 267debuggers if this is a clean kernel or if anything unusual has
264occurred. Tainting is permanent: even if an offending module is 268occurred. Tainting is permanent: even if an offending module is
diff --git a/Documentation/padata.txt b/Documentation/padata.txt
new file mode 100644
index 000000000000..269d7d0d8335
--- /dev/null
+++ b/Documentation/padata.txt
@@ -0,0 +1,107 @@
1The padata parallel execution mechanism
2Last updated for 2.6.34
3
4Padata is a mechanism by which the kernel can farm work out to be done in
5parallel on multiple CPUs while retaining the ordering of tasks. It was
6developed for use with the IPsec code, which needs to be able to perform
7encryption and decryption on large numbers of packets without reordering
8those packets. The crypto developers made a point of writing padata in a
9sufficiently general fashion that it could be put to other uses as well.
10
11The first step in using padata is to set up a padata_instance structure for
12overall control of how tasks are to be run:
13
14 #include <linux/padata.h>
15
16 struct padata_instance *padata_alloc(const struct cpumask *cpumask,
17 struct workqueue_struct *wq);
18
19The cpumask describes which processors will be used to execute work
20submitted to this instance. The workqueue wq is where the work will
21actually be done; it should be a multithreaded queue, naturally.
22
23There are functions for enabling and disabling the instance:
24
25 void padata_start(struct padata_instance *pinst);
26 void padata_stop(struct padata_instance *pinst);
27
28These functions literally do nothing beyond setting or clearing the
29"padata_start() was called" flag; if that flag is not set, other functions
30will refuse to work.
31
32The list of CPUs to be used can be adjusted with these functions:
33
34 int padata_set_cpumask(struct padata_instance *pinst,
35 cpumask_var_t cpumask);
36 int padata_add_cpu(struct padata_instance *pinst, int cpu);
37 int padata_remove_cpu(struct padata_instance *pinst, int cpu);
38
39Changing the CPU mask has the look of an expensive operation, though, so it
40probably should not be done with great frequency.
41
42Actually submitting work to the padata instance requires the creation of a
43padata_priv structure:
44
45 struct padata_priv {
46 /* Other stuff here... */
47 void (*parallel)(struct padata_priv *padata);
48 void (*serial)(struct padata_priv *padata);
49 };
50
51This structure will almost certainly be embedded within some larger
52structure specific to the work to be done. Most its fields are private to
53padata, but the structure should be zeroed at initialization time, and the
54parallel() and serial() functions should be provided. Those functions will
55be called in the process of getting the work done as we will see
56momentarily.
57
58The submission of work is done with:
59
60 int padata_do_parallel(struct padata_instance *pinst,
61 struct padata_priv *padata, int cb_cpu);
62
63The pinst and padata structures must be set up as described above; cb_cpu
64specifies which CPU will be used for the final callback when the work is
65done; it must be in the current instance's CPU mask. The return value from
66padata_do_parallel() is a little strange; zero is an error return
67indicating that the caller forgot the padata_start() formalities. -EBUSY
68means that somebody, somewhere else is messing with the instance's CPU
69mask, while -EINVAL is a complaint about cb_cpu not being in that CPU mask.
70If all goes well, this function will return -EINPROGRESS, indicating that
71the work is in progress.
72
73Each task submitted to padata_do_parallel() will, in turn, be passed to
74exactly one call to the above-mentioned parallel() function, on one CPU, so
75true parallelism is achieved by submitting multiple tasks. Despite the
76fact that the workqueue is used to make these calls, parallel() is run with
77software interrupts disabled and thus cannot sleep. The parallel()
78function gets the padata_priv structure pointer as its lone parameter;
79information about the actual work to be done is probably obtained by using
80container_of() to find the enclosing structure.
81
82Note that parallel() has no return value; the padata subsystem assumes that
83parallel() will take responsibility for the task from this point. The work
84need not be completed during this call, but, if parallel() leaves work
85outstanding, it should be prepared to be called again with a new job before
86the previous one completes. When a task does complete, parallel() (or
87whatever function actually finishes the job) should inform padata of the
88fact with a call to:
89
90 void padata_do_serial(struct padata_priv *padata);
91
92At some point in the future, padata_do_serial() will trigger a call to the
93serial() function in the padata_priv structure. That call will happen on
94the CPU requested in the initial call to padata_do_parallel(); it, too, is
95done through the workqueue, but with local software interrupts disabled.
96Note that this call may be deferred for a while since the padata code takes
97pains to ensure that tasks are completed in the order in which they were
98submitted.
99
100The one remaining function in the padata API should be called to clean up
101when a padata instance is no longer needed:
102
103 void padata_free(struct padata_instance *pinst);
104
105This function will busy-wait while any remaining tasks are completed, so it
106might be best not to call it while there is work outstanding. Shutting
107down the workqueue, if necessary, should be done separately.
diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt
index 446f43b309df..61bc4e943116 100644
--- a/Documentation/pcmcia/driver-changes.txt
+++ b/Documentation/pcmcia/driver-changes.txt
@@ -1,4 +1,17 @@
1This file details changes in 2.6 which affect PCMCIA card driver authors: 1This file details changes in 2.6 which affect PCMCIA card driver authors:
2* No dev_node_t (as of 2.6.35)
3 There is no more need to fill out a "dev_node_t" structure.
4
5* New IRQ request rules (as of 2.6.35)
6 Instead of the old pcmcia_request_irq() interface, drivers may now
7 choose between:
8 - calling request_irq/free_irq directly. Use the IRQ from *p_dev->irq.
9 - use pcmcia_request_irq(p_dev, handler_t); the PCMCIA core will
10 clean up automatically on calls to pcmcia_disable_device() or
11 device ejection.
12 - drivers still not capable of IRQF_SHARED (or not telling us so) may
13 use the deprecated pcmcia_request_exclusive_irq() for the time
14 being; they might receive a shared IRQ nonetheless.
2 15
3* no cs_error / CS_CHECK / CONFIG_PCMCIA_DEBUG (as of 2.6.33) 16* no cs_error / CS_CHECK / CONFIG_PCMCIA_DEBUG (as of 2.6.33)
4 Instead of the cs_error() callback or the CS_CHECK() macro, please use 17 Instead of the cs_error() callback or the CS_CHECK() macro, please use
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index c9abbd86bc18..57080cd74575 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -1,7 +1,13 @@
1Device Power Management
2
3Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
4Copyright (c) 2010 Alan Stern <stern@rowland.harvard.edu>
5
6
1Most of the code in Linux is device drivers, so most of the Linux power 7Most of the code in Linux is device drivers, so most of the Linux power
2management code is also driver-specific. Most drivers will do very little; 8management (PM) code is also driver-specific. Most drivers will do very
3others, especially for platforms with small batteries (like cell phones), 9little; others, especially for platforms with small batteries (like cell
4will do a lot. 10phones), will do a lot.
5 11
6This writeup gives an overview of how drivers interact with system-wide 12This writeup gives an overview of how drivers interact with system-wide
7power management goals, emphasizing the models and interfaces that are 13power management goals, emphasizing the models and interfaces that are
@@ -15,9 +21,10 @@ Drivers will use one or both of these models to put devices into low-power
15states: 21states:
16 22
17 System Sleep model: 23 System Sleep model:
18 Drivers can enter low power states as part of entering system-wide 24 Drivers can enter low-power states as part of entering system-wide
19 low-power states like "suspend-to-ram", or (mostly for systems with 25 low-power states like "suspend" (also known as "suspend-to-RAM"), or
20 disks) "hibernate" (suspend-to-disk). 26 (mostly for systems with disks) "hibernation" (also known as
27 "suspend-to-disk").
21 28
22 This is something that device, bus, and class drivers collaborate on 29 This is something that device, bus, and class drivers collaborate on
23 by implementing various role-specific suspend and resume methods to 30 by implementing various role-specific suspend and resume methods to
@@ -25,33 +32,41 @@ states:
25 them without loss of data. 32 them without loss of data.
26 33
27 Some drivers can manage hardware wakeup events, which make the system 34 Some drivers can manage hardware wakeup events, which make the system
28 leave that low-power state. This feature may be disabled using the 35 leave the low-power state. This feature may be enabled or disabled
29 relevant /sys/devices/.../power/wakeup file; enabling it may cost some 36 using the relevant /sys/devices/.../power/wakeup file (for Ethernet
30 power usage, but let the whole system enter low power states more often. 37 drivers the ioctl interface used by ethtool may also be used for this
38 purpose); enabling it may cost some power usage, but let the whole
39 system enter low-power states more often.
31 40
32 Runtime Power Management model: 41 Runtime Power Management model:
33 Drivers may also enter low power states while the system is running, 42 Devices may also be put into low-power states while the system is
34 independently of other power management activity. Upstream drivers 43 running, independently of other power management activity in principle.
35 will normally not know (or care) if the device is in some low power 44 However, devices are not generally independent of each other (for
36 state when issuing requests; the driver will auto-resume anything 45 example, a parent device cannot be suspended unless all of its child
37 that's needed when it gets a request. 46 devices have been suspended). Moreover, depending on the bus type the
38 47 device is on, it may be necessary to carry out some bus-specific
39 This doesn't have, or need much infrastructure; it's just something you 48 operations on the device for this purpose. Devices put into low power
40 should do when writing your drivers. For example, clk_disable() unused 49 states at run time may require special handling during system-wide power
41 clocks as part of minimizing power drain for currently-unused hardware. 50 transitions (suspend or hibernation).
42 Of course, sometimes clusters of drivers will collaborate with each 51
43 other, which could involve task-specific power management. 52 For these reasons not only the device driver itself, but also the
44 53 appropriate subsystem (bus type, device type or device class) driver and
45There's not a lot to be said about those low power states except that they 54 the PM core are involved in runtime power management. As in the system
46are very system-specific, and often device-specific. Also, that if enough 55 sleep power management case, they need to collaborate by implementing
47drivers put themselves into low power states (at "runtime"), the effect may be 56 various role-specific suspend and resume methods, so that the hardware
48the same as entering some system-wide low-power state (system sleep) ... and 57 is cleanly powered down and reactivated without data or service loss.
49that synergies exist, so that several drivers using runtime pm might put the 58
50system into a state where even deeper power saving options are available. 59There's not a lot to be said about those low-power states except that they are
51 60very system-specific, and often device-specific. Also, that if enough devices
52Most suspended devices will have quiesced all I/O: no more DMA or irqs, no 61have been put into low-power states (at runtime), the effect may be very similar
53more data read or written, and requests from upstream drivers are no longer 62to entering some system-wide low-power state (system sleep) ... and that
54accepted. A given bus or platform may have different requirements though. 63synergies exist, so that several drivers using runtime PM might put the system
64into a state where even deeper power saving options are available.
65
66Most suspended devices will have quiesced all I/O: no more DMA or IRQs (except
67for wakeup events), no more data read or written, and requests from upstream
68drivers are no longer accepted. A given bus or platform may have different
69requirements though.
55 70
56Examples of hardware wakeup events include an alarm from a real time clock, 71Examples of hardware wakeup events include an alarm from a real time clock,
57network wake-on-LAN packets, keyboard or mouse activity, and media insertion 72network wake-on-LAN packets, keyboard or mouse activity, and media insertion
@@ -60,129 +75,152 @@ or removal (for PCMCIA, MMC/SD, USB, and so on).
60 75
61Interfaces for Entering System Sleep States 76Interfaces for Entering System Sleep States
62=========================================== 77===========================================
63Most of the programming interfaces a device driver needs to know about 78There are programming interfaces provided for subsystems (bus type, device type,
64relate to that first model: entering a system-wide low power state, 79device class) and device drivers to allow them to participate in the power
65rather than just minimizing power consumption by one device. 80management of devices they are concerned with. These interfaces cover both
66 81system sleep and runtime power management.
67 82
68Bus Driver Methods 83
69------------------ 84Device Power Management Operations
70The core methods to suspend and resume devices reside in struct bus_type. 85----------------------------------
71These are mostly of interest to people writing infrastructure for busses 86Device power management operations, at the subsystem level as well as at the
72like PCI or USB, or because they define the primitives that device drivers 87device driver level, are implemented by defining and populating objects of type
73may need to apply in domain-specific ways to their devices: 88struct dev_pm_ops:
74 89
75struct bus_type { 90struct dev_pm_ops {
76 ... 91 int (*prepare)(struct device *dev);
77 int (*suspend)(struct device *dev, pm_message_t state); 92 void (*complete)(struct device *dev);
78 int (*resume)(struct device *dev); 93 int (*suspend)(struct device *dev);
94 int (*resume)(struct device *dev);
95 int (*freeze)(struct device *dev);
96 int (*thaw)(struct device *dev);
97 int (*poweroff)(struct device *dev);
98 int (*restore)(struct device *dev);
99 int (*suspend_noirq)(struct device *dev);
100 int (*resume_noirq)(struct device *dev);
101 int (*freeze_noirq)(struct device *dev);
102 int (*thaw_noirq)(struct device *dev);
103 int (*poweroff_noirq)(struct device *dev);
104 int (*restore_noirq)(struct device *dev);
105 int (*runtime_suspend)(struct device *dev);
106 int (*runtime_resume)(struct device *dev);
107 int (*runtime_idle)(struct device *dev);
79}; 108};
80 109
81Bus drivers implement those methods as appropriate for the hardware and 110This structure is defined in include/linux/pm.h and the methods included in it
82the drivers using it; PCI works differently from USB, and so on. Not many 111are also described in that file. Their roles will be explained in what follows.
83people write bus drivers; most driver code is a "device driver" that 112For now, it should be sufficient to remember that the last three methods are
84builds on top of bus-specific framework code. 113specific to runtime power management while the remaining ones are used during
114system-wide power transitions.
85 115
86For more information on these driver calls, see the description later; 116There also is a deprecated "old" or "legacy" interface for power management
87they are called in phases for every device, respecting the parent-child 117operations available at least for some subsystems. This approach does not use
88sequencing in the driver model tree. Note that as this is being written, 118struct dev_pm_ops objects and it is suitable only for implementing system sleep
89only the suspend() and resume() are widely available; not many bus drivers 119power management methods. Therefore it is not described in this document, so
90leverage all of those phases, or pass them down to lower driver levels. 120please refer directly to the source code for more information about it.
91 121
92 122
93/sys/devices/.../power/wakeup files 123Subsystem-Level Methods
94----------------------------------- 124-----------------------
95All devices in the driver model have two flags to control handling of 125The core methods to suspend and resume devices reside in struct dev_pm_ops
96wakeup events, which are hardware signals that can force the device and/or 126pointed to by the pm member of struct bus_type, struct device_type and
97system out of a low power state. These are initialized by bus or device 127struct class. They are mostly of interest to the people writing infrastructure
98driver code using device_init_wakeup(dev,can_wakeup). 128for buses, like PCI or USB, or device type and device class drivers.
99 129
100The "can_wakeup" flag just records whether the device (and its driver) can 130Bus drivers implement these methods as appropriate for the hardware and the
101physically support wakeup events. When that flag is clear, the sysfs 131drivers using it; PCI works differently from USB, and so on. Not many people
102"wakeup" file is empty, and device_may_wakeup() returns false. 132write subsystem-level drivers; most driver code is a "device driver" that builds
133on top of bus-specific framework code.
103 134
104For devices that can issue wakeup events, a separate flag controls whether 135For more information on these driver calls, see the description later;
105that device should try to use its wakeup mechanism. The initial value of 136they are called in phases for every device, respecting the parent-child
106device_may_wakeup() will be true, so that the device's "wakeup" file holds 137sequencing in the driver model tree.
107the value "enabled". Userspace can change that to "disabled" so that
108device_may_wakeup() returns false; or change it back to "enabled" (so that
109it returns true again).
110 138
111 139
112EXAMPLE: PCI Device Driver Methods 140/sys/devices/.../power/wakeup files
113----------------------------------- 141-----------------------------------
114PCI framework software calls these methods when the PCI device driver bound 142All devices in the driver model have two flags to control handling of wakeup
115to a device device has provided them: 143events (hardware signals that can force the device and/or system out of a low
116 144power state). These flags are initialized by bus or device driver code using
117struct pci_driver { 145device_set_wakeup_capable() and device_set_wakeup_enable(), defined in
118 ... 146include/linux/pm_wakeup.h.
119 int (*suspend)(struct pci_device *pdev, pm_message_t state);
120 int (*suspend_late)(struct pci_device *pdev, pm_message_t state);
121 147
122 int (*resume_early)(struct pci_device *pdev); 148The "can_wakeup" flag just records whether the device (and its driver) can
123 int (*resume)(struct pci_device *pdev); 149physically support wakeup events. The device_set_wakeup_capable() routine
124}; 150affects this flag. The "should_wakeup" flag controls whether the device should
125 151try to use its wakeup mechanism. device_set_wakeup_enable() affects this flag;
126Drivers will implement those methods, and call PCI-specific procedures 152for the most part drivers should not change its value. The initial value of
127like pci_set_power_state(), pci_enable_wake(), pci_save_state(), and 153should_wakeup is supposed to be false for the majority of devices; the major
128pci_restore_state() to manage PCI-specific mechanisms. (PCI config space 154exceptions are power buttons, keyboards, and Ethernet adapters whose WoL
129could be saved during driver probe, if it weren't for the fact that some 155(wake-on-LAN) feature has been set up with ethtool.
130systems rely on userspace tweaking using setpci.) Devices are suspended 156
131before their bridges enter low power states, and likewise bridges resume 157Whether or not a device is capable of issuing wakeup events is a hardware
132before their devices. 158matter, and the kernel is responsible for keeping track of it. By contrast,
133 159whether or not a wakeup-capable device should issue wakeup events is a policy
134 160decision, and it is managed by user space through a sysfs attribute: the
135Upper Layers of Driver Stacks 161power/wakeup file. User space can write the strings "enabled" or "disabled" to
136----------------------------- 162set or clear the should_wakeup flag, respectively. Reads from the file will
137Device drivers generally have at least two interfaces, and the methods 163return the corresponding string if can_wakeup is true, but if can_wakeup is
138sketched above are the ones which apply to the lower level (nearer PCI, USB, 164false then reads will return an empty string, to indicate that the device
139or other bus hardware). The network and block layers are examples of upper 165doesn't support wakeup events. (But even though the file appears empty, writes
140level interfaces, as is a character device talking to userspace. 166will still affect the should_wakeup flag.)
141 167
142Power management requests normally need to flow through those upper levels, 168The device_may_wakeup() routine returns true only if both flags are set.
143which often use domain-oriented requests like "blank that screen". In 169Drivers should check this routine when putting devices in a low-power state
144some cases those upper levels will have power management intelligence that 170during a system sleep transition, to see whether or not to enable the devices'
145relates to end-user activity, or other devices that work in cooperation. 171wakeup mechanisms. However for runtime power management, wakeup events should
146 172be enabled whenever the device and driver both support them, regardless of the
147When those interfaces are structured using class interfaces, there is a 173should_wakeup flag.
148standard way to have the upper layer stop issuing requests to a given 174
149class device (and restart later): 175
150 176/sys/devices/.../power/control files
151struct class { 177------------------------------------
152 ... 178Each device in the driver model has a flag to control whether it is subject to
153 int (*suspend)(struct device *dev, pm_message_t state); 179runtime power management. This flag, called runtime_auto, is initialized by the
154 int (*resume)(struct device *dev); 180bus type (or generally subsystem) code using pm_runtime_allow() or
155}; 181pm_runtime_forbid(); the default is to allow runtime power management.
156 182
157Those calls are issued in specific phases of the process by which the 183The setting can be adjusted by user space by writing either "on" or "auto" to
158system enters a low power "suspend" state, or resumes from it. 184the device's power/control sysfs file. Writing "auto" calls pm_runtime_allow(),
159 185setting the flag and allowing the device to be runtime power-managed by its
160 186driver. Writing "on" calls pm_runtime_forbid(), clearing the flag, returning
161Calling Drivers to Enter System Sleep States 187the device to full power if it was in a low-power state, and preventing the
162============================================ 188device from being runtime power-managed. User space can check the current value
163When the system enters a low power state, each device's driver is asked 189of the runtime_auto flag by reading the file.
164to suspend the device by putting it into state compatible with the target 190
191The device's runtime_auto flag has no effect on the handling of system-wide
192power transitions. In particular, the device can (and in the majority of cases
193should and will) be put into a low-power state during a system-wide transition
194to a sleep state even though its runtime_auto flag is clear.
195
196For more information about the runtime power management framework, refer to
197Documentation/power/runtime_pm.txt.
198
199
200Calling Drivers to Enter and Leave System Sleep States
201======================================================
202When the system goes into a sleep state, each device's driver is asked to
203suspend the device by putting it into a state compatible with the target
165system state. That's usually some version of "off", but the details are 204system state. That's usually some version of "off", but the details are
166system-specific. Also, wakeup-enabled devices will usually stay partly 205system-specific. Also, wakeup-enabled devices will usually stay partly
167functional in order to wake the system. 206functional in order to wake the system.
168 207
169When the system leaves that low power state, the device's driver is asked 208When the system leaves that low-power state, the device's driver is asked to
170to resume it. The suspend and resume operations always go together, and 209resume it by returning it to full power. The suspend and resume operations
171both are multi-phase operations. 210always go together, and both are multi-phase operations.
172 211
173For simple drivers, suspend might quiesce the device using the class code 212For simple drivers, suspend might quiesce the device using class code
174and then turn its hardware as "off" as possible with late_suspend. The 213and then turn its hardware as "off" as possible during suspend_noirq. The
175matching resume calls would then completely reinitialize the hardware 214matching resume calls would then completely reinitialize the hardware
176before reactivating its class I/O queues. 215before reactivating its class I/O queues.
177 216
178More power-aware drivers drivers will use more than one device low power 217More power-aware drivers might prepare the devices for triggering system wakeup
179state, either at runtime or during system sleep states, and might trigger 218events.
180system wakeup events.
181 219
182 220
183Call Sequence Guarantees 221Call Sequence Guarantees
184------------------------ 222------------------------
185To ensure that bridges and similar links needed to talk to a device are 223To ensure that bridges and similar links needing to talk to a device are
186available when the device is suspended or resumed, the device tree is 224available when the device is suspended or resumed, the device tree is
187walked in a bottom-up order to suspend devices. A top-down order is 225walked in a bottom-up order to suspend devices. A top-down order is
188used to resume those devices. 226used to resume those devices.
@@ -194,67 +232,310 @@ its parent; and can't be removed or suspended after that parent.
194The policy is that the device tree should match hardware bus topology. 232The policy is that the device tree should match hardware bus topology.
195(Or at least the control bus, for devices which use multiple busses.) 233(Or at least the control bus, for devices which use multiple busses.)
196In particular, this means that a device registration may fail if the parent of 234In particular, this means that a device registration may fail if the parent of
197the device is suspending (ie. has been chosen by the PM core as the next 235the device is suspending (i.e. has been chosen by the PM core as the next
198device to suspend) or has already suspended, as well as after all of the other 236device to suspend) or has already suspended, as well as after all of the other
199devices have been suspended. Device drivers must be prepared to cope with such 237devices have been suspended. Device drivers must be prepared to cope with such
200situations. 238situations.
201 239
202 240
203Suspending Devices 241System Power Management Phases
204------------------ 242------------------------------
205Suspending a given device is done in several phases. Suspending the 243Suspending or resuming the system is done in several phases. Different phases
206system always includes every phase, executing calls for every device 244are used for standby or memory sleep states ("suspend-to-RAM") and the
207before the next phase begins. Not all busses or classes support all 245hibernation state ("suspend-to-disk"). Each phase involves executing callbacks
208these callbacks; and not all drivers use all the callbacks. 246for every device before the next phase begins. Not all busses or classes
247support all these callbacks and not all drivers use all the callbacks. The
248various phases always run after tasks have been frozen and before they are
249unfrozen. Furthermore, the *_noirq phases run at a time when IRQ handlers have
250been disabled (except for those marked with the IRQ_WAKEUP flag).
209 251
210The phases are seen by driver notifications issued in this order: 252Most phases use bus, type, and class callbacks (that is, methods defined in
253dev->bus->pm, dev->type->pm, and dev->class->pm). The prepare and complete
254phases are exceptions; they use only bus callbacks. When multiple callbacks
255are used in a phase, they are invoked in the order: <class, type, bus> during
256power-down transitions and in the opposite order during power-up transitions.
257For example, during the suspend phase the PM core invokes
211 258
212 1 class.suspend(dev, message) is called after tasks are frozen, for 259 dev->class->pm.suspend(dev);
213 devices associated with a class that has such a method. This 260 dev->type->pm.suspend(dev);
214 method may sleep. 261 dev->bus->pm.suspend(dev);
215 262
216 Since I/O activity usually comes from such higher layers, this is 263before moving on to the next device, whereas during the resume phase the core
217 a good place to quiesce all drivers of a given type (and keep such 264invokes
218 code out of those drivers).
219 265
220 2 bus.suspend(dev, message) is called next. This method may sleep, 266 dev->bus->pm.resume(dev);
221 and is often morphed into a device driver call with bus-specific 267 dev->type->pm.resume(dev);
222 parameters and/or rules. 268 dev->class->pm.resume(dev);
223 269
224 This call should handle parts of device suspend logic that require 270These callbacks may in turn invoke device- or driver-specific methods stored in
225 sleeping. It probably does work to quiesce the device which hasn't 271dev->driver->pm, but they don't have to.
226 been abstracted into class.suspend().
227 272
228The pm_message_t parameter is currently used to refine those semantics
229(described later).
230 273
231At the end of those phases, drivers should normally have stopped all I/O 274Entering System Suspend
232transactions (DMA, IRQs), saved enough state that they can re-initialize 275-----------------------
233or restore previous state (as needed by the hardware), and placed the 276When the system goes into the standby or memory sleep state, the phases are:
234device into a low-power state. On many platforms they will also use 277
235clk_disable() to gate off one or more clock sources; sometimes they will 278 prepare, suspend, suspend_noirq.
236also switch off power supplies, or reduce voltages. Drivers which have 279
237runtime PM support may already have performed some or all of the steps 280 1. The prepare phase is meant to prevent races by preventing new devices
238needed to prepare for the upcoming system sleep state. 281 from being registered; the PM core would never know that all the
282 children of a device had been suspended if new children could be
283 registered at will. (By contrast, devices may be unregistered at any
284 time.) Unlike the other suspend-related phases, during the prepare
285 phase the device tree is traversed top-down.
286
287 The prepare phase uses only a bus callback. After the callback method
288 returns, no new children may be registered below the device. The method
289 may also prepare the device or driver in some way for the upcoming
290 system power transition, but it should not put the device into a
291 low-power state.
292
293 2. The suspend methods should quiesce the device to stop it from performing
294 I/O. They also may save the device registers and put it into the
295 appropriate low-power state, depending on the bus type the device is on,
296 and they may enable wakeup events.
297
298 3. The suspend_noirq phase occurs after IRQ handlers have been disabled,
299 which means that the driver's interrupt handler will not be called while
300 the callback method is running. The methods should save the values of
301 the device's registers that weren't saved previously and finally put the
302 device into the appropriate low-power state.
303
304 The majority of subsystems and device drivers need not implement this
305 callback. However, bus types allowing devices to share interrupt
306 vectors, like PCI, generally need it; otherwise a driver might encounter
307 an error during the suspend phase by fielding a shared interrupt
308 generated by some other device after its own device had been set to low
309 power.
310
311At the end of these phases, drivers should have stopped all I/O transactions
312(DMA, IRQs), saved enough state that they can re-initialize or restore previous
313state (as needed by the hardware), and placed the device into a low-power state.
314On many platforms they will gate off one or more clock sources; sometimes they
315will also switch off power supplies or reduce voltages. (Drivers supporting
316runtime PM may already have performed some or all of these steps.)
317
318If device_may_wakeup(dev) returns true, the device should be prepared for
319generating hardware wakeup signals to trigger a system wakeup event when the
320system is in the sleep state. For example, enable_irq_wake() might identify
321GPIO signals hooked up to a switch or other external hardware, and
322pci_enable_wake() does something similar for the PCI PME signal.
323
324If any of these callbacks returns an error, the system won't enter the desired
325low-power state. Instead the PM core will unwind its actions by resuming all
326the devices that were suspended.
327
328
329Leaving System Suspend
330----------------------
331When resuming from standby or memory sleep, the phases are:
332
333 resume_noirq, resume, complete.
334
335 1. The resume_noirq callback methods should perform any actions needed
336 before the driver's interrupt handlers are invoked. This generally
337 means undoing the actions of the suspend_noirq phase. If the bus type
338 permits devices to share interrupt vectors, like PCI, the method should
339 bring the device and its driver into a state in which the driver can
340 recognize if the device is the source of incoming interrupts, if any,
341 and handle them correctly.
342
343 For example, the PCI bus type's ->pm.resume_noirq() puts the device into
344 the full-power state (D0 in the PCI terminology) and restores the
345 standard configuration registers of the device. Then it calls the
346 device driver's ->pm.resume_noirq() method to perform device-specific
347 actions.
348
349 2. The resume methods should bring the the device back to its operating
350 state, so that it can perform normal I/O. This generally involves
351 undoing the actions of the suspend phase.
352
353 3. The complete phase uses only a bus callback. The method should undo the
354 actions of the prepare phase. Note, however, that new children may be
355 registered below the device as soon as the resume callbacks occur; it's
356 not necessary to wait until the complete phase.
357
358At the end of these phases, drivers should be as functional as they were before
359suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are
360gated on. Even if the device was in a low-power state before the system sleep
361because of runtime power management, afterwards it should be back in its
362full-power state. There are multiple reasons why it's best to do this; they are
363discussed in more detail in Documentation/power/runtime_pm.txt.
239 364
240When any driver sees that its device_can_wakeup(dev), it should make sure 365However, the details here may again be platform-specific. For example,
241to use the relevant hardware signals to trigger a system wakeup event. 366some systems support multiple "run" states, and the mode in effect at
242For example, enable_irq_wake() might identify GPIO signals hooked up to 367the end of resume might not be the one which preceded suspension.
243a switch or other external hardware, and pci_enable_wake() does something 368That means availability of certain clocks or power supplies changed,
244similar for PCI's PME# signal. 369which could easily affect how a driver works.
370
371Drivers need to be able to handle hardware which has been reset since the
372suspend methods were called, for example by complete reinitialization.
373This may be the hardest part, and the one most protected by NDA'd documents
374and chip errata. It's simplest if the hardware state hasn't changed since
375the suspend was carried out, but that can't be guaranteed (in fact, it ususally
376is not the case).
377
378Drivers must also be prepared to notice that the device has been removed
379while the system was powered down, whenever that's physically possible.
380PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses
381where common Linux platforms will see such removal. Details of how drivers
382will notice and handle such removals are currently bus-specific, and often
383involve a separate thread.
384
385These callbacks may return an error value, but the PM core will ignore such
386errors since there's nothing it can do about them other than printing them in
387the system log.
388
389
390Entering Hibernation
391--------------------
392Hibernating the system is more complicated than putting it into the standby or
393memory sleep state, because it involves creating and saving a system image.
394Therefore there are more phases for hibernation, with a different set of
395callbacks. These phases always run after tasks have been frozen and memory has
396been freed.
397
398The general procedure for hibernation is to quiesce all devices (freeze), create
399an image of the system memory while everything is stable, reactivate all
400devices (thaw), write the image to permanent storage, and finally shut down the
401system (poweroff). The phases used to accomplish this are:
402
403 prepare, freeze, freeze_noirq, thaw_noirq, thaw, complete,
404 prepare, poweroff, poweroff_noirq
405
406 1. The prepare phase is discussed in the "Entering System Suspend" section
407 above.
408
409 2. The freeze methods should quiesce the device so that it doesn't generate
410 IRQs or DMA, and they may need to save the values of device registers.
411 However the device does not have to be put in a low-power state, and to
412 save time it's best not to do so. Also, the device should not be
413 prepared to generate wakeup events.
414
415 3. The freeze_noirq phase is analogous to the suspend_noirq phase discussed
416 above, except again that the device should not be put in a low-power
417 state and should not be allowed to generate wakeup events.
418
419At this point the system image is created. All devices should be inactive and
420the contents of memory should remain undisturbed while this happens, so that the
421image forms an atomic snapshot of the system state.
422
423 4. The thaw_noirq phase is analogous to the resume_noirq phase discussed
424 above. The main difference is that its methods can assume the device is
425 in the same state as at the end of the freeze_noirq phase.
426
427 5. The thaw phase is analogous to the resume phase discussed above. Its
428 methods should bring the device back to an operating state, so that it
429 can be used for saving the image if necessary.
430
431 6. The complete phase is discussed in the "Leaving System Suspend" section
432 above.
433
434At this point the system image is saved, and the devices then need to be
435prepared for the upcoming system shutdown. This is much like suspending them
436before putting the system into the standby or memory sleep state, and the phases
437are similar.
438
439 7. The prepare phase is discussed above.
440
441 8. The poweroff phase is analogous to the suspend phase.
442
443 9. The poweroff_noirq phase is analogous to the suspend_noirq phase.
444
445The poweroff and poweroff_noirq callbacks should do essentially the same things
446as the suspend and suspend_noirq callbacks. The only notable difference is that
447they need not store the device register values, because the registers should
448already have been stored during the freeze or freeze_noirq phases.
449
450
451Leaving Hibernation
452-------------------
453Resuming from hibernation is, again, more complicated than resuming from a sleep
454state in which the contents of main memory are preserved, because it requires
455a system image to be loaded into memory and the pre-hibernation memory contents
456to be restored before control can be passed back to the image kernel.
457
458Although in principle, the image might be loaded into memory and the
459pre-hibernation memory contents restored by the boot loader, in practice this
460can't be done because boot loaders aren't smart enough and there is no
461established protocol for passing the necessary information. So instead, the
462boot loader loads a fresh instance of the kernel, called the boot kernel, into
463memory and passes control to it in the usual way. Then the boot kernel reads
464the system image, restores the pre-hibernation memory contents, and passes
465control to the image kernel. Thus two different kernels are involved in
466resuming from hibernation. In fact, the boot kernel may be completely different
467from the image kernel: a different configuration and even a different version.
468This has important consequences for device drivers and their subsystems.
469
470To be able to load the system image into memory, the boot kernel needs to
471include at least a subset of device drivers allowing it to access the storage
472medium containing the image, although it doesn't need to include all of the
473drivers present in the image kernel. After the image has been loaded, the
474devices managed by the boot kernel need to be prepared for passing control back
475to the image kernel. This is very similar to the initial steps involved in
476creating a system image, and it is accomplished in the same way, using prepare,
477freeze, and freeze_noirq phases. However the devices affected by these phases
478are only those having drivers in the boot kernel; other devices will still be in
479whatever state the boot loader left them.
480
481Should the restoration of the pre-hibernation memory contents fail, the boot
482kernel would go through the "thawing" procedure described above, using the
483thaw_noirq, thaw, and complete phases, and then continue running normally. This
484happens only rarely. Most often the pre-hibernation memory contents are
485restored successfully and control is passed to the image kernel, which then
486becomes responsible for bringing the system back to the working state.
487
488To achieve this, the image kernel must restore the devices' pre-hibernation
489functionality. The operation is much like waking up from the memory sleep
490state, although it involves different phases:
491
492 restore_noirq, restore, complete
493
494 1. The restore_noirq phase is analogous to the resume_noirq phase.
495
496 2. The restore phase is analogous to the resume phase.
497
498 3. The complete phase is discussed above.
499
500The main difference from resume[_noirq] is that restore[_noirq] must assume the
501device has been accessed and reconfigured by the boot loader or the boot kernel.
502Consequently the state of the device may be different from the state remembered
503from the freeze and freeze_noirq phases. The device may even need to be reset
504and completely re-initialized. In many cases this difference doesn't matter, so
505the resume[_noirq] and restore[_norq] method pointers can be set to the same
506routines. Nevertheless, different callback pointers are used in case there is a
507situation where it actually matters.
245 508
246If a driver (or bus, or class) fails it suspend method, the system won't
247enter the desired low power state; it will resume all the devices it's
248suspended so far.
249 509
250Note that drivers may need to perform different actions based on the target 510System Devices
251system lowpower/sleep state. At this writing, there are only platform 511--------------
252specific APIs through which drivers could determine those target states. 512System devices (sysdevs) follow a slightly different API, which can be found in
513
514 include/linux/sysdev.h
515 drivers/base/sys.c
516
517System devices will be suspended with interrupts disabled, and after all other
518devices have been suspended. On resume, they will be resumed before any other
519devices, and also with interrupts disabled. These things occur in special
520"sysdev_driver" phases, which affect only system devices.
521
522Thus, after the suspend_noirq (or freeze_noirq or poweroff_noirq) phase, when
523the non-boot CPUs are all offline and IRQs are disabled on the remaining online
524CPU, then a sysdev_driver.suspend phase is carried out, and the system enters a
525sleep state (or a system image is created). During resume (or after the image
526has been created or loaded) a sysdev_driver.resume phase is carried out, IRQs
527are enabled on the only online CPU, the non-boot CPUs are enabled, and the
528resume_noirq (or thaw_noirq or restore_noirq) phase begins.
529
530Code to actually enter and exit the system-wide low power state sometimes
531involves hardware details that are only known to the boot firmware, and
532may leave a CPU running software (from SRAM or flash memory) that monitors
533the system and manages its wakeup sequence.
253 534
254 535
255Device Low Power (suspend) States 536Device Low Power (suspend) States
256--------------------------------- 537---------------------------------
257Device low-power states aren't very standard. One device might only handle 538Device low-power states aren't standard. One device might only handle
258"on" and "off, while another might support a dozen different versions of 539"on" and "off, while another might support a dozen different versions of
259"on" (how many engines are active?), plus a state that gets back to "on" 540"on" (how many engines are active?), plus a state that gets back to "on"
260faster than from a full "off". 541faster than from a full "off".
@@ -265,7 +546,7 @@ PCI device may not perform DMA or issue IRQs, and any wakeup events it
265issues would be issued through the PME# bus signal. Plus, there are 546issues would be issued through the PME# bus signal. Plus, there are
266several PCI-standard device states, some of which are optional. 547several PCI-standard device states, some of which are optional.
267 548
268In contrast, integrated system-on-chip processors often use irqs as the 549In contrast, integrated system-on-chip processors often use IRQs as the
269wakeup event sources (so drivers would call enable_irq_wake) and might 550wakeup event sources (so drivers would call enable_irq_wake) and might
270be able to treat DMA completion as a wakeup event (sometimes DMA can stay 551be able to treat DMA completion as a wakeup event (sometimes DMA can stay
271active too, it'd only be the CPU and some peripherals that sleep). 552active too, it'd only be the CPU and some peripherals that sleep).
@@ -284,120 +565,17 @@ ways; the aforementioned LCD might be active in one product's "standby",
284but a different product using the same SOC might work differently. 565but a different product using the same SOC might work differently.
285 566
286 567
287Meaning of pm_message_t.event 568Power Management Notifiers
288----------------------------- 569--------------------------
289Parameters to suspend calls include the device affected and a message of 570There are some operations that cannot be carried out by the power management
290type pm_message_t, which has one field: the event. If driver does not 571callbacks discussed above, because the callbacks occur too late or too early.
291recognize the event code, suspend calls may abort the request and return 572To handle these cases, subsystems and device drivers may register power
292a negative errno. However, most drivers will be fine if they implement 573management notifiers that are called before tasks are frozen and after they have
293PM_EVENT_SUSPEND semantics for all messages. 574been thawed. Generally speaking, the PM notifiers are suitable for performing
575actions that either require user space to be available, or at least won't
576interfere with user space.
294 577
295The event codes are used to refine the goal of suspending the device, and 578For details refer to Documentation/power/notifiers.txt.
296mostly matter when creating or resuming system memory image snapshots, as
297used with suspend-to-disk:
298
299 PM_EVENT_SUSPEND -- quiesce the driver and put hardware into a low-power
300 state. When used with system sleep states like "suspend-to-RAM" or
301 "standby", the upcoming resume() call will often be able to rely on
302 state kept in hardware, or issue system wakeup events.
303
304 PM_EVENT_HIBERNATE -- Put hardware into a low-power state and enable wakeup
305 events as appropriate. It is only used with hibernation
306 (suspend-to-disk) and few devices are able to wake up the system from
307 this state; most are completely powered off.
308
309 PM_EVENT_FREEZE -- quiesce the driver, but don't necessarily change into
310 any low power mode. A system snapshot is about to be taken, often
311 followed by a call to the driver's resume() method. Neither wakeup
312 events nor DMA are allowed.
313
314 PM_EVENT_PRETHAW -- quiesce the driver, knowing that the upcoming resume()
315 will restore a suspend-to-disk snapshot from a different kernel image.
316 Drivers that are smart enough to look at their hardware state during
317 resume() processing need that state to be correct ... a PRETHAW could
318 be used to invalidate that state (by resetting the device), like a
319 shutdown() invocation would before a kexec() or system halt. Other
320 drivers might handle this the same way as PM_EVENT_FREEZE. Neither
321 wakeup events nor DMA are allowed.
322
323To enter "standby" (ACPI S1) or "Suspend to RAM" (STR, ACPI S3) states, or
324the similarly named APM states, only PM_EVENT_SUSPEND is used; the other event
325codes are used for hibernation ("Suspend to Disk", STD, ACPI S4).
326
327There's also PM_EVENT_ON, a value which never appears as a suspend event
328but is sometimes used to record the "not suspended" device state.
329
330
331Resuming Devices
332----------------
333Resuming is done in multiple phases, much like suspending, with all
334devices processing each phase's calls before the next phase begins.
335
336The phases are seen by driver notifications issued in this order:
337
338 1 bus.resume(dev) reverses the effects of bus.suspend(). This may
339 be morphed into a device driver call with bus-specific parameters;
340 implementations may sleep.
341
342 2 class.resume(dev) is called for devices associated with a class
343 that has such a method. Implementations may sleep.
344
345 This reverses the effects of class.suspend(), and would usually
346 reactivate the device's I/O queue.
347
348At the end of those phases, drivers should normally be as functional as
349they were before suspending: I/O can be performed using DMA and IRQs, and
350the relevant clocks are gated on. The device need not be "fully on"; it
351might be in a runtime lowpower/suspend state that acts as if it were.
352
353However, the details here may again be platform-specific. For example,
354some systems support multiple "run" states, and the mode in effect at
355the end of resume() might not be the one which preceded suspension.
356That means availability of certain clocks or power supplies changed,
357which could easily affect how a driver works.
358
359
360Drivers need to be able to handle hardware which has been reset since the
361suspend methods were called, for example by complete reinitialization.
362This may be the hardest part, and the one most protected by NDA'd documents
363and chip errata. It's simplest if the hardware state hasn't changed since
364the suspend() was called, but that can't always be guaranteed.
365
366Drivers must also be prepared to notice that the device has been removed
367while the system was powered off, whenever that's physically possible.
368PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses
369where common Linux platforms will see such removal. Details of how drivers
370will notice and handle such removals are currently bus-specific, and often
371involve a separate thread.
372
373
374Note that the bus-specific runtime PM wakeup mechanism can exist, and might
375be defined to share some of the same driver code as for system wakeup. For
376example, a bus-specific device driver's resume() method might be used there,
377so it wouldn't only be called from bus.resume() during system-wide wakeup.
378See bus-specific information about how runtime wakeup events are handled.
379
380
381System Devices
382--------------
383System devices follow a slightly different API, which can be found in
384
385 include/linux/sysdev.h
386 drivers/base/sys.c
387
388System devices will only be suspended with interrupts disabled, and after
389all other devices have been suspended. On resume, they will be resumed
390before any other devices, and also with interrupts disabled.
391
392That is, IRQs are disabled, the suspend_late() phase begins, then the
393sysdev_driver.suspend() phase, and the system enters a sleep state. Then
394the sysdev_driver.resume() phase begins, followed by the resume_early()
395phase, after which IRQs are enabled.
396
397Code to actually enter and exit the system-wide low power state sometimes
398involves hardware details that are only known to the boot firmware, and
399may leave a CPU running software (from SRAM or flash memory) that monitors
400the system and manages its wakeup sequence.
401 579
402 580
403Runtime Power Management 581Runtime Power Management
@@ -407,82 +585,23 @@ running. This feature is useful for devices that are not being used, and
407can offer significant power savings on a running system. These devices 585can offer significant power savings on a running system. These devices
408often support a range of runtime power states, which might use names such 586often support a range of runtime power states, which might use names such
409as "off", "sleep", "idle", "active", and so on. Those states will in some 587as "off", "sleep", "idle", "active", and so on. Those states will in some
410cases (like PCI) be partially constrained by a bus the device uses, and will 588cases (like PCI) be partially constrained by the bus the device uses, and will
411usually include hardware states that are also used in system sleep states. 589usually include hardware states that are also used in system sleep states.
412 590
413However, note that if a driver puts a device into a runtime low power state 591A system-wide power transition can be started while some devices are in low
414and the system then goes into a system-wide sleep state, it normally ought 592power states due to runtime power management. The system sleep PM callbacks
415to resume into that runtime low power state rather than "full on". Such 593should recognize such situations and react to them appropriately, but the
416distinctions would be part of the driver-internal state machine for that 594necessary actions are subsystem-specific.
417hardware; the whole point of runtime power management is to be sure that 595
418drivers are decoupled in that way from the state machine governing phases 596In some cases the decision may be made at the subsystem level while in other
419of the system-wide power/sleep state transitions. 597cases the device driver may be left to decide. In some cases it may be
420 598desirable to leave a suspended device in that state during a system-wide power
421 599transition, but in other cases the device must be put back into the full-power
422Power Saving Techniques 600state temporarily, for example so that its system wakeup capability can be
423----------------------- 601disabled. This all depends on the hardware and the design of the subsystem and
424Normally runtime power management is handled by the drivers without specific 602device driver in question.
425userspace or kernel intervention, by device-aware use of techniques like: 603
426 604During system-wide resume from a sleep state it's best to put devices into the
427 Using information provided by other system layers 605full-power state, as explained in Documentation/power/runtime_pm.txt. Refer to
428 - stay deeply "off" except between open() and close() 606that document for more information regarding this particular issue as well as
429 - if transceiver/PHY indicates "nobody connected", stay "off" 607for information on the device runtime power management framework in general.
430 - application protocols may include power commands or hints
431
432 Using fewer CPU cycles
433 - using DMA instead of PIO
434 - removing timers, or making them lower frequency
435 - shortening "hot" code paths
436 - eliminating cache misses
437 - (sometimes) offloading work to device firmware
438
439 Reducing other resource costs
440 - gating off unused clocks in software (or hardware)
441 - switching off unused power supplies
442 - eliminating (or delaying/merging) IRQs
443 - tuning DMA to use word and/or burst modes
444
445 Using device-specific low power states
446 - using lower voltages
447 - avoiding needless DMA transfers
448
449Read your hardware documentation carefully to see the opportunities that
450may be available. If you can, measure the actual power usage and check
451it against the budget established for your project.
452
453
454Examples: USB hosts, system timer, system CPU
455----------------------------------------------
456USB host controllers make interesting, if complex, examples. In many cases
457these have no work to do: no USB devices are connected, or all of them are
458in the USB "suspend" state. Linux host controller drivers can then disable
459periodic DMA transfers that would otherwise be a constant power drain on the
460memory subsystem, and enter a suspend state. In power-aware controllers,
461entering that suspend state may disable the clock used with USB signaling,
462saving a certain amount of power.
463
464The controller will be woken from that state (with an IRQ) by changes to the
465signal state on the data lines of a given port, for example by an existing
466peripheral requesting "remote wakeup" or by plugging a new peripheral. The
467same wakeup mechanism usually works from "standby" sleep states, and on some
468systems also from "suspend to RAM" (or even "suspend to disk") states.
469(Except that ACPI may be involved instead of normal IRQs, on some hardware.)
470
471System devices like timers and CPUs may have special roles in the platform
472power management scheme. For example, system timers using a "dynamic tick"
473approach don't just save CPU cycles (by eliminating needless timer IRQs),
474but they may also open the door to using lower power CPU "idle" states that
475cost more than a jiffie to enter and exit. On x86 systems these are states
476like "C3"; note that periodic DMA transfers from a USB host controller will
477also prevent entry to a C3 state, much like a periodic timer IRQ.
478
479That kind of runtime mechanism interaction is common. "System On Chip" (SOC)
480processors often have low power idle modes that can't be entered unless
481certain medium-speed clocks (often 12 or 48 MHz) are gated off. When the
482drivers gate those clocks effectively, then the system idle task may be able
483to use the lower power idle modes and thereby increase battery life.
484
485If the CPU can have a "cpufreq" driver, there also may be opportunities
486to shift to lower voltage settings and reduce the power cost of executing
487a given number of instructions. (Without voltage adjustment, it's rare
488for cpufreq to save much power; the cost-per-instruction must go down.)
diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt
index dd8fe43888d3..62328d76b55b 100644
--- a/Documentation/power/pci.txt
+++ b/Documentation/power/pci.txt
@@ -1,299 +1,1025 @@
1
2PCI Power Management 1PCI Power Management
3~~~~~~~~~~~~~~~~~~~~
4 2
5An overview of the concepts and the related functions in the Linux kernel 3Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
4
5An overview of concepts and the Linux kernel's interfaces related to PCI power
6management. Based on previous work by Patrick Mochel <mochel@transmeta.com>
7(and others).
6 8
7Patrick Mochel <mochel@transmeta.com> 9This document only covers the aspects of power management specific to PCI
8(and others) 10devices. For general description of the kernel's interfaces related to device
11power management refer to Documentation/power/devices.txt and
12Documentation/power/runtime_pm.txt.
9 13
10--------------------------------------------------------------------------- 14---------------------------------------------------------------------------
11 15
121. Overview 161. Hardware and Platform Support for PCI Power Management
132. How the PCI Subsystem Does Power Management 172. PCI Subsystem and Device Power Management
143. PCI Utility Functions 183. PCI Device Drivers and Power Management
154. PCI Device Drivers 194. Resources
165. Resources 20
17 21
181. Overview 221. Hardware and Platform Support for PCI Power Management
19~~~~~~~~~~~ 23=========================================================
20 24
21The PCI Power Management Specification was introduced between the PCI 2.1 and 251.1. Native and Platform-Based Power Management
22PCI 2.2 Specifications. It a standard interface for controlling various 26-----------------------------------------------
23power management operations. 27In general, power management is a feature allowing one to save energy by putting
24 28devices into states in which they draw less power (low-power states) at the
25Implementation of the PCI PM Spec is optional, as are several sub-components of 29price of reduced functionality or performance.
26it. If a device supports the PCI PM Spec, the device will have an 8 byte 30
27capability field in its PCI configuration space. This field is used to describe 31Usually, a device is put into a low-power state when it is underutilized or
28and control the standard PCI power management features. 32completely inactive. However, when it is necessary to use the device once
29 33again, it has to be put back into the "fully functional" state (full-power
30The PCI PM spec defines 4 operating states for devices (D0 - D3) and for buses 34state). This may happen when there are some data for the device to handle or
31(B0 - B3). The higher the number, the less power the device consumes. However, 35as a result of an external event requiring the device to be active, which may
32the higher the number, the longer the latency is for the device to return to 36be signaled by the device itself.
33an operational state (D0). 37
34 38PCI devices may be put into low-power states in two ways, by using the device
35There are actually two D3 states. When someone talks about D3, they usually 39capabilities introduced by the PCI Bus Power Management Interface Specification,
36mean D3hot, which corresponds to an ACPI D2 state (power is reduced, the 40or with the help of platform firmware, such as an ACPI BIOS. In the first
37device may lose some context). But they may also mean D3cold, which is an 41approach, that is referred to as the native PCI power management (native PCI PM)
38ACPI D3 state (power is fully off, all state was discarded); or both. 42in what follows, the device power state is changed as a result of writing a
39 43specific value into one of its standard configuration registers. The second
40Bus power management is not covered in this version of this document. 44approach requires the platform firmware to provide special methods that may be
41 45used by the kernel to change the device's power state.
42Note that all PCI devices support D0 and D3cold by default, regardless of 46
43whether or not they implement any of the PCI PM spec. 47Devices supporting the native PCI PM usually can generate wakeup signals called
44 48Power Management Events (PMEs) to let the kernel know about external events
45The possible state transitions that a device can undergo are: 49requiring the device to be active. After receiving a PME the kernel is supposed
46 50to put the device that sent it into the full-power state. However, the PCI Bus
47+---------------------------+ 51Power Management Interface Specification doesn't define any standard method of
48| Current State | New State | 52delivering the PME from the device to the CPU and the operating system kernel.
49+---------------------------+ 53It is assumed that the platform firmware will perform this task and therefore,
50| D0 | D1, D2, D3| 54even though a PCI device is set up to generate PMEs, it also may be necessary to
51+---------------------------+ 55prepare the platform firmware for notifying the CPU of the PMEs coming from the
52| D1 | D2, D3 | 56device (e.g. by generating interrupts).
53+---------------------------+ 57
54| D2 | D3 | 58In turn, if the methods provided by the platform firmware are used for changing
55+---------------------------+ 59the power state of a device, usually the platform also provides a method for
56| D1, D2, D3 | D0 | 60preparing the device to generate wakeup signals. In that case, however, it
57+---------------------------+ 61often also is necessary to prepare the device for generating PMEs using the
58 62native PCI PM mechanism, because the method provided by the platform depends on
59Note that when the system is entering a global suspend state, all devices will 63that.
60be placed into D3 and when resuming, all devices will be placed into D0. 64
61However, when the system is running, other state transitions are possible. 65Thus in many situations both the native and the platform-based power management
62 66mechanisms have to be used simultaneously to obtain the desired result.
632. How The PCI Subsystem Handles Power Management 67
64~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 681.2. Native PCI Power Management
65 69--------------------------------
66The PCI suspend/resume functionality is accessed indirectly via the Power 70The PCI Bus Power Management Interface Specification (PCI PM Spec) was
67Management subsystem. At boot, the PCI driver registers a power management 71introduced between the PCI 2.1 and PCI 2.2 Specifications. It defined a
68callback with that layer. Upon entering a suspend state, the PM layer iterates 72standard interface for performing various operations related to power
69through all of its registered callbacks. This currently takes place only during 73management.
70APM state transitions. 74
71 75The implementation of the PCI PM Spec is optional for conventional PCI devices,
72Upon going to sleep, the PCI subsystem walks its device tree twice. Both times, 76but it is mandatory for PCI Express devices. If a device supports the PCI PM
73it does a depth first walk of the device tree. The first walk saves each of the 77Spec, it has an 8 byte power management capability field in its PCI
74device's state and checks for devices that will prevent the system from entering 78configuration space. This field is used to describe and control the standard
75a global power state. The next walk then places the devices in a low power 79features related to the native PCI power management.
80
81The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses
82(B0-B3). The higher the number, the less power is drawn by the device or bus
83in that state. However, the higher the number, the longer the latency for
84the device or bus to return to the full-power state (D0 or B0, respectively).
85
86There are two variants of the D3 state defined by the specification. The first
87one is D3hot, referred to as the software accessible D3, because devices can be
88programmed to go into it. The second one, D3cold, is the state that PCI devices
89are in when the supply voltage (Vcc) is removed from them. It is not possible
90to program a PCI device to go into D3cold, although there may be a programmable
91interface for putting the bus the device is on into a state in which Vcc is
92removed from all devices on the bus.
93
94PCI bus power management, however, is not supported by the Linux kernel at the
95time of this writing and therefore it is not covered by this document.
96
97Note that every PCI device can be in the full-power state (D0) or in D3cold,
98regardless of whether or not it implements the PCI PM Spec. In addition to
99that, if the PCI PM Spec is implemented by the device, it must support D3hot
100as well as D0. The support for the D1 and D2 power states is optional.
101
102PCI devices supporting the PCI PM Spec can be programmed to go to any of the
103supported low-power states (except for D3cold). While in D1-D3hot the
104standard configuration registers of the device must be accessible to software
105(i.e. the device is required to respond to PCI configuration accesses), although
106its I/O and memory spaces are then disabled. This allows the device to be
107programmatically put into D0. Thus the kernel can switch the device back and
108forth between D0 and the supported low-power states (except for D3cold) and the
109possible power state transitions the device can undergo are the following:
110
111+----------------------------+
112| Current State | New State |
113+----------------------------+
114| D0 | D1, D2, D3 |
115+----------------------------+
116| D1 | D2, D3 |
117+----------------------------+
118| D2 | D3 |
119+----------------------------+
120| D1, D2, D3 | D0 |
121+----------------------------+
122
123The transition from D3cold to D0 occurs when the supply voltage is provided to
124the device (i.e. power is restored). In that case the device returns to D0 with
125a full power-on reset sequence and the power-on defaults are restored to the
126device by hardware just as at initial power up.
127
128PCI devices supporting the PCI PM Spec can be programmed to generate PMEs
129while in a low-power state (D1-D3), but they are not required to be capable
130of generating PMEs from all supported low-power states. In particular, the
131capability of generating PMEs from D3cold is optional and depends on the
132presence of additional voltage (3.3Vaux) allowing the device to remain
133sufficiently active to generate a wakeup signal.
134
1351.3. ACPI Device Power Management
136---------------------------------
137The platform firmware support for the power management of PCI devices is
138system-specific. However, if the system in question is compliant with the
139Advanced Configuration and Power Interface (ACPI) Specification, like the
140majority of x86-based systems, it is supposed to implement device power
141management interfaces defined by the ACPI standard.
142
143For this purpose the ACPI BIOS provides special functions called "control
144methods" that may be executed by the kernel to perform specific tasks, such as
145putting a device into a low-power state. These control methods are encoded
146using special byte-code language called the ACPI Machine Language (AML) and
147stored in the machine's BIOS. The kernel loads them from the BIOS and executes
148them as needed using an AML interpreter that translates the AML byte code into
149computations and memory or I/O space accesses. This way, in theory, a BIOS
150writer can provide the kernel with a means to perform actions depending
151on the system design in a system-specific fashion.
152
153ACPI control methods may be divided into global control methods, that are not
154associated with any particular devices, and device control methods, that have
155to be defined separately for each device supposed to be handled with the help of
156the platform. This means, in particular, that ACPI device control methods can
157only be used to handle devices that the BIOS writer knew about in advance. The
158ACPI methods used for device power management fall into that category.
159
160The ACPI specification assumes that devices can be in one of four power states
161labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM
162D0-D3 states (although the difference between D3hot and D3cold is not taken
163into account by ACPI). Moreover, for each power state of a device there is a
164set of power resources that have to be enabled for the device to be put into
165that state. These power resources are controlled (i.e. enabled or disabled)
166with the help of their own control methods, _ON and _OFF, that have to be
167defined individually for each of them.
168
169To put a device into the ACPI power state Dx (where x is a number between 0 and
1703 inclusive) the kernel is supposed to (1) enable the power resources required
171by the device in this state using their _ON control methods and (2) execute the
172_PSx control method defined for the device. In addition to that, if the device
173is going to be put into a low-power state (D1-D3) and is supposed to generate
174wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI
1753.0) control method defined for it has to be executed before _PSx. Power
176resources that are not required by the device in the target power state and are
177not required any more by any other device should be disabled (by executing their
178_OFF control methods). If the current power state of the device is D3, it can
179only be put into D0 this way.
180
181However, quite often the power states of devices are changed during a
182system-wide transition into a sleep state or back into the working state. ACPI
183defines four system sleep states, S1, S2, S3, and S4, and denotes the system
184working state as S0. In general, the target system sleep (or working) state
185determines the highest power (lowest number) state the device can be put
186into and the kernel is supposed to obtain this information by executing the
187device's _SxD control method (where x is a number between 0 and 4 inclusive).
188If the device is required to wake up the system from the target sleep state, the
189lowest power (highest number) state it can be put into is also determined by the
190target state of the system. The kernel is then supposed to use the device's
191_SxW control method to obtain the number of that state. It also is supposed to
192use the device's _PRW control method to learn which power resources need to be
193enabled for the device to be able to generate wakeup signals.
194
1951.4. Wakeup Signaling
196---------------------
197Wakeup signals generated by PCI devices, either as native PCI PMEs, or as
198a result of the execution of the _DSW (or _PSW) ACPI control method before
199putting the device into a low-power state, have to be caught and handled as
200appropriate. If they are sent while the system is in the working state
201(ACPI S0), they should be translated into interrupts so that the kernel can
202put the devices generating them into the full-power state and take care of the
203events that triggered them. In turn, if they are sent while the system is
204sleeping, they should cause the system's core logic to trigger wakeup.
205
206On ACPI-based systems wakeup signals sent by conventional PCI devices are
207converted into ACPI General-Purpose Events (GPEs) which are hardware signals
208from the system core logic generated in response to various events that need to
209be acted upon. Every GPE is associated with one or more sources of potentially
210interesting events. In particular, a GPE may be associated with a PCI device
211capable of signaling wakeup. The information on the connections between GPEs
212and event sources is recorded in the system's ACPI BIOS from where it can be
213read by the kernel.
214
215If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE
216associated with it (if there is one) is triggered. The GPEs associated with PCI
217bridges may also be triggered in response to a wakeup signal from one of the
218devices below the bridge (this also is the case for root bridges) and, for
219example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be
220handled this way.
221
222A GPE may be triggered when the system is sleeping (i.e. when it is in one of
223the ACPI S1-S4 states), in which case system wakeup is started by its core logic
224(the device that was the source of the signal causing the system wakeup to occur
225may be identified later). The GPEs used in such situations are referred to as
226wakeup GPEs.
227
228Usually, however, GPEs are also triggered when the system is in the working
229state (ACPI S0) and in that case the system's core logic generates a System
230Control Interrupt (SCI) to notify the kernel of the event. Then, the SCI
231handler identifies the GPE that caused the interrupt to be generated which,
232in turn, allows the kernel to identify the source of the event (that may be
233a PCI device signaling wakeup). The GPEs used for notifying the kernel of
234events occurring while the system is in the working state are referred to as
235runtime GPEs.
236
237Unfortunately, there is no standard way of handling wakeup signals sent by
238conventional PCI devices on systems that are not ACPI-based, but there is one
239for PCI Express devices. Namely, the PCI Express Base Specification introduced
240a native mechanism for converting native PCI PMEs into interrupts generated by
241root ports. For conventional PCI devices native PMEs are out-of-band, so they
242are routed separately and they need not pass through bridges (in principle they
243may be routed directly to the system's core logic), but for PCI Express devices
244they are in-band messages that have to pass through the PCI Express hierarchy,
245including the root port on the path from the device to the Root Complex. Thus
246it was possible to introduce a mechanism by which a root port generates an
247interrupt whenever it receives a PME message from one of the devices below it.
248The PCI Express Requester ID of the device that sent the PME message is then
249recorded in one of the root port's configuration registers from where it may be
250read by the interrupt handler allowing the device to be identified. [PME
251messages sent by PCI Express endpoints integrated with the Root Complex don't
252pass through root ports, but instead they cause a Root Complex Event Collector
253(if there is one) to generate interrupts.]
254
255In principle the native PCI Express PME signaling may also be used on ACPI-based
256systems along with the GPEs, but to use it the kernel has to ask the system's
257ACPI BIOS to release control of root port configuration registers. The ACPI
258BIOS, however, is not required to allow the kernel to control these registers
259and if it doesn't do that, the kernel must not modify their contents. Of course
260the native PCI Express PME signaling cannot be used by the kernel in that case.
261
262
2632. PCI Subsystem and Device Power Management
264============================================
265
2662.1. Device Power Management Callbacks
267--------------------------------------
268The PCI Subsystem participates in the power management of PCI devices in a
269number of ways. First of all, it provides an intermediate code layer between
270the device power management core (PM core) and PCI device drivers.
271Specifically, the pm field of the PCI subsystem's struct bus_type object,
272pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing
273pointers to several device power management callbacks:
274
275const struct dev_pm_ops pci_dev_pm_ops = {
276 .prepare = pci_pm_prepare,
277 .complete = pci_pm_complete,
278 .suspend = pci_pm_suspend,
279 .resume = pci_pm_resume,
280 .freeze = pci_pm_freeze,
281 .thaw = pci_pm_thaw,
282 .poweroff = pci_pm_poweroff,
283 .restore = pci_pm_restore,
284 .suspend_noirq = pci_pm_suspend_noirq,
285 .resume_noirq = pci_pm_resume_noirq,
286 .freeze_noirq = pci_pm_freeze_noirq,
287 .thaw_noirq = pci_pm_thaw_noirq,
288 .poweroff_noirq = pci_pm_poweroff_noirq,
289 .restore_noirq = pci_pm_restore_noirq,
290 .runtime_suspend = pci_pm_runtime_suspend,
291 .runtime_resume = pci_pm_runtime_resume,
292 .runtime_idle = pci_pm_runtime_idle,
293};
294
295These callbacks are executed by the PM core in various situations related to
296device power management and they, in turn, execute power management callbacks
297provided by PCI device drivers. They also perform power management operations
298involving some standard configuration registers of PCI devices that device
299drivers need not know or care about.
300
301The structure representing a PCI device, struct pci_dev, contains several fields
302that these callbacks operate on:
303
304struct pci_dev {
305 ...
306 pci_power_t current_state; /* Current operating state. */
307 int pm_cap; /* PM capability offset in the
308 configuration space */
309 unsigned int pme_support:5; /* Bitmask of states from which PME#
310 can be generated */
311 unsigned int pme_interrupt:1;/* Is native PCIe PME signaling used? */
312 unsigned int d1_support:1; /* Low power state D1 is supported */
313 unsigned int d2_support:1; /* Low power state D2 is supported */
314 unsigned int no_d1d2:1; /* D1 and D2 are forbidden */
315 unsigned int wakeup_prepared:1; /* Device prepared for wake up */
316 unsigned int d3_delay; /* D3->D0 transition time in ms */
317 ...
318};
319
320They also indirectly use some fields of the struct device that is embedded in
321struct pci_dev.
322
3232.2. Device Initialization
324--------------------------
325The PCI subsystem's first task related to device power management is to
326prepare the device for power management and initialize the fields of struct
327pci_dev used for this purpose. This happens in two functions defined in
328drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init().
329
330The first of these functions checks if the device supports native PCI PM
331and if that's the case the offset of its power management capability structure
332in the configuration space is stored in the pm_cap field of the device's struct
333pci_dev object. Next, the function checks which PCI low-power states are
334supported by the device and from which low-power states the device can generate
335native PCI PMEs. The power management fields of the device's struct pci_dev and
336the struct device embedded in it are updated accordingly and the generation of
337PMEs by the device is disabled.
338
339The second function checks if the device can be prepared to signal wakeup with
340the help of the platform firmware, such as the ACPI BIOS. If that is the case,
341the function updates the wakeup fields in struct device embedded in the
342device's struct pci_dev and uses the firmware-provided method to prevent the
343device from signaling wakeup.
344
345At this point the device is ready for power management. For driverless devices,
346however, this functionality is limited to a few basic operations carried out
347during system-wide transitions to a sleep state and back to the working state.
348
3492.3. Runtime Device Power Management
350------------------------------------
351The PCI subsystem plays a vital role in the runtime power management of PCI
352devices. For this purpose it uses the general runtime power management
353(runtime PM) framework described in Documentation/power/runtime_pm.txt.
354Namely, it provides subsystem-level callbacks:
355
356 pci_pm_runtime_suspend()
357 pci_pm_runtime_resume()
358 pci_pm_runtime_idle()
359
360that are executed by the core runtime PM routines. It also implements the
361entire mechanics necessary for handling runtime wakeup signals from PCI devices
362in low-power states, which at the time of this writing works for both the native
363PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in
364Section 1.
365
366First, a PCI device is put into a low-power state, or suspended, with the help
367of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call
368pci_pm_runtime_suspend() to do the actual job. For this to work, the device's
369driver has to provide a pm->runtime_suspend() callback (see below), which is
370run by pci_pm_runtime_suspend() as the first action. If the driver's callback
371returns successfully, the device's standard configuration registers are saved,
372the device is prepared to generate wakeup signals and, finally, it is put into
373the target low-power state.
374
375The low-power state to put the device into is the lowest-power (highest number)
376state from which it can signal wakeup. The exact method of signaling wakeup is
377system-dependent and is determined by the PCI subsystem on the basis of the
378reported capabilities of the device and the platform firmware. To prepare the
379device for signaling wakeup and put it into the selected low-power state, the
380PCI subsystem can use the platform firmware as well as the device's native PCI
381PM capabilities, if supported.
382
383It is expected that the device driver's pm->runtime_suspend() callback will
384not attempt to prepare the device for signaling wakeup or to put it into a
385low-power state. The driver ought to leave these tasks to the PCI subsystem
386that has all of the information necessary to perform them.
387
388A suspended device is brought back into the "active" state, or resumed,
389with the help of pm_request_resume() or pm_runtime_resume() which both call
390pci_pm_runtime_resume() for PCI devices. Again, this only works if the device's
391driver provides a pm->runtime_resume() callback (see below). However, before
392the driver's callback is executed, pci_pm_runtime_resume() brings the device
393back into the full-power state, prevents it from signaling wakeup while in that
394state and restores its standard configuration registers. Thus the driver's
395callback need not worry about the PCI-specific aspects of the device resume.
396
397Note that generally pci_pm_runtime_resume() may be called in two different
398situations. First, it may be called at the request of the device's driver, for
399example if there are some data for it to process. Second, it may be called
400as a result of a wakeup signal from the device itself (this sometimes is
401referred to as "remote wakeup"). Of course, for this purpose the wakeup signal
402is handled in one of the ways described in Section 1 and finally converted into
403a notification for the PCI subsystem after the source device has been
404identified.
405
406The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle()
407and pm_request_idle(), executes the device driver's pm->runtime_idle()
408callback, if defined, and if that callback doesn't return error code (or is not
409present at all), suspends the device with the help of pm_runtime_suspend().
410Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for
411example, it is called right after the device has just been resumed), in which
412cases it is expected to suspend the device if that makes sense. Usually,
413however, the PCI subsystem doesn't really know if the device really can be
414suspended, so it lets the device's driver decide by running its
415pm->runtime_idle() callback.
416
4172.4. System-Wide Power Transitions
418----------------------------------
419There are a few different types of system-wide power transitions, described in
420Documentation/power/devices.txt. Each of them requires devices to be handled
421in a specific way and the PM core executes subsystem-level power management
422callbacks for this purpose. They are executed in phases such that each phase
423involves executing the same subsystem-level callback for every device belonging
424to the given subsystem before the next phase begins. These phases always run
425after tasks have been frozen.
426
4272.4.1. System Suspend
428
429When the system is going into a sleep state in which the contents of memory will
430be preserved, such as one of the ACPI sleep states S1-S3, the phases are:
431
432 prepare, suspend, suspend_noirq.
433
434The following PCI bus type's callbacks, respectively, are used in these phases:
435
436 pci_pm_prepare()
437 pci_pm_suspend()
438 pci_pm_suspend_noirq()
439
440The pci_pm_prepare() routine first puts the device into the "fully functional"
441state with the help of pm_runtime_resume(). Then, it executes the device
442driver's pm->prepare() callback if defined (i.e. if the driver's struct
443dev_pm_ops object is present and the prepare pointer in that object is valid).
444
445The pci_pm_suspend() routine first checks if the device's driver implements
446legacy PCI suspend routines (see Section 3), in which case the driver's legacy
447suspend callback is executed, if present, and its result is returned. Next, if
448the device's driver doesn't provide a struct dev_pm_ops object (containing
449pointers to the driver's callbacks), pci_pm_default_suspend() is called, which
450simply turns off the device's bus master capability and runs
451pcibios_disable_device() to disable it, unless the device is a bridge (PCI
452bridges are ignored by this routine). Next, the device driver's pm->suspend()
453callback is executed, if defined, and its result is returned if it fails.
454Finally, pci_fixup_device() is called to apply hardware suspend quirks related
455to the device if necessary.
456
457Note that the suspend phase is carried out asynchronously for PCI devices, so
458the pci_pm_suspend() callback may be executed in parallel for any pair of PCI
459devices that don't depend on each other in a known way (i.e. none of the paths
460in the device tree from the root bridge to a leaf device contains both of them).
461
462The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has
463been called, which means that the device driver's interrupt handler won't be
464invoked while this routine is running. It first checks if the device's driver
465implements legacy PCI suspends routines (Section 3), in which case the legacy
466late suspend routine is called and its result is returned (the standard
467configuration registers of the device are saved if the driver's callback hasn't
468done that). Second, if the device driver's struct dev_pm_ops object is not
469present, the device's standard configuration registers are saved and the routine
470returns success. Otherwise the device driver's pm->suspend_noirq() callback is
471executed, if present, and its result is returned if it fails. Next, if the
472device's standard configuration registers haven't been saved yet (one of the
473device driver's callbacks executed before might do that), pci_pm_suspend_noirq()
474saves them, prepares the device to signal wakeup (if necessary) and puts it into
475a low-power state.
476
477The low-power state to put the device into is the lowest-power (highest number)
478state from which it can signal wakeup while the system is in the target sleep
479state. Just like in the runtime PM case described above, the mechanism of
480signaling wakeup is system-dependent and determined by the PCI subsystem, which
481is also responsible for preparing the device to signal wakeup from the system's
482target sleep state as appropriate.
483
484PCI device drivers (that don't implement legacy power management callbacks) are
485generally not expected to prepare devices for signaling wakeup or to put them
486into low-power states. However, if one of the driver's suspend callbacks
487(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration
488registers, pci_pm_suspend_noirq() will assume that the device has been prepared
489to signal wakeup and put into a low-power state by the driver (the driver is
490then assumed to have used the helper functions provided by the PCI subsystem for
491this purpose). PCI device drivers are not encouraged to do that, but in some
492rare cases doing that in the driver may be the optimum approach.
493
4942.4.2. System Resume
495
496When the system is undergoing a transition from a sleep state in which the
497contents of memory have been preserved, such as one of the ACPI sleep states
498S1-S3, into the working state (ACPI S0), the phases are:
499
500 resume_noirq, resume, complete.
501
502The following PCI bus type's callbacks, respectively, are executed in these
503phases:
504
505 pci_pm_resume_noirq()
506 pci_pm_resume()
507 pci_pm_complete()
508
509The pci_pm_resume_noirq() routine first puts the device into the full-power
510state, restores its standard configuration registers and applies early resume
511hardware quirks related to the device, if necessary. This is done
512unconditionally, regardless of whether or not the device's driver implements
513legacy PCI power management callbacks (this way all PCI devices are in the
514full-power state and their standard configuration registers have been restored
515when their interrupt handlers are invoked for the first time during resume,
516which allows the kernel to avoid problems with the handling of shared interrupts
517by drivers whose devices are still suspended). If legacy PCI power management
518callbacks (see Section 3) are implemented by the device's driver, the legacy
519early resume callback is executed and its result is returned. Otherwise, the
520device driver's pm->resume_noirq() callback is executed, if defined, and its
521result is returned.
522
523The pci_pm_resume() routine first checks if the device's standard configuration
524registers have been restored and restores them if that's not the case (this
525only is necessary in the error path during a failing suspend). Next, resume
526hardware quirks related to the device are applied, if necessary, and if the
527device's driver implements legacy PCI power management callbacks (see
528Section 3), the driver's legacy resume callback is executed and its result is
529returned. Otherwise, the device's wakeup signaling mechanisms are blocked and
530its driver's pm->resume() callback is executed, if defined (the callback's
531result is then returned).
532
533The resume phase is carried out asynchronously for PCI devices, like the
534suspend phase described above, which means that if two PCI devices don't depend
535on each other in a known way, the pci_pm_resume() routine may be executed for
536the both of them in parallel.
537
538The pci_pm_complete() routine only executes the device driver's pm->complete()
539callback, if defined.
540
5412.4.3. System Hibernation
542
543System hibernation is more complicated than system suspend, because it requires
544a system image to be created and written into a persistent storage medium. The
545image is created atomically and all devices are quiesced, or frozen, before that
546happens.
547
548The freezing of devices is carried out after enough memory has been freed (at
549the time of this writing the image creation requires at least 50% of system RAM
550to be free) in the following three phases:
551
552 prepare, freeze, freeze_noirq
553
554that correspond to the PCI bus type's callbacks:
555
556 pci_pm_prepare()
557 pci_pm_freeze()
558 pci_pm_freeze_noirq()
559
560This means that the prepare phase is exactly the same as for system suspend.
561The other two phases, however, are different.
562
563The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs
564the device driver's pm->freeze() callback, if defined, instead of pm->suspend(),
565and it doesn't apply the suspend-related hardware quirks. It is executed
566asynchronously for different PCI devices that don't depend on each other in a
567known way.
568
569The pci_pm_freeze_noirq() routine, in turn, is similar to
570pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq()
571routine instead of pm->suspend_noirq(). It also doesn't attempt to prepare the
572device for signaling wakeup and put it into a low-power state. Still, it saves
573the device's standard configuration registers if they haven't been saved by one
574of the driver's callbacks.
575
576Once the image has been created, it has to be saved. However, at this point all
577devices are frozen and they cannot handle I/O, while their ability to handle
578I/O is obviously necessary for the image saving. Thus they have to be brought
579back to the fully functional state and this is done in the following phases:
580
581 thaw_noirq, thaw, complete
582
583using the following PCI bus type's callbacks:
584
585 pci_pm_thaw_noirq()
586 pci_pm_thaw()
587 pci_pm_complete()
588
589respectively.
590
591The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(),
592but it doesn't put the device into the full power state and doesn't attempt to
593restore its standard configuration registers. It also executes the device
594driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq().
595
596The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device
597driver's pm->thaw() callback instead of pm->resume(). It is executed
598asynchronously for different PCI devices that don't depend on each other in a
599known way.
600
601The complete phase it the same as for system resume.
602
603After saving the image, devices need to be powered down before the system can
604enter the target sleep state (ACPI S4 for ACPI-based systems). This is done in
605three phases:
606
607 prepare, poweroff, poweroff_noirq
608
609where the prepare phase is exactly the same as for system suspend. The other
610two phases are analogous to the suspend and suspend_noirq phases, respectively.
611The PCI subsystem-level callbacks they correspond to
612
613 pci_pm_poweroff()
614 pci_pm_poweroff_noirq()
615
616work in analogy with pci_pm_suspend() and pci_pm_poweroff_noirq(), respectively,
617although they don't attempt to save the device's standard configuration
618registers.
619
6202.4.4. System Restore
621
622System restore requires a hibernation image to be loaded into memory and the
623pre-hibernation memory contents to be restored before the pre-hibernation system
624activity can be resumed.
625
626As described in Documentation/power/devices.txt, the hibernation image is loaded
627into memory by a fresh instance of the kernel, called the boot kernel, which in
628turn is loaded and run by a boot loader in the usual way. After the boot kernel
629has loaded the image, it needs to replace its own code and data with the code
630and data of the "hibernated" kernel stored within the image, called the image
631kernel. For this purpose all devices are frozen just like before creating
632the image during hibernation, in the
633
634 prepare, freeze, freeze_noirq
635
636phases described above. However, the devices affected by these phases are only
637those having drivers in the boot kernel; other devices will still be in whatever
638state the boot loader left them.
639
640Should the restoration of the pre-hibernation memory contents fail, the boot
641kernel would go through the "thawing" procedure described above, using the
642thaw_noirq, thaw, and complete phases (that will only affect the devices having
643drivers in the boot kernel), and then continue running normally.
644
645If the pre-hibernation memory contents are restored successfully, which is the
646usual situation, control is passed to the image kernel, which then becomes
647responsible for bringing the system back to the working state. To achieve this,
648it must restore the devices' pre-hibernation functionality, which is done much
649like waking up from the memory sleep state, although it involves different
650phases:
651
652 restore_noirq, restore, complete
653
654The first two of these are analogous to the resume_noirq and resume phases
655described above, respectively, and correspond to the following PCI subsystem
656callbacks:
657
658 pci_pm_restore_noirq()
659 pci_pm_restore()
660
661These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(),
662respectively, but they execute the device driver's pm->restore_noirq() and
663pm->restore() callbacks, if available.
664
665The complete phase is carried out in exactly the same way as during system
666resume.
667
668
6693. PCI Device Drivers and Power Management
670==========================================
671
6723.1. Power Management Callbacks
673-------------------------------
674PCI device drivers participate in power management by providing callbacks to be
675executed by the PCI subsystem's power management routines described above and by
676controlling the runtime power management of their devices.
677
678At the time of this writing there are two ways to define power management
679callbacks for a PCI device driver, the recommended one, based on using a
680dev_pm_ops structure described in Documentation/power/devices.txt, and the
681"legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and
682.resume() callbacks from struct pci_driver are used. The legacy approach,
683however, doesn't allow one to define runtime power management callbacks and is
684not really suitable for any new drivers. Therefore it is not covered by this
685document (refer to the source code to learn more about it).
686
687It is recommended that all PCI device drivers define a struct dev_pm_ops object
688containing pointers to power management (PM) callbacks that will be executed by
689the PCI subsystem's PM routines in various circumstances. A pointer to the
690driver's struct dev_pm_ops object has to be assigned to the driver.pm field in
691its struct pci_driver object. Once that has happened, the "legacy" PM callbacks
692in struct pci_driver are ignored (even if they are not NULL).
693
694The PM callbacks in struct dev_pm_ops are not mandatory and if they are not
695defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI
696subsystem will handle the device in a simplified default manner. If they are
697defined, though, they are expected to behave as described in the following
698subsections.
699
7003.1.1. prepare()
701
702The prepare() callback is executed during system suspend, during hibernation
703(when a hibernation image is about to be created), during power-off after
704saving a hibernation image and during system restore, when a hibernation image
705has just been loaded into memory.
706
707This callback is only necessary if the driver's device has children that in
708general may be registered at any time. In that case the role of the prepare()
709callback is to prevent new children of the device from being registered until
710one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run.
711
712In addition to that the prepare() callback may carry out some operations
713preparing the device to be suspended, although it should not allocate memory
714(if additional memory is required to suspend the device, it has to be
715preallocated earlier, for example in a suspend/hibernate notifier as described
716in Documentation/power/notifiers.txt).
717
7183.1.2. suspend()
719
720The suspend() callback is only executed during system suspend, after prepare()
721callbacks have been executed for all devices in the system.
722
723This callback is expected to quiesce the device and prepare it to be put into a
724low-power state by the PCI subsystem. It is not required (in fact it even is
725not recommended) that a PCI driver's suspend() callback save the standard
726configuration registers of the device, prepare it for waking up the system, or
727put it into a low-power state. All of these operations can very well be taken
728care of by the PCI subsystem, without the driver's participation.
729
730However, in some rare case it is convenient to carry out these operations in
731a PCI driver. Then, pci_save_state(), pci_prepare_to_sleep(), and
732pci_set_power_state() should be used to save the device's standard configuration
733registers, to prepare it for system wakeup (if necessary), and to put it into a
734low-power state, respectively. Moreover, if the driver calls pci_save_state(),
735the PCI subsystem will not execute either pci_prepare_to_sleep(), or
736pci_set_power_state() for its device, so the driver is then responsible for
737handling the device as appropriate.
738
739While the suspend() callback is being executed, the driver's interrupt handler
740can be invoked to handle an interrupt from the device, so all suspend-related
741operations relying on the driver's ability to handle interrupts should be
742carried out in this callback.
743
7443.1.3. suspend_noirq()
745
746The suspend_noirq() callback is only executed during system suspend, after
747suspend() callbacks have been executed for all devices in the system and
748after device interrupts have been disabled by the PM core.
749
750The difference between suspend_noirq() and suspend() is that the driver's
751interrupt handler will not be invoked while suspend_noirq() is running. Thus
752suspend_noirq() can carry out operations that would cause race conditions to
753arise if they were performed in suspend().
754
7553.1.4. freeze()
756
757The freeze() callback is hibernation-specific and is executed in two situations,
758during hibernation, after prepare() callbacks have been executed for all devices
759in preparation for the creation of a system image, and during restore,
760after a system image has been loaded into memory from persistent storage and the
761prepare() callbacks have been executed for all devices.
762
763The role of this callback is analogous to the role of the suspend() callback
764described above. In fact, they only need to be different in the rare cases when
765the driver takes the responsibility for putting the device into a low-power
76state. 766state.
77 767
78The first walk allows a graceful recovery in the event of a failure, since none 768In that cases the freeze() callback should not prepare the device system wakeup
79of the devices have actually been powered down. 769or put it into a low-power state. Still, either it or freeze_noirq() should
80 770save the device's standard configuration registers using pci_save_state().
81In both walks, in particular the second, all children of a bridge are touched
82before the actual bridge itself. This allows the bridge to retain power while
83its children are being accessed.
84
85Upon resuming from sleep, just the opposite must be true: all bridges must be
86powered on and restored before their children are powered on. This is easily
87accomplished with a breadth-first walk of the PCI device tree.
88
89
903. PCI Utility Functions
91~~~~~~~~~~~~~~~~~~~~~~~~
92
93These are helper functions designed to be called by individual device drivers.
94Assuming that a device behaves as advertised, these should be applicable in most
95cases. However, results may vary.
96
97Note that these functions are never implicitly called for the driver. The driver
98is always responsible for deciding when and if to call these.
99
100
101pci_save_state
102--------------
103
104Usage:
105 pci_save_state(struct pci_dev *dev);
106
107Description:
108 Save first 64 bytes of PCI config space, along with any additional
109 PCI-Express or PCI-X information.
110
111
112pci_restore_state
113-----------------
114
115Usage:
116 pci_restore_state(struct pci_dev *dev);
117
118Description:
119 Restore previously saved config space.
120
121
122pci_set_power_state
123-------------------
124
125Usage:
126 pci_set_power_state(struct pci_dev *dev, pci_power_t state);
127
128Description:
129 Transition device to low power state using PCI PM Capabilities
130 registers.
131
132 Will fail under one of the following conditions:
133 - If state is less than current state, but not D0 (illegal transition)
134 - Device doesn't support PM Capabilities
135 - Device does not support requested state
136
137
138pci_enable_wake
139---------------
140
141Usage:
142 pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable);
143
144Description:
145 Enable device to generate PME# during low power state using PCI PM
146 Capabilities.
147
148 Checks whether if device supports generating PME# from requested state
149 and fail if it does not, unless enable == 0 (request is to disable wake
150 events, which is implicit if it doesn't even support it in the first
151 place).
152
153 Note that the PMC Register in the device's PM Capabilities has a bitmask
154 of the states it supports generating PME# from. D3hot is bit 3 and
155 D3cold is bit 4. So, while a value of 4 as the state may not seem
156 semantically correct, it is.
157
158
1594. PCI Device Drivers
160~~~~~~~~~~~~~~~~~~~~~
161
162These functions are intended for use by individual drivers, and are defined in
163struct pci_driver:
164
165 int (*suspend) (struct pci_dev *dev, pm_message_t state);
166 int (*resume) (struct pci_dev *dev);
167
168
169suspend
170-------
171
172Usage:
173
174if (dev->driver && dev->driver->suspend)
175 dev->driver->suspend(dev,state);
176
177A driver uses this function to actually transition the device into a low power
178state. This should include disabling I/O, IRQs, and bus-mastering, as well as
179physically transitioning the device to a lower power state; it may also include
180calls to pci_enable_wake().
181
182Bus mastering may be disabled by doing:
183
184pci_disable_device(dev);
185
186For devices that support the PCI PM Spec, this may be used to set the device's
187power state to match the suspend() parameter:
188
189pci_set_power_state(dev,state);
190
191The driver is also responsible for disabling any other device-specific features
192(e.g blanking screen, turning off on-card memory, etc).
193
194The driver should be sure to track the current state of the device, as it may
195obviate the need for some operations.
196
197The driver should update the current_state field in its pci_dev structure in
198this function, except for PM-capable devices when pci_set_power_state is used.
199
200resume
201------
202
203Usage:
204
205if (dev->driver && dev->driver->resume)
206 dev->driver->resume(dev)
207 771
208The resume callback may be called from any power state, and is always meant to 7723.1.5. freeze_noirq()
209transition the device to the D0 state.
210 773
211The driver is responsible for reenabling any features of the device that had 774The freeze_noirq() callback is hibernation-specific. It is executed during
212been disabled during previous suspend calls, such as IRQs and bus mastering, 775hibernation, after prepare() and freeze() callbacks have been executed for all
213as well as calling pci_restore_state(). 776devices in preparation for the creation of a system image, and during restore,
777after a system image has been loaded into memory and after prepare() and
778freeze() callbacks have been executed for all devices. It is always executed
779after device interrupts have been disabled by the PM core.
214 780
215If the device is currently in D3, it may need to be reinitialized in resume(). 781The role of this callback is analogous to the role of the suspend_noirq()
782callback described above and it very rarely is necessary to define
783freeze_noirq().
216 784
217 * Some types of devices, like bus controllers, will preserve context in D3hot 785The difference between freeze_noirq() and freeze() is analogous to the
218 (using Vcc power). Their drivers will often want to avoid re-initializing 786difference between suspend_noirq() and suspend().
219 them after re-entering D0 (perhaps to avoid resetting downstream devices).
220 787
221 * Other kinds of devices in D3hot will discard device context as part of a 7883.1.6. poweroff()
222 soft reset when re-entering the D0 state.
223
224 * Devices resuming from D3cold always go through a power-on reset. Some
225 device context can also be preserved using Vaux power.
226 789
227 * Some systems hide D3cold resume paths from drivers. For example, on PCs 790The poweroff() callback is hibernation-specific. It is executed when the system
228 the resume path for suspend-to-disk often runs BIOS powerup code, which 791is about to be powered off after saving a hibernation image to a persistent
229 will sometimes re-initialize the device. 792storage. prepare() callbacks are executed for all devices before poweroff() is
793called.
230 794
231To handle resets during D3 to D0 transitions, it may be convenient to share 795The role of this callback is analogous to the role of the suspend() and freeze()
232device initialization code between probe() and resume(). Device parameters 796callbacks described above, although it does not need to save the contents of
233can also be saved before the driver suspends into D3, avoiding re-probe. 797the device's registers. In particular, if the driver wants to put the device
798into a low-power state itself instead of allowing the PCI subsystem to do that,
799the poweroff() callback should use pci_prepare_to_sleep() and
800pci_set_power_state() to prepare the device for system wakeup and to put it
801into a low-power state, respectively, but it need not save the device's standard
802configuration registers.
234 803
235If the device supports the PCI PM Spec, it can use this to physically transition 8043.1.7. poweroff_noirq()
236the device to D0:
237 805
238pci_set_power_state(dev,0); 806The poweroff_noirq() callback is hibernation-specific. It is executed after
807poweroff() callbacks have been executed for all devices in the system.
239 808
240Note that if the entire system is transitioning out of a global sleep state, all 809The role of this callback is analogous to the role of the suspend_noirq() and
241devices will be placed in the D0 state, so this is not necessary. However, in 810freeze_noirq() callbacks described above, but it does not need to save the
242the event that the device is placed in the D3 state during normal operation, 811contents of the device's registers.
243this call is necessary. It is impossible to determine which of the two events is
244taking place in the driver, so it is always a good idea to make that call.
245 812
246The driver should take note of the state that it is resuming from in order to 813The difference between poweroff_noirq() and poweroff() is analogous to the
247ensure correct (and speedy) operation. 814difference between suspend_noirq() and suspend().
248 815
249The driver should update the current_state field in its pci_dev structure in 8163.1.8. resume_noirq()
250this function, except for PM-capable devices when pci_set_power_state is used.
251 817
818The resume_noirq() callback is only executed during system resume, after the
819PM core has enabled the non-boot CPUs. The driver's interrupt handler will not
820be invoked while resume_noirq() is running, so this callback can carry out
821operations that might race with the interrupt handler.
252 822
823Since the PCI subsystem unconditionally puts all devices into the full power
824state in the resume_noirq phase of system resume and restores their standard
825configuration registers, resume_noirq() is usually not necessary. In general
826it should only be used for performing operations that would lead to race
827conditions if carried out by resume().
253 828
254A reference implementation 8293.1.9. resume()
255-------------------------
256.suspend()
257{
258 /* driver specific operations */
259 830
260 /* Disable IRQ */ 831The resume() callback is only executed during system resume, after
261 free_irq(); 832resume_noirq() callbacks have been executed for all devices in the system and
262 /* If using MSI */ 833device interrupts have been enabled by the PM core.
263 pci_disable_msi();
264 834
265 pci_save_state(); 835This callback is responsible for restoring the pre-suspend configuration of the
266 pci_enable_wake(); 836device and bringing it back to the fully functional state. The device should be
267 /* Disable IO/bus master/irq router */ 837able to process I/O in a usual way after resume() has returned.
268 pci_disable_device();
269 pci_set_power_state(pci_choose_state());
270}
271 838
272.resume() 8393.1.10. thaw_noirq()
273{
274 pci_set_power_state(PCI_D0);
275 pci_restore_state();
276 /* device's irq possibly is changed, driver should take care */
277 pci_enable_device();
278 pci_set_master();
279 840
280 /* if using MSI, device's vector possibly is changed */ 841The thaw_noirq() callback is hibernation-specific. It is executed after a
281 pci_enable_msi(); 842system image has been created and the non-boot CPUs have been enabled by the PM
843core, in the thaw_noirq phase of hibernation. It also may be executed if the
844loading of a hibernation image fails during system restore (it is then executed
845after enabling the non-boot CPUs). The driver's interrupt handler will not be
846invoked while thaw_noirq() is running.
282 847
283 request_irq(); 848The role of this callback is analogous to the role of resume_noirq(). The
284 /* driver specific operations; */ 849difference between these two callbacks is that thaw_noirq() is executed after
285} 850freeze() and freeze_noirq(), so in general it does not need to modify the
851contents of the device's registers.
286 852
287This is a typical implementation. Drivers can slightly change the order 8533.1.11. thaw()
288of the operations in the implementation, ignore some operations or add
289more driver specific operations in it, but drivers should do something like
290this on the whole.
291 854
2925. Resources 855The thaw() callback is hibernation-specific. It is executed after thaw_noirq()
293~~~~~~~~~~~~ 856callbacks have been executed for all devices in the system and after device
857interrupts have been enabled by the PM core.
294 858
295PCI Local Bus Specification 859This callback is responsible for restoring the pre-freeze configuration of
296PCI Bus Power Management Interface Specification 860the device, so that it will work in a usual way after thaw() has returned.
297 861
298 http://www.pcisig.com 8623.1.12. restore_noirq()
299 863
864The restore_noirq() callback is hibernation-specific. It is executed in the
865restore_noirq phase of hibernation, when the boot kernel has passed control to
866the image kernel and the non-boot CPUs have been enabled by the image kernel's
867PM core.
868
869This callback is analogous to resume_noirq() with the exception that it cannot
870make any assumption on the previous state of the device, even if the BIOS (or
871generally the platform firmware) is known to preserve that state over a
872suspend-resume cycle.
873
874For the vast majority of PCI device drivers there is no difference between
875resume_noirq() and restore_noirq().
876
8773.1.13. restore()
878
879The restore() callback is hibernation-specific. It is executed after
880restore_noirq() callbacks have been executed for all devices in the system and
881after the PM core has enabled device drivers' interrupt handlers to be invoked.
882
883This callback is analogous to resume(), just like restore_noirq() is analogous
884to resume_noirq(). Consequently, the difference between restore_noirq() and
885restore() is analogous to the difference between resume_noirq() and resume().
886
887For the vast majority of PCI device drivers there is no difference between
888resume() and restore().
889
8903.1.14. complete()
891
892The complete() callback is executed in the following situations:
893 - during system resume, after resume() callbacks have been executed for all
894 devices,
895 - during hibernation, before saving the system image, after thaw() callbacks
896 have been executed for all devices,
897 - during system restore, when the system is going back to its pre-hibernation
898 state, after restore() callbacks have been executed for all devices.
899It also may be executed if the loading of a hibernation image into memory fails
900(in that case it is run after thaw() callbacks have been executed for all
901devices that have drivers in the boot kernel).
902
903This callback is entirely optional, although it may be necessary if the
904prepare() callback performs operations that need to be reversed.
905
9063.1.15. runtime_suspend()
907
908The runtime_suspend() callback is specific to device runtime power management
909(runtime PM). It is executed by the PM core's runtime PM framework when the
910device is about to be suspended (i.e. quiesced and put into a low-power state)
911at run time.
912
913This callback is responsible for freezing the device and preparing it to be
914put into a low-power state, but it must allow the PCI subsystem to perform all
915of the PCI-specific actions necessary for suspending the device.
916
9173.1.16. runtime_resume()
918
919The runtime_resume() callback is specific to device runtime PM. It is executed
920by the PM core's runtime PM framework when the device is about to be resumed
921(i.e. put into the full-power state and programmed to process I/O normally) at
922run time.
923
924This callback is responsible for restoring the normal functionality of the
925device after it has been put into the full-power state by the PCI subsystem.
926The device is expected to be able to process I/O in the usual way after
927runtime_resume() has returned.
928
9293.1.17. runtime_idle()
930
931The runtime_idle() callback is specific to device runtime PM. It is executed
932by the PM core's runtime PM framework whenever it may be desirable to suspend
933the device according to the PM core's information. In particular, it is
934automatically executed right after runtime_resume() has returned in case the
935resume of the device has happened as a result of a spurious event.
936
937This callback is optional, but if it is not implemented or if it returns 0, the
938PCI subsystem will call pm_runtime_suspend() for the device, which in turn will
939cause the driver's runtime_suspend() callback to be executed.
940
9413.1.18. Pointing Multiple Callback Pointers to One Routine
942
943Although in principle each of the callbacks described in the previous
944subsections can be defined as a separate function, it often is convenient to
945point two or more members of struct dev_pm_ops to the same routine. There are
946a few convenience macros that can be used for this purpose.
947
948The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one
949suspend routine pointed to by the .suspend(), .freeze(), and .poweroff()
950members and one resume routine pointed to by the .resume(), .thaw(), and
951.restore() members. The other function pointers in this struct dev_pm_ops are
952unset.
953
954The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it
955additionally sets the .runtime_resume() pointer to the same value as
956.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to
957the same value as .suspend() (and .freeze() and .poweroff()).
958
959The SET_SYSTEM_SLEEP_PM_OPS can be used inside of a declaration of struct
960dev_pm_ops to indicate that one suspend routine is to be pointed to by the
961.suspend(), .freeze(), and .poweroff() members and one resume routine is to
962be pointed to by the .resume(), .thaw(), and .restore() members.
963
9643.2. Device Runtime Power Management
965------------------------------------
966In addition to providing device power management callbacks PCI device drivers
967are responsible for controlling the runtime power management (runtime PM) of
968their devices.
969
970The PCI device runtime PM is optional, but it is recommended that PCI device
971drivers implement it at least in the cases where there is a reliable way of
972verifying that the device is not used (like when the network cable is detached
973from an Ethernet adapter or there are no devices attached to a USB controller).
974
975To support the PCI runtime PM the driver first needs to implement the
976runtime_suspend() and runtime_resume() callbacks. It also may need to implement
977the runtime_idle() callback to prevent the device from being suspended again
978every time right after the runtime_resume() callback has returned
979(alternatively, the runtime_suspend() callback will have to check if the
980device should really be suspended and return -EAGAIN if that is not the case).
981
982The runtime PM of PCI devices is disabled by default. It is also blocked by
983pci_pm_init() that runs the pm_runtime_forbid() helper function. If a PCI
984driver implements the runtime PM callbacks and intends to use the runtime PM
985framework provided by the PM core and the PCI subsystem, it should enable this
986feature by executing the pm_runtime_enable() helper function. However, the
987driver should not call the pm_runtime_allow() helper function unblocking
988the runtime PM of the device. Instead, it should allow user space or some
989platform-specific code to do that (user space can do it via sysfs), although
990once it has called pm_runtime_enable(), it must be prepared to handle the
991runtime PM of the device correctly as soon as pm_runtime_allow() is called
992(which may happen at any time). [It also is possible that user space causes
993pm_runtime_allow() to be called via sysfs before the driver is loaded, so in
994fact the driver has to be prepared to handle the runtime PM of the device as
995soon as it calls pm_runtime_enable().]
996
997The runtime PM framework works by processing requests to suspend or resume
998devices, or to check if they are idle (in which cases it is reasonable to
999subsequently request that they be suspended). These requests are represented
1000by work items put into the power management workqueue, pm_wq. Although there
1001are a few situations in which power management requests are automatically
1002queued by the PM core (for example, after processing a request to resume a
1003device the PM core automatically queues a request to check if the device is
1004idle), device drivers are generally responsible for queuing power management
1005requests for their devices. For this purpose they should use the runtime PM
1006helper functions provided by the PM core, discussed in
1007Documentation/power/runtime_pm.txt.
1008
1009Devices can also be suspended and resumed synchronously, without placing a
1010request into pm_wq. In the majority of cases this also is done by their
1011drivers that use helper functions provided by the PM core for this purpose.
1012
1013For more information on the runtime PM of devices refer to
1014Documentation/power/runtime_pm.txt.
1015
1016
10174. Resources
1018============
1019
1020PCI Local Bus Specification, Rev. 3.0
1021PCI Bus Power Management Interface Specification, Rev. 1.2
1022Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b
1023PCI Express Base Specification, Rev. 2.0
1024Documentation/power/devices.txt
1025Documentation/power/runtime_pm.txt
diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
index c40866e8b957..bfed898a03fc 100644
--- a/Documentation/power/pm_qos_interface.txt
+++ b/Documentation/power/pm_qos_interface.txt
@@ -18,44 +18,46 @@ and pm_qos_params.h. This is done because having the available parameters
18being runtime configurable or changeable from a driver was seen as too easy to 18being runtime configurable or changeable from a driver was seen as too easy to
19abuse. 19abuse.
20 20
21For each parameter a list of performance requirements is maintained along with 21For each parameter a list of performance requests is maintained along with
22an aggregated target value. The aggregated target value is updated with 22an aggregated target value. The aggregated target value is updated with
23changes to the requirement list or elements of the list. Typically the 23changes to the request list or elements of the list. Typically the
24aggregated target value is simply the max or min of the requirement values held 24aggregated target value is simply the max or min of the request values held
25in the parameter list elements. 25in the parameter list elements.
26 26
27From kernel mode the use of this interface is simple: 27From kernel mode the use of this interface is simple:
28pm_qos_add_requirement(param_id, name, target_value):
29Will insert a named element in the list for that identified PM_QOS parameter
30with the target value. Upon change to this list the new target is recomputed
31and any registered notifiers are called only if the target value is now
32different.
33 28
34pm_qos_update_requirement(param_id, name, new_target_value): 29handle = pm_qos_add_request(param_class, target_value):
35Will search the list identified by the param_id for the named list element and 30Will insert an element into the list for that identified PM_QOS class with the
36then update its target value, calling the notification tree if the aggregated 31target value. Upon change to this list the new target is recomputed and any
37target is changed. with that name is already registered. 32registered notifiers are called only if the target value is now different.
33Clients of pm_qos need to save the returned handle.
38 34
39pm_qos_remove_requirement(param_id, name): 35void pm_qos_update_request(handle, new_target_value):
40Will search the identified list for the named element and remove it, after 36Will update the list element pointed to by the handle with the new target value
41removal it will update the aggregate target and call the notification tree if 37and recompute the new aggregated target, calling the notification tree if the
42the target was changed as a result of removing the named requirement. 38target is changed.
39
40void pm_qos_remove_request(handle):
41Will remove the element. After removal it will update the aggregate target and
42call the notification tree if the target was changed as a result of removing
43the request.
43 44
44 45
45From user mode: 46From user mode:
46Only processes can register a pm_qos requirement. To provide for automatic 47Only processes can register a pm_qos request. To provide for automatic
47cleanup for process the interface requires the process to register its 48cleanup of a process, the interface requires the process to register its
48parameter requirements in the following way: 49parameter requests in the following way:
49 50
50To register the default pm_qos target for the specific parameter, the process 51To register the default pm_qos target for the specific parameter, the process
51must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] 52must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
52 53
53As long as the device node is held open that process has a registered 54As long as the device node is held open that process has a registered
54requirement on the parameter. The name of the requirement is "process_<PID>" 55request on the parameter.
55derived from the current->pid from within the open system call.
56 56
57To change the requested target value the process needs to write a s32 value to 57To change the requested target value the process needs to write an s32 value to
58the open device node. This translates to a pm_qos_update_requirement call. 58the open device node. Alternatively the user mode program could write a hex
59string for the value using 10 char long format e.g. "0x12345678". This
60translates to a pm_qos_update_request call.
59 61
60To remove the user mode request for a target value simply close the device 62To remove the user mode request for a target value simply close the device
61node. 63node.
diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt
index cdebb5145c25..55c4175d8099 100644
--- a/Documentation/power/regulator/consumer.txt
+++ b/Documentation/power/regulator/consumer.txt
@@ -8,11 +8,11 @@ Please see overview.txt for a description of the terms used in this text.
81. Consumer Regulator Access (static & dynamic drivers) 81. Consumer Regulator Access (static & dynamic drivers)
9======================================================= 9=======================================================
10 10
11A consumer driver can get access to it's supply regulator by calling :- 11A consumer driver can get access to its supply regulator by calling :-
12 12
13regulator = regulator_get(dev, "Vcc"); 13regulator = regulator_get(dev, "Vcc");
14 14
15The consumer passes in it's struct device pointer and power supply ID. The core 15The consumer passes in its struct device pointer and power supply ID. The core
16then finds the correct regulator by consulting a machine specific lookup table. 16then finds the correct regulator by consulting a machine specific lookup table.
17If the lookup is successful then this call will return a pointer to the struct 17If the lookup is successful then this call will return a pointer to the struct
18regulator that supplies this consumer. 18regulator that supplies this consumer.
@@ -34,7 +34,7 @@ usually be called in your device drivers probe() and remove() respectively.
342. Regulator Output Enable & Disable (static & dynamic drivers) 342. Regulator Output Enable & Disable (static & dynamic drivers)
35==================================================================== 35====================================================================
36 36
37A consumer can enable it's power supply by calling:- 37A consumer can enable its power supply by calling:-
38 38
39int regulator_enable(regulator); 39int regulator_enable(regulator);
40 40
@@ -49,7 +49,7 @@ int regulator_is_enabled(regulator);
49This will return > zero when the regulator is enabled. 49This will return > zero when the regulator is enabled.
50 50
51 51
52A consumer can disable it's supply when no longer needed by calling :- 52A consumer can disable its supply when no longer needed by calling :-
53 53
54int regulator_disable(regulator); 54int regulator_disable(regulator);
55 55
@@ -140,7 +140,7 @@ by calling :-
140int regulator_set_optimum_mode(struct regulator *regulator, int load_uA); 140int regulator_set_optimum_mode(struct regulator *regulator, int load_uA);
141 141
142This will cause the core to recalculate the total load on the regulator (based 142This will cause the core to recalculate the total load on the regulator (based
143on all it's consumers) and change operating mode (if necessary and permitted) 143on all its consumers) and change operating mode (if necessary and permitted)
144to best match the current operating load. 144to best match the current operating load.
145 145
146The load_uA value can be determined from the consumers datasheet. e.g.most 146The load_uA value can be determined from the consumers datasheet. e.g.most
diff --git a/Documentation/power/regulator/machine.txt b/Documentation/power/regulator/machine.txt
index 63728fed620b..bdec39b9bd75 100644
--- a/Documentation/power/regulator/machine.txt
+++ b/Documentation/power/regulator/machine.txt
@@ -52,7 +52,7 @@ static struct regulator_init_data regulator1_data = {
52}; 52};
53 53
54Regulator-1 supplies power to Regulator-2. This relationship must be registered 54Regulator-1 supplies power to Regulator-2. This relationship must be registered
55with the core so that Regulator-1 is also enabled when Consumer A enables it's 55with the core so that Regulator-1 is also enabled when Consumer A enables its
56supply (Regulator-2). The supply regulator is set by the supply_regulator_dev 56supply (Regulator-2). The supply regulator is set by the supply_regulator_dev
57field below:- 57field below:-
58 58
diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt
index ffd185bb6054..9363e056188a 100644
--- a/Documentation/power/regulator/overview.txt
+++ b/Documentation/power/regulator/overview.txt
@@ -35,16 +35,16 @@ Some terms used in this document:-
35 o Consumer - Electronic device that is supplied power by a regulator. 35 o Consumer - Electronic device that is supplied power by a regulator.
36 Consumers can be classified into two types:- 36 Consumers can be classified into two types:-
37 37
38 Static: consumer does not change it's supply voltage or 38 Static: consumer does not change its supply voltage or
39 current limit. It only needs to enable or disable it's 39 current limit. It only needs to enable or disable it's
40 power supply. It's supply voltage is set by the hardware, 40 power supply. Its supply voltage is set by the hardware,
41 bootloader, firmware or kernel board initialisation code. 41 bootloader, firmware or kernel board initialisation code.
42 42
43 Dynamic: consumer needs to change it's supply voltage or 43 Dynamic: consumer needs to change it's supply voltage or
44 current limit to meet operation demands. 44 current limit to meet operation demands.
45 45
46 46
47 o Power Domain - Electronic circuit that is supplied it's input power by the 47 o Power Domain - Electronic circuit that is supplied its input power by the
48 output power of a regulator, switch or by another power 48 output power of a regulator, switch or by another power
49 domain. 49 domain.
50 50
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
index b967cd9137d6..81680f9f5909 100644
--- a/Documentation/power/userland-swsusp.txt
+++ b/Documentation/power/userland-swsusp.txt
@@ -24,6 +24,10 @@ assumed to be in the resume mode. The device cannot be open for simultaneous
24reading and writing. It is also impossible to have the device open more than 24reading and writing. It is also impossible to have the device open more than
25once at a time. 25once at a time.
26 26
27Even opening the device has side effects. Data structures are
28allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are
29called.
30
27The ioctl() commands recognized by the device are: 31The ioctl() commands recognized by the device are:
28 32
29SNAPSHOT_FREEZE - freeze user space processes (the current process is 33SNAPSHOT_FREEZE - freeze user space processes (the current process is
diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt
index 79f533f38c61..46d22105aa07 100644
--- a/Documentation/powerpc/booting-without-of.txt
+++ b/Documentation/powerpc/booting-without-of.txt
@@ -1289,7 +1289,7 @@ link between a device node and its interrupt parent in
1289the interrupt tree. The value of interrupt-parent is the 1289the interrupt tree. The value of interrupt-parent is the
1290phandle of the parent node. 1290phandle of the parent node.
1291 1291
1292If the interrupt-parent property is not defined for a node, it's 1292If the interrupt-parent property is not defined for a node, its
1293interrupt parent is assumed to be an ancestor in the node's 1293interrupt parent is assumed to be an ancestor in the node's
1294_device tree_ hierarchy. 1294_device tree_ hierarchy.
1295 1295
diff --git a/Documentation/powerpc/dts-bindings/4xx/reboot.txt b/Documentation/powerpc/dts-bindings/4xx/reboot.txt
new file mode 100644
index 000000000000..d7217260589c
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/4xx/reboot.txt
@@ -0,0 +1,18 @@
1Reboot property to control system reboot on PPC4xx systems:
2
3By setting "reset_type" to one of the following values, the default
4software reset mechanism may be overidden. Here the possible values of
5"reset_type":
6
7 1 - PPC4xx core reset
8 2 - PPC4xx chip reset
9 3 - PPC4xx system reset (default)
10
11Example:
12
13 cpu@0 {
14 device_type = "cpu";
15 model = "PowerPC,440SPe";
16 ...
17 reset-type = <2>; /* Use chip-reset */
18 };
diff --git a/Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt b/Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt
index d015dcec4011..b0019eb5330e 100644
--- a/Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt
@@ -11,7 +11,7 @@ Required properties:
11 83xx, "fsl,mpc8572-gpio" for 85xx and "fsl,mpc8610-gpio" for 86xx. 11 83xx, "fsl,mpc8572-gpio" for 85xx and "fsl,mpc8610-gpio" for 86xx.
12- #gpio-cells : Should be two. The first cell is the pin number and the 12- #gpio-cells : Should be two. The first cell is the pin number and the
13 second cell is used to specify optional parameters (currently unused). 13 second cell is used to specify optional parameters (currently unused).
14 - interrupts : Interrupt mapping for GPIO IRQ (currently unused). 14 - interrupts : Interrupt mapping for GPIO IRQ.
15 - interrupt-parent : Phandle for the interrupt controller that 15 - interrupt-parent : Phandle for the interrupt controller that
16 services interrupts for this device. 16 services interrupts for this device.
17- gpio-controller : Marks the port as GPIO controller. 17- gpio-controller : Marks the port as GPIO controller.
@@ -38,3 +38,23 @@ Example of gpio-controller nodes for a MPC8347 SoC:
38 38
39See booting-without-of.txt for details of how to specify GPIO 39See booting-without-of.txt for details of how to specify GPIO
40information for devices. 40information for devices.
41
42To use GPIO pins as interrupt sources for peripherals, specify the
43GPIO controller as the interrupt parent and define GPIO number +
44trigger mode using the interrupts property, which is defined like
45this:
46
47interrupts = <number trigger>, where:
48 - number: GPIO pin (0..31)
49 - trigger: trigger mode:
50 2 = trigger on falling edge
51 3 = trigger on both edges
52
53Example of device using this is:
54
55 funkyfpga@0 {
56 compatible = "funky-fpga";
57 ...
58 interrupts = <4 3>;
59 interrupt-parent = <&gpio1>;
60 };
diff --git a/Documentation/powerpc/dts-bindings/xilinx.txt b/Documentation/powerpc/dts-bindings/xilinx.txt
index ea68046bb9cb..299d0923537b 100644
--- a/Documentation/powerpc/dts-bindings/xilinx.txt
+++ b/Documentation/powerpc/dts-bindings/xilinx.txt
@@ -11,7 +11,7 @@
11 control how the core is synthesized. Historically, the EDK tool would 11 control how the core is synthesized. Historically, the EDK tool would
12 extract the device parameters relevant to device drivers and copy them 12 extract the device parameters relevant to device drivers and copy them
13 into an 'xparameters.h' in the form of #define symbols. This tells the 13 into an 'xparameters.h' in the form of #define symbols. This tells the
14 device drivers how the IP cores are configured, but it requres the kernel 14 device drivers how the IP cores are configured, but it requires the kernel
15 to be recompiled every time the FPGA bitstream is resynthesized. 15 to be recompiled every time the FPGA bitstream is resynthesized.
16 16
17 The new approach is to export the parameters into the device tree and 17 The new approach is to export the parameters into the device tree and
diff --git a/Documentation/powerpc/phyp-assisted-dump.txt b/Documentation/powerpc/phyp-assisted-dump.txt
index c4682b982a2e..ad340205d96a 100644
--- a/Documentation/powerpc/phyp-assisted-dump.txt
+++ b/Documentation/powerpc/phyp-assisted-dump.txt
@@ -19,7 +19,7 @@ dump offers several strong, practical advantages:
19 immediately available to the system for normal use. 19 immediately available to the system for normal use.
20-- After the dump is completed, no further reboots are 20-- After the dump is completed, no further reboots are
21 required; the system will be fully usable, and running 21 required; the system will be fully usable, and running
22 in it's normal, production mode on it normal kernel. 22 in its normal, production mode on its normal kernel.
23 23
24The above can only be accomplished by coordination with, 24The above can only be accomplished by coordination with,
25and assistance from the hypervisor. The procedure is 25and assistance from the hypervisor. The procedure is
diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt
index aae8355d3166..221f38be98f4 100644
--- a/Documentation/rbtree.txt
+++ b/Documentation/rbtree.txt
@@ -190,3 +190,61 @@ Example:
190 for (node = rb_first(&mytree); node; node = rb_next(node)) 190 for (node = rb_first(&mytree); node; node = rb_next(node))
191 printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); 191 printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring);
192 192
193Support for Augmented rbtrees
194-----------------------------
195
196Augmented rbtree is an rbtree with "some" additional data stored in each node.
197This data can be used to augment some new functionality to rbtree.
198Augmented rbtree is an optional feature built on top of basic rbtree
199infrastructure. rbtree user who wants this feature will have an augment
200callback function in rb_root initialized.
201
202This callback function will be called from rbtree core routines whenever
203a node has a change in one or both of its children. It is the responsibility
204of the callback function to recalculate the additional data that is in the
205rb node using new children information. Note that if this new additional
206data affects the parent node's additional data, then callback function has
207to handle it and do the recursive updates.
208
209
210Interval tree is an example of augmented rb tree. Reference -
211"Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein.
212More details about interval trees:
213
214Classical rbtree has a single key and it cannot be directly used to store
215interval ranges like [lo:hi] and do a quick lookup for any overlap with a new
216lo:hi or to find whether there is an exact match for a new lo:hi.
217
218However, rbtree can be augmented to store such interval ranges in a structured
219way making it possible to do efficient lookup and exact match.
220
221This "extra information" stored in each node is the maximum hi
222(max_hi) value among all the nodes that are its descendents. This
223information can be maintained at each node just be looking at the node
224and its immediate children. And this will be used in O(log n) lookup
225for lowest match (lowest start address among all possible matches)
226with something like:
227
228find_lowest_match(lo, hi, node)
229{
230 lowest_match = NULL;
231 while (node) {
232 if (max_hi(node->left) > lo) {
233 // Lowest overlap if any must be on left side
234 node = node->left;
235 } else if (overlap(lo, hi, node)) {
236 lowest_match = node;
237 break;
238 } else if (lo > node->lo) {
239 // Lowest overlap if any must be on right side
240 node = node->right;
241 } else {
242 break;
243 }
244 }
245 return lowest_match;
246}
247
248Finding exact match will be to first find lowest match and then to follow
249successor nodes looking for exact match, until the start of a node is beyond
250the hi value we are looking for.
diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt
index b4860509c319..83668e5dd17f 100644
--- a/Documentation/rfkill.txt
+++ b/Documentation/rfkill.txt
@@ -99,37 +99,15 @@ system. Also, it is possible to switch all rfkill drivers (or all drivers of
99a specified type) into a state which also updates the default state for 99a specified type) into a state which also updates the default state for
100hotplugged devices. 100hotplugged devices.
101 101
102After an application opens /dev/rfkill, it can read the current state of 102After an application opens /dev/rfkill, it can read the current state of all
103all devices, and afterwards can poll the descriptor for hotplug or state 103devices. Changes can be either obtained by either polling the descriptor for
104change events. 104hotplug or state change events or by listening for uevents emitted by the
105 105rfkill core framework.
106Applications must ignore operations (the "op" field) they do not handle, 106
107this allows the API to be extended in the future. 107Additionally, each rfkill device is registered in sysfs and emits uevents.
108 108
109Additionally, each rfkill device is registered in sysfs and there has the 109rfkill devices issue uevents (with an action of "change"), with the following
110following attributes: 110environment variables set:
111
112 name: Name assigned by driver to this key (interface or driver name).
113 type: Driver type string ("wlan", "bluetooth", etc).
114 persistent: Whether the soft blocked state is initialised from
115 non-volatile storage at startup.
116 state: Current state of the transmitter
117 0: RFKILL_STATE_SOFT_BLOCKED
118 transmitter is turned off by software
119 1: RFKILL_STATE_UNBLOCKED
120 transmitter is (potentially) active
121 2: RFKILL_STATE_HARD_BLOCKED
122 transmitter is forced off by something outside of
123 the driver's control.
124 This file is deprecated because it can only properly show
125 three of the four possible states, soft-and-hard-blocked is
126 missing.
127 claim: 0: Kernel handles events
128 This file is deprecated because there no longer is a way to
129 claim just control over a single rfkill instance.
130
131rfkill devices also issue uevents (with an action of "change"), with the
132following environment variables set:
133 111
134RFKILL_NAME 112RFKILL_NAME
135RFKILL_STATE 113RFKILL_STATE
@@ -137,3 +115,7 @@ RFKILL_TYPE
137 115
138The contents of these variables corresponds to the "name", "state" and 116The contents of these variables corresponds to the "name", "state" and
139"type" sysfs files explained above. 117"type" sysfs files explained above.
118
119
120For further details consult Documentation/ABI/stable/dev-rfkill and
121Documentation/ABI/stable/sysfs-class-rfkill.
diff --git a/Documentation/rt-mutex-design.txt b/Documentation/rt-mutex-design.txt
index 4b736d24da7a..8df0b782c4d7 100644
--- a/Documentation/rt-mutex-design.txt
+++ b/Documentation/rt-mutex-design.txt
@@ -657,7 +657,7 @@ here.
657 657
658The waiter structure has a "task" field that points to the task that is blocked 658The waiter structure has a "task" field that points to the task that is blocked
659on the mutex. This field can be NULL the first time it goes through the loop 659on the mutex. This field can be NULL the first time it goes through the loop
660or if the task is a pending owner and had it's mutex stolen. If the "task" 660or if the task is a pending owner and had its mutex stolen. If the "task"
661field is NULL then we need to set up the accounting for it. 661field is NULL then we need to set up the accounting for it.
662 662
663Task blocks on mutex 663Task blocks on mutex
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index 6f33593e59e2..8239ebbcddce 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -211,7 +211,7 @@ provide fair CPU time to each such task group. For example, it may be
211desirable to first provide fair CPU time to each user on the system and then to 211desirable to first provide fair CPU time to each user on the system and then to
212each task belonging to a user. 212each task belonging to a user.
213 213
214CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks to be 214CONFIG_CGROUP_SCHED strives to achieve exactly that. It lets tasks to be
215grouped and divides CPU time fairly among such groups. 215grouped and divides CPU time fairly among such groups.
216 216
217CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and 217CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and
@@ -220,38 +220,11 @@ SCHED_RR) tasks.
220CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and 220CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and
221SCHED_BATCH) tasks. 221SCHED_BATCH) tasks.
222 222
223At present, there are two (mutually exclusive) mechanisms to group tasks for 223 These options need CONFIG_CGROUPS to be defined, and let the administrator
224CPU bandwidth control purposes:
225
226 - Based on user id (CONFIG_USER_SCHED)
227
228 With this option, tasks are grouped according to their user id.
229
230 - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED)
231
232 This options needs CONFIG_CGROUPS to be defined, and lets the administrator
233 create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See 224 create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See
234 Documentation/cgroups/cgroups.txt for more information about this filesystem. 225 Documentation/cgroups/cgroups.txt for more information about this filesystem.
235 226
236Only one of these options to group tasks can be chosen and not both. 227When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
237
238When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new
239user and a "cpu_share" file is added in that directory.
240
241 # cd /sys/kernel/uids
242 # cat 512/cpu_share # Display user 512's CPU share
243 1024
244 # echo 2048 > 512/cpu_share # Modify user 512's CPU share
245 # cat 512/cpu_share # Display user 512's CPU share
246 2048
247 #
248
249CPU bandwidth between two users is divided in the ratio of their CPU shares.
250For example: if you would like user "root" to get twice the bandwidth of user
251"guest," then set the cpu_share for both the users such that "root"'s cpu_share
252is twice "guest"'s cpu_share.
253
254When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each
255group created using the pseudo filesystem. See example steps below to create 228group created using the pseudo filesystem. See example steps below to create
256task groups and modify their CPU share using the "cgroups" pseudo filesystem. 229task groups and modify their CPU share using the "cgroups" pseudo filesystem.
257 230
@@ -273,24 +246,3 @@ task groups and modify their CPU share using the "cgroups" pseudo filesystem.
273 246
274 # #Launch gmplayer (or your favourite movie player) 247 # #Launch gmplayer (or your favourite movie player)
275 # echo <movie_player_pid> > multimedia/tasks 248 # echo <movie_player_pid> > multimedia/tasks
276
2778. Implementation note: user namespaces
278
279User namespaces are intended to be hierarchical. But they are currently
280only partially implemented. Each of those has ramifications for CFS.
281
282First, since user namespaces are hierarchical, the /sys/kernel/uids
283presentation is inadequate. Eventually we will likely want to use sysfs
284tagging to provide private views of /sys/kernel/uids within each user
285namespace.
286
287Second, the hierarchical nature is intended to support completely
288unprivileged use of user namespaces. So if using user groups, then
289we want the users in a user namespace to be children of the user
290who created it.
291
292That is currently unimplemented. So instead, every user in a new
293user namespace will receive 1024 shares just like any user in the
294initial user namespace. Note that at the moment creation of a new
295user namespace requires each of CAP_SYS_ADMIN, CAP_SETUID, and
296CAP_SETGID.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 86eabe6c3419..605b0d40329d 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -126,23 +126,12 @@ priority!
1262.3 Basis for grouping tasks 1262.3 Basis for grouping tasks
127---------------------------- 127----------------------------
128 128
129There are two compile-time settings for allocating CPU bandwidth. These are 129Enabling CONFIG_RT_GROUP_SCHED lets you explicitly allocate real
130configured using the "Basis for grouping tasks" multiple choice menu under 130CPU bandwidth to task groups.
131General setup > Group CPU Scheduler:
132
133a. CONFIG_USER_SCHED (aka "Basis for grouping tasks" = "user id")
134
135This lets you use the virtual files under
136"/sys/kernel/uids/<uid>/cpu_rt_runtime_us" to control he CPU time reserved for
137each user .
138
139The other option is:
140
141.o CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups")
142 131
143This uses the /cgroup virtual file system and 132This uses the /cgroup virtual file system and
144"/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each 133"/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each
145control group instead. 134control group.
146 135
147For more information on working with control groups, you should read 136For more information on working with control groups, you should read
148Documentation/cgroups/cgroups.txt as well. 137Documentation/cgroups/cgroups.txt as well.
@@ -161,8 +150,7 @@ For now, this can be simplified to just the following (but see Future plans):
161=============== 150===============
162 151
163There is work in progress to make the scheduling period for each group 152There is work in progress to make the scheduling period for each group
164("/sys/kernel/uids/<uid>/cpu_rt_period_us" or 153("/cgroup/<cgroup>/cpu.rt_period_us") configurable as well.
165"/cgroup/<cgroup>/cpu.rt_period_us" respectively) configurable as well.
166 154
167The constraint on the period is that a subgroup must have a smaller or 155The constraint on the period is that a subgroup must have a smaller or
168equal period to its parent. But realistically its not very useful _yet_ 156equal period to its parent. But realistically its not very useful _yet_
diff --git a/Documentation/scsi/ChangeLog.lpfc b/Documentation/scsi/ChangeLog.lpfc
index 2ffc1148eb95..e759e92e286d 100644
--- a/Documentation/scsi/ChangeLog.lpfc
+++ b/Documentation/scsi/ChangeLog.lpfc
@@ -707,7 +707,7 @@ Changes from 20040920 to 20041018
707 * Integrate patches from Christoph Hellwig: two new helpers common 707 * Integrate patches from Christoph Hellwig: two new helpers common
708 to lpfc_sli_resume_iocb and lpfc_sli_issue_iocb - singificant 708 to lpfc_sli_resume_iocb and lpfc_sli_issue_iocb - singificant
709 cleanup of those two functions - the unused SLI_IOCB_USE_TXQ is 709 cleanup of those two functions - the unused SLI_IOCB_USE_TXQ is
710 gone - lpfc_sli_issue_iocb_wait loses it's flags argument 710 gone - lpfc_sli_issue_iocb_wait loses its flags argument
711 totally. 711 totally.
712 * Fix in lpfc_sli.c: we can not store a 5 bit value in a 4-bit 712 * Fix in lpfc_sli.c: we can not store a 5 bit value in a 4-bit
713 field. 713 field.
@@ -1028,7 +1028,7 @@ Changes from 20040614 to 20040709
1028 * Remove the need for buf_tmo. 1028 * Remove the need for buf_tmo.
1029 * Changed ULP_BDE64 to struct ulp_bde64. 1029 * Changed ULP_BDE64 to struct ulp_bde64.
1030 * Changed ULP_BDE to struct ulp_bde. 1030 * Changed ULP_BDE to struct ulp_bde.
1031 * Cleanup lpfc_os_return_scsi_cmd() and it's call path. 1031 * Cleanup lpfc_os_return_scsi_cmd() and its call path.
1032 * Removed lpfc_no_device_delay. 1032 * Removed lpfc_no_device_delay.
1033 * Consolidating lpfc_hba_put_event() into lpfc_put_event(). 1033 * Consolidating lpfc_hba_put_event() into lpfc_put_event().
1034 * Removed following attributes and their functionality: 1034 * Removed following attributes and their functionality:
diff --git a/Documentation/scsi/FlashPoint.txt b/Documentation/scsi/FlashPoint.txt
index d5acaa300a46..1540a92f6d2b 100644
--- a/Documentation/scsi/FlashPoint.txt
+++ b/Documentation/scsi/FlashPoint.txt
@@ -71,7 +71,7 @@ peters@mylex.com
71 71
72Ever since its introduction last October, the BusLogic FlashPoint LT has 72Ever since its introduction last October, the BusLogic FlashPoint LT has
73been problematic for members of the Linux community, in that no Linux 73been problematic for members of the Linux community, in that no Linux
74drivers have been available for this new Ultra SCSI product. Despite it's 74drivers have been available for this new Ultra SCSI product. Despite its
75officially being positioned as a desktop workstation product, and not being 75officially being positioned as a desktop workstation product, and not being
76particularly well suited for a high performance multitasking operating 76particularly well suited for a high performance multitasking operating
77system like Linux, the FlashPoint LT has been touted by computer system 77system like Linux, the FlashPoint LT has been touted by computer system
diff --git a/Documentation/scsi/dtc3x80.txt b/Documentation/scsi/dtc3x80.txt
index e8ae6230ab3e..1d7af9f9a8ed 100644
--- a/Documentation/scsi/dtc3x80.txt
+++ b/Documentation/scsi/dtc3x80.txt
@@ -12,7 +12,7 @@ The 3180 does not. Otherwise, they are identical.
12The DTC3x80 does not support DMA but it does have Pseudo-DMA which is 12The DTC3x80 does not support DMA but it does have Pseudo-DMA which is
13supported by the driver. 13supported by the driver.
14 14
15It's DTC406 scsi chip is supposedly compatible with the NCR 53C400. 15Its DTC406 scsi chip is supposedly compatible with the NCR 53C400.
16It is memory mapped, uses an IRQ, but no dma or io-port. There is 16It is memory mapped, uses an IRQ, but no dma or io-port. There is
17internal DMA, between SCSI bus and an on-chip 128-byte buffer. Double 17internal DMA, between SCSI bus and an on-chip 128-byte buffer. Double
18buffering is done automagically by the chip. Data is transferred 18buffering is done automagically by the chip. Data is transferred
diff --git a/Documentation/scsi/ncr53c8xx.txt b/Documentation/scsi/ncr53c8xx.txt
index 08e2b4d04aab..cda5f8fa2c66 100644
--- a/Documentation/scsi/ncr53c8xx.txt
+++ b/Documentation/scsi/ncr53c8xx.txt
@@ -1479,7 +1479,7 @@ Wide16 SCSI.
1479Enabling serial NVRAM support enables detection of the serial NVRAM included 1479Enabling serial NVRAM support enables detection of the serial NVRAM included
1480on Symbios and some Symbios compatible host adaptors, and Tekram boards. The 1480on Symbios and some Symbios compatible host adaptors, and Tekram boards. The
1481serial NVRAM is used by Symbios and Tekram to hold set up parameters for the 1481serial NVRAM is used by Symbios and Tekram to hold set up parameters for the
1482host adaptor and it's attached drives. 1482host adaptor and its attached drives.
1483 1483
1484The Symbios NVRAM also holds data on the boot order of host adaptors in a 1484The Symbios NVRAM also holds data on the boot order of host adaptors in a
1485system with more than one host adaptor. This enables the order of scanning 1485system with more than one host adaptor. This enables the order of scanning
diff --git a/Documentation/scsi/osst.txt b/Documentation/scsi/osst.txt
index f536907e241d..2b21890bc983 100644
--- a/Documentation/scsi/osst.txt
+++ b/Documentation/scsi/osst.txt
@@ -40,7 +40,7 @@ behavior looks very much the same as st to the userspace applications.
40 40
41History 41History
42------- 42-------
43In the first place, osst shared it's identity very much with st. That meant 43In the first place, osst shared its identity very much with st. That meant
44that it used the same kernel structures and the same device node as st. 44that it used the same kernel structures and the same device node as st.
45So you could only have either of them being present in the kernel. This has 45So you could only have either of them being present in the kernel. This has
46been fixed by registering an own device, now. 46been fixed by registering an own device, now.
diff --git a/Documentation/scsi/scsi_fc_transport.txt b/Documentation/scsi/scsi_fc_transport.txt
index aec6549ab097..e00192de4d1c 100644
--- a/Documentation/scsi/scsi_fc_transport.txt
+++ b/Documentation/scsi/scsi_fc_transport.txt
@@ -70,7 +70,7 @@ Overview:
70 up to an administrative entity controlling the vport. For example, 70 up to an administrative entity controlling the vport. For example,
71 if vports are to be associated with virtual machines, a XEN mgmt 71 if vports are to be associated with virtual machines, a XEN mgmt
72 utility would be responsible for creating wwpn/wwnn's for the vport, 72 utility would be responsible for creating wwpn/wwnn's for the vport,
73 using it's own naming authority and OUI. (Note: it already does this 73 using its own naming authority and OUI. (Note: it already does this
74 for virtual MAC addresses). 74 for virtual MAC addresses).
75 75
76 76
@@ -81,7 +81,7 @@ Device Trees and Vport Objects:
81 with rports and scsi target objects underneath it. Currently the FC 81 with rports and scsi target objects underneath it. Currently the FC
82 transport creates the vport object and places it under the scsi_host 82 transport creates the vport object and places it under the scsi_host
83 object corresponding to the physical adapter. The LLDD will allocate 83 object corresponding to the physical adapter. The LLDD will allocate
84 a new scsi_host for the vport and link it's object under the vport. 84 a new scsi_host for the vport and link its object under the vport.
85 The remainder of the tree under the vports scsi_host is the same 85 The remainder of the tree under the vports scsi_host is the same
86 as the non-NPIV case. The transport is written currently to easily 86 as the non-NPIV case. The transport is written currently to easily
87 allow the parent of the vport to be something other than the scsi_host. 87 allow the parent of the vport to be something other than the scsi_host.
diff --git a/Documentation/scsi/sym53c8xx_2.txt b/Documentation/scsi/sym53c8xx_2.txt
index eb9a7b905b64..6f63b7989679 100644
--- a/Documentation/scsi/sym53c8xx_2.txt
+++ b/Documentation/scsi/sym53c8xx_2.txt
@@ -687,7 +687,7 @@ maintain the driver code.
687Enabling serial NVRAM support enables detection of the serial NVRAM included 687Enabling serial NVRAM support enables detection of the serial NVRAM included
688on Symbios and some Symbios compatible host adaptors, and Tekram boards. The 688on Symbios and some Symbios compatible host adaptors, and Tekram boards. The
689serial NVRAM is used by Symbios and Tekram to hold set up parameters for the 689serial NVRAM is used by Symbios and Tekram to hold set up parameters for the
690host adaptor and it's attached drives. 690host adaptor and its attached drives.
691 691
692The Symbios NVRAM also holds data on the boot order of host adaptors in a 692The Symbios NVRAM also holds data on the boot order of host adaptors in a
693system with more than one host adaptor. This information is no longer used 693system with more than one host adaptor. This information is no longer used
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
index bfcbbf88c44d..2075bbb8b3e2 100644
--- a/Documentation/sound/alsa/ALSA-Configuration.txt
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -227,6 +227,16 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
227 227
228 The power-management is supported. 228 The power-management is supported.
229 229
230 Module snd-asihpi
231 -----------------
232
233 Module for AudioScience ASI soundcards
234
235 enable_hpi_hwdep - enable HPI hwdep for AudioScience soundcard
236
237 This module supports multiple cards.
238 The driver requires the firmware loader support on kernel.
239
230 Module snd-atiixp 240 Module snd-atiixp
231 ----------------- 241 -----------------
232 242
@@ -622,28 +632,23 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
622 632
623 The power-management is supported. 633 The power-management is supported.
624 634
625 Module snd-es968
626 ----------------
627
628 Module for sound cards based on ESS ES968 chip (PnP only).
629
630 This module supports multiple cards, PnP and autoprobe.
631
632 The power-management is supported.
633
634 Module snd-es1688 635 Module snd-es1688
635 ----------------- 636 -----------------
636 637
637 Module for ESS AudioDrive ES-1688 and ES-688 sound cards. 638 Module for ESS AudioDrive ES-1688 and ES-688 sound cards.
638 639
639 port - port # for ES-1688 chip (0x220,0x240,0x260) 640 isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
640 fm_port - port # for OPL3 (option; share the same port as default)
641 mpu_port - port # for MPU-401 port (0x300,0x310,0x320,0x330), -1 = disable (default) 641 mpu_port - port # for MPU-401 port (0x300,0x310,0x320,0x330), -1 = disable (default)
642 irq - IRQ # for ES-1688 chip (5,7,9,10)
643 mpu_irq - IRQ # for MPU-401 port (5,7,9,10) 642 mpu_irq - IRQ # for MPU-401 port (5,7,9,10)
643 fm_port - port # for OPL3 (option; share the same port as default)
644
645 with isapnp=0, the following additional options are available:
646 port - port # for ES-1688 chip (0x220,0x240,0x260)
647 irq - IRQ # for ES-1688 chip (5,7,9,10)
644 dma8 - DMA # for ES-1688 chip (0,1,3) 648 dma8 - DMA # for ES-1688 chip (0,1,3)
645 649
646 This module supports multiple cards and autoprobe (without MPU-401 port). 650 This module supports multiple cards and autoprobe (without MPU-401 port)
651 and PnP with the ES968 chip.
647 652
648 Module snd-es18xx 653 Module snd-es18xx
649 ----------------- 654 -----------------
diff --git a/Documentation/sound/alsa/HD-Audio.txt b/Documentation/sound/alsa/HD-Audio.txt
index 98d14cb8a85d..bdafdbd32561 100644
--- a/Documentation/sound/alsa/HD-Audio.txt
+++ b/Documentation/sound/alsa/HD-Audio.txt
@@ -204,7 +204,6 @@ generic parser regardless of the codec. Usually the codec-specific
204parser is much better than the generic parser (as now). Thus this 204parser is much better than the generic parser (as now). Thus this
205option is more about the debugging purpose. 205option is more about the debugging purpose.
206 206
207
208Speaker and Headphone Output 207Speaker and Headphone Output
209~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 208~~~~~~~~~~~~~~~~~~~~~~~~~~~~
210One of the most frequent (and obvious) bugs with HD-audio is the 209One of the most frequent (and obvious) bugs with HD-audio is the
@@ -600,6 +599,9 @@ probing, the proc file is available, so you can get the raw codec
600information before modified by the driver. Of course, the driver 599information before modified by the driver. Of course, the driver
601isn't usable with `probe_only=1`. But you can continue the 600isn't usable with `probe_only=1`. But you can continue the
602configuration via hwdep sysfs file if hda-reconfig option is enabled. 601configuration via hwdep sysfs file if hda-reconfig option is enabled.
602Using `probe_only` mask 2 skips the reset of HDA codecs (use
603`probe_only=3` as module option). The hwdep interface can be used
604to determine the BIOS codec initialization.
603 605
604 606
605hda-verb 607hda-verb
diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt
index 9ac842be9b4f..05bf5a0eee41 100644
--- a/Documentation/sound/alsa/soc/dapm.txt
+++ b/Documentation/sound/alsa/soc/dapm.txt
@@ -188,8 +188,8 @@ The WM8731 output mixer has 3 inputs (sources)
188 3. Mic Sidetone Input 188 3. Mic Sidetone Input
189 189
190Each input in this example has a kcontrol associated with it (defined in example 190Each input in this example has a kcontrol associated with it (defined in example
191above) and is connected to the output mixer via it's kcontrol name. We can now 191above) and is connected to the output mixer via its kcontrol name. We can now
192connect the destination widget (wrt audio signal) with it's source widgets. 192connect the destination widget (wrt audio signal) with its source widgets.
193 193
194 /* output mixer */ 194 /* output mixer */
195 {"Output Mixer", "Line Bypass Switch", "Line Input"}, 195 {"Output Mixer", "Line Bypass Switch", "Line Input"},
diff --git a/Documentation/sound/alsa/soc/machine.txt b/Documentation/sound/alsa/soc/machine.txt
index bab7711ce963..2524c75557df 100644
--- a/Documentation/sound/alsa/soc/machine.txt
+++ b/Documentation/sound/alsa/soc/machine.txt
@@ -67,7 +67,7 @@ static struct snd_soc_dai_link corgi_dai = {
67 .ops = &corgi_ops, 67 .ops = &corgi_ops,
68}; 68};
69 69
70struct snd_soc_card then sets up the machine with it's DAIs. e.g. 70struct snd_soc_card then sets up the machine with its DAIs. e.g.
71 71
72/* corgi audio machine driver */ 72/* corgi audio machine driver */
73static struct snd_soc_card snd_soc_corgi = { 73static struct snd_soc_card snd_soc_corgi = {
diff --git a/Documentation/sound/alsa/soc/overview.txt b/Documentation/sound/alsa/soc/overview.txt
index 1e4c6d3655f2..138ac88c1461 100644
--- a/Documentation/sound/alsa/soc/overview.txt
+++ b/Documentation/sound/alsa/soc/overview.txt
@@ -33,7 +33,7 @@ features :-
33 and machines. 33 and machines.
34 34
35 * Easy I2S/PCM audio interface setup between codec and SoC. Each SoC 35 * Easy I2S/PCM audio interface setup between codec and SoC. Each SoC
36 interface and codec registers it's audio interface capabilities with the 36 interface and codec registers its audio interface capabilities with the
37 core and are subsequently matched and configured when the application 37 core and are subsequently matched and configured when the application
38 hardware parameters are known. 38 hardware parameters are known.
39 39
diff --git a/Documentation/sparse.txt b/Documentation/sparse.txt
index 34c76a55bc04..9b659c79a547 100644
--- a/Documentation/sparse.txt
+++ b/Documentation/sparse.txt
@@ -54,12 +54,12 @@ Getting sparse
54~~~~~~~~~~~~~~ 54~~~~~~~~~~~~~~
55 55
56You can get latest released versions from the Sparse homepage at 56You can get latest released versions from the Sparse homepage at
57http://www.kernel.org/pub/linux/kernel/people/josh/sparse/ 57https://sparse.wiki.kernel.org/index.php/Main_Page
58 58
59Alternatively, you can get snapshots of the latest development version 59Alternatively, you can get snapshots of the latest development version
60of sparse using git to clone.. 60of sparse using git to clone..
61 61
62 git://git.kernel.org/pub/scm/linux/kernel/git/josh/sparse.git 62 git://git.kernel.org/pub/scm/devel/sparse/sparse.git
63 63
64DaveJ has hourly generated tarballs of the git tree available at.. 64DaveJ has hourly generated tarballs of the git tree available at..
65 65
diff --git a/Documentation/spi/ep93xx_spi b/Documentation/spi/ep93xx_spi
new file mode 100644
index 000000000000..6325f5b48635
--- /dev/null
+++ b/Documentation/spi/ep93xx_spi
@@ -0,0 +1,95 @@
1Cirrus EP93xx SPI controller driver HOWTO
2=========================================
3
4ep93xx_spi driver brings SPI master support for EP93xx SPI controller. Chip
5selects are implemented with GPIO lines.
6
7NOTE: If possible, don't use SFRMOUT (SFRM1) signal as a chip select. It will
8not work correctly (it cannot be controlled by software). Use GPIO lines
9instead.
10
11Sample configuration
12====================
13
14Typically driver configuration is done in platform board files (the files under
15arch/arm/mach-ep93xx/*.c). In this example we configure MMC over SPI through
16this driver on TS-7260 board. You can adapt the code to suit your needs.
17
18This example uses EGPIO9 as SD/MMC card chip select (this is wired in DIO1
19header on the board).
20
21You need to select CONFIG_MMC_SPI to use mmc_spi driver.
22
23arch/arm/mach-ep93xx/ts72xx.c:
24
25...
26#include <linux/gpio.h>
27#include <linux/spi/spi.h>
28
29#include <mach/ep93xx_spi.h>
30
31/* this is our GPIO line used for chip select */
32#define MMC_CHIP_SELECT_GPIO EP93XX_GPIO_LINE_EGPIO9
33
34static int ts72xx_mmc_spi_setup(struct spi_device *spi)
35{
36 int err;
37
38 err = gpio_request(MMC_CHIP_SELECT_GPIO, spi->modalias);
39 if (err)
40 return err;
41
42 gpio_direction_output(MMC_CHIP_SELECT_GPIO, 1);
43
44 return 0;
45}
46
47static void ts72xx_mmc_spi_cleanup(struct spi_device *spi)
48{
49 gpio_set_value(MMC_CHIP_SELECT_GPIO, 1);
50 gpio_direction_input(MMC_CHIP_SELECT_GPIO);
51 gpio_free(MMC_CHIP_SELECT_GPIO);
52}
53
54static void ts72xx_mmc_spi_cs_control(struct spi_device *spi, int value)
55{
56 gpio_set_value(MMC_CHIP_SELECT_GPIO, value);
57}
58
59static struct ep93xx_spi_chip_ops ts72xx_mmc_spi_ops = {
60 .setup = ts72xx_mmc_spi_setup,
61 .cleanup = ts72xx_mmc_spi_cleanup,
62 .cs_control = ts72xx_mmc_spi_cs_control,
63};
64
65static struct spi_board_info ts72xx_spi_devices[] __initdata = {
66 {
67 .modalias = "mmc_spi",
68 .controller_data = &ts72xx_mmc_spi_ops,
69 /*
70 * We use 10 MHz even though the maximum is 7.4 MHz. The driver
71 * will limit it automatically to max. frequency.
72 */
73 .max_speed_hz = 10 * 1000 * 1000,
74 .bus_num = 0,
75 .chip_select = 0,
76 .mode = SPI_MODE_0,
77 },
78};
79
80static struct ep93xx_spi_info ts72xx_spi_info = {
81 .num_chipselect = ARRAY_SIZE(ts72xx_spi_devices),
82};
83
84static void __init ts72xx_init_machine(void)
85{
86 ...
87 ep93xx_register_spi(&ts72xx_spi_info, ts72xx_spi_devices,
88 ARRAY_SIZE(ts72xx_spi_devices));
89}
90
91Thanks to
92=========
93Martin Guy, H. Hartley Sweeten and others who helped me during development of
94the driver. Simplemachines.it donated me a Sim.One board which I used testing
95the driver on EP9307.
diff --git a/Documentation/spi/spidev_fdx.c b/Documentation/spi/spidev_fdx.c
index fc354f760384..36ec0774ca0b 100644
--- a/Documentation/spi/spidev_fdx.c
+++ b/Documentation/spi/spidev_fdx.c
@@ -58,10 +58,10 @@ static void do_msg(int fd, int len)
58 len = sizeof buf; 58 len = sizeof buf;
59 59
60 buf[0] = 0xaa; 60 buf[0] = 0xaa;
61 xfer[0].tx_buf = (__u64) buf; 61 xfer[0].tx_buf = (unsigned long)buf;
62 xfer[0].len = 1; 62 xfer[0].len = 1;
63 63
64 xfer[1].rx_buf = (__u64) buf; 64 xfer[1].rx_buf = (unsigned long) buf;
65 xfer[1].len = len; 65 xfer[1].len = len;
66 66
67 status = ioctl(fd, SPI_IOC_MESSAGE(2), xfer); 67 status = ioctl(fd, SPI_IOC_MESSAGE(2), xfer);
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index df38ef046f8d..cbd05ffc606b 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -84,6 +84,16 @@ netdev_max_backlog
84Maximum number of packets, queued on the INPUT side, when the interface 84Maximum number of packets, queued on the INPUT side, when the interface
85receives packets faster than kernel can process them. 85receives packets faster than kernel can process them.
86 86
87netdev_tstamp_prequeue
88----------------------
89
90If set to 0, RX packet timestamps can be sampled after RPS processing, when
91the target CPU processes packets. It might give some delay on timestamps, but
92permit to distribute the load on several cpus.
93
94If set to 1 (default), timestamps are sampled as soon as possible, before
95queueing.
96
87optmem_max 97optmem_max
88---------- 98----------
89 99
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 6c7d18c53f84..5fdbb612aeb8 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -19,6 +19,7 @@ files can be found in mm/swap.c.
19Currently, these files are in /proc/sys/vm: 19Currently, these files are in /proc/sys/vm:
20 20
21- block_dump 21- block_dump
22- compact_memory
22- dirty_background_bytes 23- dirty_background_bytes
23- dirty_background_ratio 24- dirty_background_ratio
24- dirty_bytes 25- dirty_bytes
@@ -26,6 +27,7 @@ Currently, these files are in /proc/sys/vm:
26- dirty_ratio 27- dirty_ratio
27- dirty_writeback_centisecs 28- dirty_writeback_centisecs
28- drop_caches 29- drop_caches
30- extfrag_threshold
29- hugepages_treat_as_movable 31- hugepages_treat_as_movable
30- hugetlb_shm_group 32- hugetlb_shm_group
31- laptop_mode 33- laptop_mode
@@ -64,6 +66,15 @@ information on block I/O debugging is in Documentation/laptops/laptop-mode.txt.
64 66
65============================================================== 67==============================================================
66 68
69compact_memory
70
71Available only when CONFIG_COMPACTION is set. When 1 is written to the file,
72all zones are compacted such that free memory is available in contiguous
73blocks where possible. This can be important for example in the allocation of
74huge pages although processes will also directly compact memory as required.
75
76==============================================================
77
67dirty_background_bytes 78dirty_background_bytes
68 79
69Contains the amount of dirty memory at which the pdflush background writeback 80Contains the amount of dirty memory at which the pdflush background writeback
@@ -139,6 +150,20 @@ user should run `sync' first.
139 150
140============================================================== 151==============================================================
141 152
153extfrag_threshold
154
155This parameter affects whether the kernel will compact memory or direct
156reclaim to satisfy a high-order allocation. /proc/extfrag_index shows what
157the fragmentation index for each order is in each zone in the system. Values
158tending towards 0 imply allocations would fail due to lack of memory,
159values towards 1000 imply failures are due to fragmentation and -1 implies
160that the allocation will succeed as long as watermarks are met.
161
162The kernel will not compact memory in a zone if the
163fragmentation index is <= extfrag_threshold. The default value is 500.
164
165==============================================================
166
142hugepages_treat_as_movable 167hugepages_treat_as_movable
143 168
144This parameter is only useful when kernelcore= is specified at boot time to 169This parameter is only useful when kernelcore= is specified at boot time to
diff --git a/Documentation/sysfs-rules.txt b/Documentation/sysfs-rules.txt
index 5d8bc2cd250c..c1a1fd636bf9 100644
--- a/Documentation/sysfs-rules.txt
+++ b/Documentation/sysfs-rules.txt
@@ -125,7 +125,7 @@ versions of the sysfs interface.
125- Block 125- Block
126 The converted block subsystem at /sys/class/block or 126 The converted block subsystem at /sys/class/block or
127 /sys/subsystem/block will contain the links for disks and partitions 127 /sys/subsystem/block will contain the links for disks and partitions
128 at the same level, never in a hierarchy. Assuming the block subsytem to 128 at the same level, never in a hierarchy. Assuming the block subsystem to
129 contain only disks and not partition devices in the same flat list is 129 contain only disks and not partition devices in the same flat list is
130 a bug in the application. 130 a bug in the application.
131 131
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index d56a01775423..5c17196c8fe9 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -177,13 +177,13 @@ virtual console (ALT+Fn) and then back again should also help.
177 177
178* I hit SysRq, but nothing seems to happen, what's wrong? 178* I hit SysRq, but nothing seems to happen, what's wrong?
179~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 179~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
180There are some keyboards that send different scancodes for SysRq than the 180There are some keyboards that produce a different keycode for SysRq than the
181pre-defined 0x54. So if SysRq doesn't work out of the box for a certain 181pre-defined value of 99 (see KEY_SYSRQ in include/linux/input.h), or which
182keyboard, run 'showkey -s' to find out the proper scancode sequence. Then 182don't have a SysRq key at all. In these cases, run 'showkey -s' to find an
183use 'setkeycodes <sequence> 84' to define this sequence to the usual SysRq 183appropriate scancode sequence, and use 'setkeycodes <sequence> 99' to map
184code (84 is decimal for 0x54). It's probably best to put this command in a 184this sequence to the usual SysRq code (e.g., 'setkeycodes e05b 99'). It's
185boot script. Oh, and by the way, you exit 'showkey' by not typing anything 185probably best to put this command in a boot script. Oh, and by the way, you
186for ten seconds. 186exit 'showkey' by not typing anything for ten seconds.
187 187
188* I want to add SysRQ key events to a module, how does it work? 188* I want to add SysRQ key events to a module, how does it work?
189~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 189~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Documentation/timers/Makefile b/Documentation/timers/Makefile
index c85625f4ab25..73f75f8a87dc 100644
--- a/Documentation/timers/Makefile
+++ b/Documentation/timers/Makefile
@@ -2,7 +2,7 @@
2obj- := dummy.o 2obj- := dummy.o
3 3
4# List of programs to build 4# List of programs to build
5hostprogs-y := hpet_example 5hostprogs-$(CONFIG_X86) := hpet_example
6 6
7# Tell kbuild to always build the programs 7# Tell kbuild to always build the programs
8always := $(hostprogs-y) 8always := $(hostprogs-y)
diff --git a/Documentation/timers/hpet_example.c b/Documentation/timers/hpet_example.c
index f9ce2d9fdfd5..4bfafb7bc4c5 100644
--- a/Documentation/timers/hpet_example.c
+++ b/Documentation/timers/hpet_example.c
@@ -10,7 +10,6 @@
10#include <sys/types.h> 10#include <sys/types.h>
11#include <sys/wait.h> 11#include <sys/wait.h>
12#include <signal.h> 12#include <signal.h>
13#include <fcntl.h>
14#include <errno.h> 13#include <errno.h>
15#include <sys/time.h> 14#include <sys/time.h>
16#include <linux/hpet.h> 15#include <linux/hpet.h>
@@ -24,7 +23,6 @@ extern void hpet_read(int, const char **);
24 23
25#include <sys/poll.h> 24#include <sys/poll.h>
26#include <sys/ioctl.h> 25#include <sys/ioctl.h>
27#include <signal.h>
28 26
29struct hpet_command { 27struct hpet_command {
30 char *command; 28 char *command;
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index 02ac6ed38b2d..09bd8e902989 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -90,7 +90,8 @@ In order to facilitate early boot debugging, use boot option:
90 90
91 trace_event=[event-list] 91 trace_event=[event-list]
92 92
93The format of this boot option is the same as described in section 2.1. 93event-list is a comma separated list of events. See section 2.1 for event
94format.
94 95
953. Defining an event-enabled tracepoint 963. Defining an event-enabled tracepoint
96======================================= 97=======================================
@@ -238,7 +239,7 @@ subsystem's filter file.
238 239
239For convenience, filters for every event in a subsystem can be set or 240For convenience, filters for every event in a subsystem can be set or
240cleared as a group by writing a filter expression into the filter file 241cleared as a group by writing a filter expression into the filter file
241at the root of the subsytem. Note however, that if a filter for any 242at the root of the subsystem. Note however, that if a filter for any
242event within the subsystem lacks a field specified in the subsystem 243event within the subsystem lacks a field specified in the subsystem
243filter, or if the filter can't be applied for any other reason, the 244filter, or if the filter can't be applied for any other reason, the
244filter for that event will retain its previous setting. This can 245filter for that event will retain its previous setting. This can
@@ -250,7 +251,7 @@ fields can be guaranteed to propagate successfully to all events.
250Here are a few subsystem filter examples that also illustrate the 251Here are a few subsystem filter examples that also illustrate the
251above points: 252above points:
252 253
253Clear the filters on all events in the sched subsytem: 254Clear the filters on all events in the sched subsystem:
254 255
255# cd /sys/kernel/debug/tracing/events/sched 256# cd /sys/kernel/debug/tracing/events/sched
256# echo 0 > filter 257# echo 0 > filter
@@ -260,7 +261,7 @@ none
260none 261none
261 262
262Set a filter using only common fields for all events in the sched 263Set a filter using only common fields for all events in the sched
263subsytem (all events end up with the same filter): 264subsystem (all events end up with the same filter):
264 265
265# cd /sys/kernel/debug/tracing/events/sched 266# cd /sys/kernel/debug/tracing/events/sched
266# echo common_pid == 0 > filter 267# echo common_pid == 0 > filter
@@ -270,7 +271,7 @@ common_pid == 0
270common_pid == 0 271common_pid == 0
271 272
272Attempt to set a filter using a non-common field for all events in the 273Attempt to set a filter using a non-common field for all events in the
273sched subsytem (all events but those that have a prev_pid field retain 274sched subsystem (all events but those that have a prev_pid field retain
274their old filters): 275their old filters):
275 276
276# cd /sys/kernel/debug/tracing/events/sched 277# cd /sys/kernel/debug/tracing/events/sched
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 03485bfbd797..557c1edeccaf 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -155,6 +155,9 @@ of ftrace. Here is a list of some of the key files:
155 to be traced. Echoing names of functions into this file 155 to be traced. Echoing names of functions into this file
156 will limit the trace to only those functions. 156 will limit the trace to only those functions.
157 157
158 This interface also allows for commands to be used. See the
159 "Filter commands" section for more details.
160
158 set_ftrace_notrace: 161 set_ftrace_notrace:
159 162
160 This has an effect opposite to that of 163 This has an effect opposite to that of
@@ -1337,12 +1340,14 @@ ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one
1337can either use the sysctl function or set it via the proc system 1340can either use the sysctl function or set it via the proc system
1338interface. 1341interface.
1339 1342
1340 sysctl kernel.ftrace_dump_on_oops=1 1343 sysctl kernel.ftrace_dump_on_oops=n
1341 1344
1342or 1345or
1343 1346
1344 echo 1 > /proc/sys/kernel/ftrace_dump_on_oops 1347 echo n > /proc/sys/kernel/ftrace_dump_on_oops
1345 1348
1349If n = 1, ftrace will dump buffers of all CPUs, if n = 2 ftrace will
1350only dump the buffer of the CPU that triggered the oops.
1346 1351
1347Here's an example of such a dump after a null pointer 1352Here's an example of such a dump after a null pointer
1348dereference in a kernel module: 1353dereference in a kernel module:
@@ -1822,6 +1827,47 @@ this special filter via:
1822 echo > set_graph_function 1827 echo > set_graph_function
1823 1828
1824 1829
1830Filter commands
1831---------------
1832
1833A few commands are supported by the set_ftrace_filter interface.
1834Trace commands have the following format:
1835
1836<function>:<command>:<parameter>
1837
1838The following commands are supported:
1839
1840- mod
1841 This command enables function filtering per module. The
1842 parameter defines the module. For example, if only the write*
1843 functions in the ext3 module are desired, run:
1844
1845 echo 'write*:mod:ext3' > set_ftrace_filter
1846
1847 This command interacts with the filter in the same way as
1848 filtering based on function names. Thus, adding more functions
1849 in a different module is accomplished by appending (>>) to the
1850 filter file. Remove specific module functions by prepending
1851 '!':
1852
1853 echo '!writeback*:mod:ext3' >> set_ftrace_filter
1854
1855- traceon/traceoff
1856 These commands turn tracing on and off when the specified
1857 functions are hit. The parameter determines how many times the
1858 tracing system is turned on and off. If unspecified, there is
1859 no limit. For example, to disable tracing when a schedule bug
1860 is hit the first 5 times, run:
1861
1862 echo '__schedule_bug:traceoff:5' > set_ftrace_filter
1863
1864 These commands are cumulative whether or not they are appended
1865 to set_ftrace_filter. To remove a command, prepend it by '!'
1866 and drop the parameter:
1867
1868 echo '!__schedule_bug:traceoff' > set_ftrace_filter
1869
1870
1825trace_pipe 1871trace_pipe
1826---------- 1872----------
1827 1873
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index a9100b28eb84..ec94748ae65b 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -40,7 +40,9 @@ Synopsis of kprobe_events
40 $stack : Fetch stack address. 40 $stack : Fetch stack address.
41 $retval : Fetch return value.(*) 41 $retval : Fetch return value.(*)
42 +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) 42 +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
43 NAME=FETCHARG: Set NAME as the argument name of FETCHARG. 43 NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
44 FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
45 (u8/u16/u32/u64/s8/s16/s32/s64) are supported.
44 46
45 (*) only for return probe. 47 (*) only for return probe.
46 (**) this is useful for fetching a field of data structures. 48 (**) this is useful for fetching a field of data structures.
diff --git a/Documentation/usb/WUSB-Design-overview.txt b/Documentation/usb/WUSB-Design-overview.txt
index c480e9c32dbd..4c5e37939344 100644
--- a/Documentation/usb/WUSB-Design-overview.txt
+++ b/Documentation/usb/WUSB-Design-overview.txt
@@ -381,7 +381,7 @@ descriptor that gives us the status of the transfer, its identification
381we issue another URB to read into the destination buffer the chunk of 381we issue another URB to read into the destination buffer the chunk of
382data coming out of the remote endpoint. Done, wait for the next guy. The 382data coming out of the remote endpoint. Done, wait for the next guy. The
383callbacks for the URBs issued from here are the ones that will declare 383callbacks for the URBs issued from here are the ones that will declare
384the xfer complete at some point and call it's callback. 384the xfer complete at some point and call its callback.
385 385
386Seems simple, but the implementation is not trivial. 386Seems simple, but the implementation is not trivial.
387 387
diff --git a/Documentation/usb/bulk-streams.txt b/Documentation/usb/bulk-streams.txt
new file mode 100644
index 000000000000..ffc02021863e
--- /dev/null
+++ b/Documentation/usb/bulk-streams.txt
@@ -0,0 +1,78 @@
1Background
2==========
3
4Bulk endpoint streams were added in the USB 3.0 specification. Streams allow a
5device driver to overload a bulk endpoint so that multiple transfers can be
6queued at once.
7
8Streams are defined in sections 4.4.6.4 and 8.12.1.4 of the Universal Serial Bus
93.0 specification at http://www.usb.org/developers/docs/ The USB Attached SCSI
10Protocol, which uses streams to queue multiple SCSI commands, can be found on
11the T10 website (http://t10.org/).
12
13
14Device-side implications
15========================
16
17Once a buffer has been queued to a stream ring, the device is notified (through
18an out-of-band mechanism on another endpoint) that data is ready for that stream
19ID. The device then tells the host which "stream" it wants to start. The host
20can also initiate a transfer on a stream without the device asking, but the
21device can refuse that transfer. Devices can switch between streams at any
22time.
23
24
25Driver implications
26===================
27
28int usb_alloc_streams(struct usb_interface *interface,
29 struct usb_host_endpoint **eps, unsigned int num_eps,
30 unsigned int num_streams, gfp_t mem_flags);
31
32Device drivers will call this API to request that the host controller driver
33allocate memory so the driver can use up to num_streams stream IDs. They must
34pass an array of usb_host_endpoints that need to be setup with similar stream
35IDs. This is to ensure that a UASP driver will be able to use the same stream
36ID for the bulk IN and OUT endpoints used in a Bi-directional command sequence.
37
38The return value is an error condition (if one of the endpoints doesn't support
39streams, or the xHCI driver ran out of memory), or the number of streams the
40host controller allocated for this endpoint. The xHCI host controller hardware
41declares how many stream IDs it can support, and each bulk endpoint on a
42SuperSpeed device will say how many stream IDs it can handle. Therefore,
43drivers should be able to deal with being allocated less stream IDs than they
44requested.
45
46Do NOT call this function if you have URBs enqueued for any of the endpoints
47passed in as arguments. Do not call this function to request less than two
48streams.
49
50Drivers will only be allowed to call this API once for the same endpoint
51without calling usb_free_streams(). This is a simplification for the xHCI host
52controller driver, and may change in the future.
53
54
55Picking new Stream IDs to use
56============================
57
58Stream ID 0 is reserved, and should not be used to communicate with devices. If
59usb_alloc_streams() returns with a value of N, you may use streams 1 though N.
60To queue an URB for a specific stream, set the urb->stream_id value. If the
61endpoint does not support streams, an error will be returned.
62
63Note that new API to choose the next stream ID will have to be added if the xHCI
64driver supports secondary stream IDs.
65
66
67Clean up
68========
69
70If a driver wishes to stop using streams to communicate with the device, it
71should call
72
73void usb_free_streams(struct usb_interface *interface,
74 struct usb_host_endpoint **eps, unsigned int num_eps,
75 gfp_t mem_flags);
76
77All stream IDs will be deallocated when the driver releases the interface, to
78ensure that drivers that don't support streams will be able to use the endpoint.
diff --git a/Documentation/usb/dma.txt b/Documentation/usb/dma.txt
index cfdcd16e3abf..84ef865237db 100644
--- a/Documentation/usb/dma.txt
+++ b/Documentation/usb/dma.txt
@@ -16,11 +16,11 @@ OR: they can now be DMA-aware.
16 manage dma mappings for existing dma-ready buffers (see below). 16 manage dma mappings for existing dma-ready buffers (see below).
17 17
18- URBs have an additional "transfer_dma" field, as well as a transfer_flags 18- URBs have an additional "transfer_dma" field, as well as a transfer_flags
19 bit saying if it's valid. (Control requests also have "setup_dma" and a 19 bit saying if it's valid. (Control requests also have "setup_dma", but
20 corresponding transfer_flags bit.) 20 drivers must not use it.)
21 21
22- "usbcore" will map those DMA addresses, if a DMA-aware driver didn't do 22- "usbcore" will map this DMA address, if a DMA-aware driver didn't do
23 it first and set URB_NO_TRANSFER_DMA_MAP or URB_NO_SETUP_DMA_MAP. HCDs 23 it first and set URB_NO_TRANSFER_DMA_MAP. HCDs
24 don't manage dma mappings for URBs. 24 don't manage dma mappings for URBs.
25 25
26- There's a new "generic DMA API", parts of which are usable by USB device 26- There's a new "generic DMA API", parts of which are usable by USB device
@@ -43,22 +43,16 @@ and effects like cache-trashing can impose subtle penalties.
43 kind of addresses to store in urb->transfer_buffer and urb->transfer_dma. 43 kind of addresses to store in urb->transfer_buffer and urb->transfer_dma.
44 You'd also set URB_NO_TRANSFER_DMA_MAP in urb->transfer_flags: 44 You'd also set URB_NO_TRANSFER_DMA_MAP in urb->transfer_flags:
45 45
46 void *usb_buffer_alloc (struct usb_device *dev, size_t size, 46 void *usb_alloc_coherent (struct usb_device *dev, size_t size,
47 int mem_flags, dma_addr_t *dma); 47 int mem_flags, dma_addr_t *dma);
48 48
49 void usb_buffer_free (struct usb_device *dev, size_t size, 49 void usb_free_coherent (struct usb_device *dev, size_t size,
50 void *addr, dma_addr_t dma); 50 void *addr, dma_addr_t dma);
51 51
52 Most drivers should *NOT* be using these primitives; they don't need 52 Most drivers should *NOT* be using these primitives; they don't need
53 to use this type of memory ("dma-coherent"), and memory returned from 53 to use this type of memory ("dma-coherent"), and memory returned from
54 kmalloc() will work just fine. 54 kmalloc() will work just fine.
55 55
56 For control transfers you can use the buffer primitives or not for each
57 of the transfer buffer and setup buffer independently. Set the flag bits
58 URB_NO_TRANSFER_DMA_MAP and URB_NO_SETUP_DMA_MAP to indicate which
59 buffers you have prepared. For non-control transfers URB_NO_SETUP_DMA_MAP
60 is ignored.
61
62 The memory buffer returned is "dma-coherent"; sometimes you might need to 56 The memory buffer returned is "dma-coherent"; sometimes you might need to
63 force a consistent memory access ordering by using memory barriers. It's 57 force a consistent memory access ordering by using memory barriers. It's
64 not using a streaming DMA mapping, so it's good for small transfers on 58 not using a streaming DMA mapping, so it's good for small transfers on
@@ -130,8 +124,8 @@ of Documentation/PCI/PCI-DMA-mapping.txt, titled "What memory is DMA-able?")
130 void usb_buffer_unmap (struct urb *urb); 124 void usb_buffer_unmap (struct urb *urb);
131 125
132 The calls manage urb->transfer_dma for you, and set URB_NO_TRANSFER_DMA_MAP 126 The calls manage urb->transfer_dma for you, and set URB_NO_TRANSFER_DMA_MAP
133 so that usbcore won't map or unmap the buffer. The same goes for 127 so that usbcore won't map or unmap the buffer. They cannot be used for
134 urb->setup_dma and URB_NO_SETUP_DMA_MAP for control requests. 128 setup_packet buffers in control requests.
135 129
136Note that several of those interfaces are currently commented out, since 130Note that several of those interfaces are currently commented out, since
137they don't have current users. See the source code. Other than the dmasync 131they don't have current users. See the source code. Other than the dmasync
diff --git a/Documentation/usb/gadget_hid.txt b/Documentation/usb/gadget_hid.txt
new file mode 100644
index 000000000000..f4a51f567427
--- /dev/null
+++ b/Documentation/usb/gadget_hid.txt
@@ -0,0 +1,445 @@
1
2 Linux USB HID gadget driver
3
4Introduction
5
6 The HID Gadget driver provides emulation of USB Human Interface
7 Devices (HID). The basic HID handling is done in the kernel,
8 and HID reports can be sent/received through I/O on the
9 /dev/hidgX character devices.
10
11 For more details about HID, see the developer page on
12 http://www.usb.org/developers/hidpage/
13
14Configuration
15
16 g_hid is a platform driver, so to use it you need to add
17 struct platform_device(s) to your platform code defining the
18 HID function descriptors you want to use - E.G. something
19 like:
20
21#include <linux/platform_device.h>
22#include <linux/usb/g_hid.h>
23
24/* hid descriptor for a keyboard */
25static struct hidg_func_descriptor my_hid_data = {
26 .subclass = 0, /* No subclass */
27 .protocol = 1, /* Keyboard */
28 .report_length = 8,
29 .report_desc_length = 63,
30 .report_desc = {
31 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */
32 0x09, 0x06, /* USAGE (Keyboard) */
33 0xa1, 0x01, /* COLLECTION (Application) */
34 0x05, 0x07, /* USAGE_PAGE (Keyboard) */
35 0x19, 0xe0, /* USAGE_MINIMUM (Keyboard LeftControl) */
36 0x29, 0xe7, /* USAGE_MAXIMUM (Keyboard Right GUI) */
37 0x15, 0x00, /* LOGICAL_MINIMUM (0) */
38 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */
39 0x75, 0x01, /* REPORT_SIZE (1) */
40 0x95, 0x08, /* REPORT_COUNT (8) */
41 0x81, 0x02, /* INPUT (Data,Var,Abs) */
42 0x95, 0x01, /* REPORT_COUNT (1) */
43 0x75, 0x08, /* REPORT_SIZE (8) */
44 0x81, 0x03, /* INPUT (Cnst,Var,Abs) */
45 0x95, 0x05, /* REPORT_COUNT (5) */
46 0x75, 0x01, /* REPORT_SIZE (1) */
47 0x05, 0x08, /* USAGE_PAGE (LEDs) */
48 0x19, 0x01, /* USAGE_MINIMUM (Num Lock) */
49 0x29, 0x05, /* USAGE_MAXIMUM (Kana) */
50 0x91, 0x02, /* OUTPUT (Data,Var,Abs) */
51 0x95, 0x01, /* REPORT_COUNT (1) */
52 0x75, 0x03, /* REPORT_SIZE (3) */
53 0x91, 0x03, /* OUTPUT (Cnst,Var,Abs) */
54 0x95, 0x06, /* REPORT_COUNT (6) */
55 0x75, 0x08, /* REPORT_SIZE (8) */
56 0x15, 0x00, /* LOGICAL_MINIMUM (0) */
57 0x25, 0x65, /* LOGICAL_MAXIMUM (101) */
58 0x05, 0x07, /* USAGE_PAGE (Keyboard) */
59 0x19, 0x00, /* USAGE_MINIMUM (Reserved) */
60 0x29, 0x65, /* USAGE_MAXIMUM (Keyboard Application) */
61 0x81, 0x00, /* INPUT (Data,Ary,Abs) */
62 0xc0 /* END_COLLECTION */
63 }
64};
65
66static struct platform_device my_hid = {
67 .name = "hidg",
68 .id = 0,
69 .num_resources = 0,
70 .resource = 0,
71 .dev.platform_data = &my_hid_data,
72};
73
74 You can add as many HID functions as you want, only limited by
75 the amount of interrupt endpoints your gadget driver supports.
76
77Send and receive HID reports
78
79 HID reports can be sent/received using read/write on the
80 /dev/hidgX character devices. See below for an example program
81 to do this.
82
83 hid_gadget_test is a small interactive program to test the HID
84 gadget driver. To use, point it at a hidg device and set the
85 device type (keyboard / mouse / joystick) - E.G.:
86
87 # hid_gadget_test /dev/hidg0 keyboard
88
89 You are now in the prompt of hid_gadget_test. You can type any
90 combination of options and values. Available options and
91 values are listed at program start. In keyboard mode you can
92 send up to six values.
93
94 For example type: g i s t r --left-shift
95
96 Hit return and the corresponding report will be sent by the
97 HID gadget.
98
99 Another interesting example is the caps lock test. Type
100 -–caps-lock and hit return. A report is then sent by the
101 gadget and you should receive the host answer, corresponding
102 to the caps lock LED status.
103
104 --caps-lock
105 recv report:2
106
107 With this command:
108
109 # hid_gadget_test /dev/hidg1 mouse
110
111 You can test the mouse emulation. Values are two signed numbers.
112
113
114Sample code
115
116/* hid_gadget_test */
117
118#include <pthread.h>
119#include <string.h>
120#include <stdio.h>
121#include <ctype.h>
122#include <fcntl.h>
123#include <errno.h>
124#include <stdio.h>
125#include <stdlib.h>
126#include <unistd.h>
127
128#define BUF_LEN 512
129
130struct options {
131 const char *opt;
132 unsigned char val;
133};
134
135static struct options kmod[] = {
136 {.opt = "--left-ctrl", .val = 0x01},
137 {.opt = "--right-ctrl", .val = 0x10},
138 {.opt = "--left-shift", .val = 0x02},
139 {.opt = "--right-shift", .val = 0x20},
140 {.opt = "--left-alt", .val = 0x04},
141 {.opt = "--right-alt", .val = 0x40},
142 {.opt = "--left-meta", .val = 0x08},
143 {.opt = "--right-meta", .val = 0x80},
144 {.opt = NULL}
145};
146
147static struct options kval[] = {
148 {.opt = "--return", .val = 0x28},
149 {.opt = "--esc", .val = 0x29},
150 {.opt = "--bckspc", .val = 0x2a},
151 {.opt = "--tab", .val = 0x2b},
152 {.opt = "--spacebar", .val = 0x2c},
153 {.opt = "--caps-lock", .val = 0x39},
154 {.opt = "--f1", .val = 0x3a},
155 {.opt = "--f2", .val = 0x3b},
156 {.opt = "--f3", .val = 0x3c},
157 {.opt = "--f4", .val = 0x3d},
158 {.opt = "--f5", .val = 0x3e},
159 {.opt = "--f6", .val = 0x3f},
160 {.opt = "--f7", .val = 0x40},
161 {.opt = "--f8", .val = 0x41},
162 {.opt = "--f9", .val = 0x42},
163 {.opt = "--f10", .val = 0x43},
164 {.opt = "--f11", .val = 0x44},
165 {.opt = "--f12", .val = 0x45},
166 {.opt = "--insert", .val = 0x49},
167 {.opt = "--home", .val = 0x4a},
168 {.opt = "--pageup", .val = 0x4b},
169 {.opt = "--del", .val = 0x4c},
170 {.opt = "--end", .val = 0x4d},
171 {.opt = "--pagedown", .val = 0x4e},
172 {.opt = "--right", .val = 0x4f},
173 {.opt = "--left", .val = 0x50},
174 {.opt = "--down", .val = 0x51},
175 {.opt = "--kp-enter", .val = 0x58},
176 {.opt = "--up", .val = 0x52},
177 {.opt = "--num-lock", .val = 0x53},
178 {.opt = NULL}
179};
180
181int keyboard_fill_report(char report[8], char buf[BUF_LEN], int *hold)
182{
183 char *tok = strtok(buf, " ");
184 int key = 0;
185 int i = 0;
186
187 for (; tok != NULL; tok = strtok(NULL, " ")) {
188
189 if (strcmp(tok, "--quit") == 0)
190 return -1;
191
192 if (strcmp(tok, "--hold") == 0) {
193 *hold = 1;
194 continue;
195 }
196
197 if (key < 6) {
198 for (i = 0; kval[i].opt != NULL; i++)
199 if (strcmp(tok, kval[i].opt) == 0) {
200 report[2 + key++] = kval[i].val;
201 break;
202 }
203 if (kval[i].opt != NULL)
204 continue;
205 }
206
207 if (key < 6)
208 if (islower(tok[0])) {
209 report[2 + key++] = (tok[0] - ('a' - 0x04));
210 continue;
211 }
212
213 for (i = 0; kmod[i].opt != NULL; i++)
214 if (strcmp(tok, kmod[i].opt) == 0) {
215 report[0] = report[0] | kmod[i].val;
216 break;
217 }
218 if (kmod[i].opt != NULL)
219 continue;
220
221 if (key < 6)
222 fprintf(stderr, "unknown option: %s\n", tok);
223 }
224 return 8;
225}
226
227static struct options mmod[] = {
228 {.opt = "--b1", .val = 0x01},
229 {.opt = "--b2", .val = 0x02},
230 {.opt = "--b3", .val = 0x04},
231 {.opt = NULL}
232};
233
234int mouse_fill_report(char report[8], char buf[BUF_LEN], int *hold)
235{
236 char *tok = strtok(buf, " ");
237 int mvt = 0;
238 int i = 0;
239 for (; tok != NULL; tok = strtok(NULL, " ")) {
240
241 if (strcmp(tok, "--quit") == 0)
242 return -1;
243
244 if (strcmp(tok, "--hold") == 0) {
245 *hold = 1;
246 continue;
247 }
248
249 for (i = 0; mmod[i].opt != NULL; i++)
250 if (strcmp(tok, mmod[i].opt) == 0) {
251 report[0] = report[0] | mmod[i].val;
252 break;
253 }
254 if (mmod[i].opt != NULL)
255 continue;
256
257 if (!(tok[0] == '-' && tok[1] == '-') && mvt < 2) {
258 errno = 0;
259 report[1 + mvt++] = (char)strtol(tok, NULL, 0);
260 if (errno != 0) {
261 fprintf(stderr, "Bad value:'%s'\n", tok);
262 report[1 + mvt--] = 0;
263 }
264 continue;
265 }
266
267 fprintf(stderr, "unknown option: %s\n", tok);
268 }
269 return 3;
270}
271
272static struct options jmod[] = {
273 {.opt = "--b1", .val = 0x10},
274 {.opt = "--b2", .val = 0x20},
275 {.opt = "--b3", .val = 0x40},
276 {.opt = "--b4", .val = 0x80},
277 {.opt = "--hat1", .val = 0x00},
278 {.opt = "--hat2", .val = 0x01},
279 {.opt = "--hat3", .val = 0x02},
280 {.opt = "--hat4", .val = 0x03},
281 {.opt = "--hatneutral", .val = 0x04},
282 {.opt = NULL}
283};
284
285int joystick_fill_report(char report[8], char buf[BUF_LEN], int *hold)
286{
287 char *tok = strtok(buf, " ");
288 int mvt = 0;
289 int i = 0;
290
291 *hold = 1;
292
293 /* set default hat position: neutral */
294 report[3] = 0x04;
295
296 for (; tok != NULL; tok = strtok(NULL, " ")) {
297
298 if (strcmp(tok, "--quit") == 0)
299 return -1;
300
301 for (i = 0; jmod[i].opt != NULL; i++)
302 if (strcmp(tok, jmod[i].opt) == 0) {
303 report[3] = (report[3] & 0xF0) | jmod[i].val;
304 break;
305 }
306 if (jmod[i].opt != NULL)
307 continue;
308
309 if (!(tok[0] == '-' && tok[1] == '-') && mvt < 3) {
310 errno = 0;
311 report[mvt++] = (char)strtol(tok, NULL, 0);
312 if (errno != 0) {
313 fprintf(stderr, "Bad value:'%s'\n", tok);
314 report[mvt--] = 0;
315 }
316 continue;
317 }
318
319 fprintf(stderr, "unknown option: %s\n", tok);
320 }
321 return 4;
322}
323
324void print_options(char c)
325{
326 int i = 0;
327
328 if (c == 'k') {
329 printf(" keyboard options:\n"
330 " --hold\n");
331 for (i = 0; kmod[i].opt != NULL; i++)
332 printf("\t\t%s\n", kmod[i].opt);
333 printf("\n keyboard values:\n"
334 " [a-z] or\n");
335 for (i = 0; kval[i].opt != NULL; i++)
336 printf("\t\t%-8s%s", kval[i].opt, i % 2 ? "\n" : "");
337 printf("\n");
338 } else if (c == 'm') {
339 printf(" mouse options:\n"
340 " --hold\n");
341 for (i = 0; mmod[i].opt != NULL; i++)
342 printf("\t\t%s\n", mmod[i].opt);
343 printf("\n mouse values:\n"
344 " Two signed numbers\n"
345 "--quit to close\n");
346 } else {
347 printf(" joystick options:\n");
348 for (i = 0; jmod[i].opt != NULL; i++)
349 printf("\t\t%s\n", jmod[i].opt);
350 printf("\n joystick values:\n"
351 " three signed numbers\n"
352 "--quit to close\n");
353 }
354}
355
356int main(int argc, const char *argv[])
357{
358 const char *filename = NULL;
359 int fd = 0;
360 char buf[BUF_LEN];
361 int cmd_len;
362 char report[8];
363 int to_send = 8;
364 int hold = 0;
365 fd_set rfds;
366 int retval, i;
367
368 if (argc < 3) {
369 fprintf(stderr, "Usage: %s devname mouse|keyboard|joystick\n",
370 argv[0]);
371 return 1;
372 }
373
374 if (argv[2][0] != 'k' && argv[2][0] != 'm' && argv[2][0] != 'j')
375 return 2;
376
377 filename = argv[1];
378
379 if ((fd = open(filename, O_RDWR, 0666)) == -1) {
380 perror(filename);
381 return 3;
382 }
383
384 print_options(argv[2][0]);
385
386 while (42) {
387
388 FD_ZERO(&rfds);
389 FD_SET(STDIN_FILENO, &rfds);
390 FD_SET(fd, &rfds);
391
392 retval = select(fd + 1, &rfds, NULL, NULL, NULL);
393 if (retval == -1 && errno == EINTR)
394 continue;
395 if (retval < 0) {
396 perror("select()");
397 return 4;
398 }
399
400 if (FD_ISSET(fd, &rfds)) {
401 cmd_len = read(fd, buf, BUF_LEN - 1);
402 printf("recv report:");
403 for (i = 0; i < cmd_len; i++)
404 printf(" %02x", buf[i]);
405 printf("\n");
406 }
407
408 if (FD_ISSET(STDIN_FILENO, &rfds)) {
409 memset(report, 0x0, sizeof(report));
410 cmd_len = read(STDIN_FILENO, buf, BUF_LEN - 1);
411
412 if (cmd_len == 0)
413 break;
414
415 buf[cmd_len - 1] = '\0';
416 hold = 0;
417
418 memset(report, 0x0, sizeof(report));
419 if (argv[2][0] == 'k')
420 to_send = keyboard_fill_report(report, buf, &hold);
421 else if (argv[2][0] == 'm')
422 to_send = mouse_fill_report(report, buf, &hold);
423 else
424 to_send = joystick_fill_report(report, buf, &hold);
425
426 if (to_send == -1)
427 break;
428
429 if (write(fd, report, to_send) != to_send) {
430 perror(filename);
431 return 5;
432 }
433 if (!hold) {
434 memset(report, 0x0, sizeof(report));
435 if (write(fd, report, to_send) != to_send) {
436 perror(filename);
437 return 6;
438 }
439 }
440 }
441 }
442
443 close(fd);
444 return 0;
445}
diff --git a/Documentation/usb/power-management.txt b/Documentation/usb/power-management.txt
index 2790ad48cfc2..b29d8e56cf28 100644
--- a/Documentation/usb/power-management.txt
+++ b/Documentation/usb/power-management.txt
@@ -107,7 +107,9 @@ allowed to issue dynamic suspends.
107The user interface for controlling dynamic PM is located in the power/ 107The user interface for controlling dynamic PM is located in the power/
108subdirectory of each USB device's sysfs directory, that is, in 108subdirectory of each USB device's sysfs directory, that is, in
109/sys/bus/usb/devices/.../power/ where "..." is the device's ID. The 109/sys/bus/usb/devices/.../power/ where "..." is the device's ID. The
110relevant attribute files are: wakeup, level, and autosuspend. 110relevant attribute files are: wakeup, control, and autosuspend.
111(There may also be a file named "level"; this file was deprecated
112as of the 2.6.35 kernel and replaced by the "control" file.)
111 113
112 power/wakeup 114 power/wakeup
113 115
@@ -120,7 +122,7 @@ relevant attribute files are: wakeup, level, and autosuspend.
120 while the device is suspended, the change won't take 122 while the device is suspended, the change won't take
121 effect until the following suspend.) 123 effect until the following suspend.)
122 124
123 power/level 125 power/control
124 126
125 This file contains one of two words: "on" or "auto". 127 This file contains one of two words: "on" or "auto".
126 You can write those words to the file to change the 128 You can write those words to the file to change the
@@ -148,14 +150,15 @@ relevant attribute files are: wakeup, level, and autosuspend.
148 never to autosuspend. You can write a number to the 150 never to autosuspend. You can write a number to the
149 file to change the autosuspend idle-delay time. 151 file to change the autosuspend idle-delay time.
150 152
151Writing "-1" to power/autosuspend and writing "on" to power/level do 153Writing "-1" to power/autosuspend and writing "on" to power/control do
152essentially the same thing -- they both prevent the device from being 154essentially the same thing -- they both prevent the device from being
153autosuspended. Yes, this is a redundancy in the API. 155autosuspended. Yes, this is a redundancy in the API.
154 156
155(In 2.6.21 writing "0" to power/autosuspend would prevent the device 157(In 2.6.21 writing "0" to power/autosuspend would prevent the device
156from being autosuspended; the behavior was changed in 2.6.22. The 158from being autosuspended; the behavior was changed in 2.6.22. The
157power/autosuspend attribute did not exist prior to 2.6.21, and the 159power/autosuspend attribute did not exist prior to 2.6.21, and the
158power/level attribute did not exist prior to 2.6.22.) 160power/level attribute did not exist prior to 2.6.22. power/control
161was added in 2.6.34.)
159 162
160 163
161 Changing the default idle-delay time 164 Changing the default idle-delay time
@@ -212,7 +215,7 @@ among printers and scanners, but plenty of other types of device have
212the same deficiency. 215the same deficiency.
213 216
214For this reason, by default the kernel disables autosuspend (the 217For this reason, by default the kernel disables autosuspend (the
215power/level attribute is initialized to "on") for all devices other 218power/control attribute is initialized to "on") for all devices other
216than hubs. Hubs, at least, appear to be reasonably well-behaved in 219than hubs. Hubs, at least, appear to be reasonably well-behaved in
217this regard. 220this regard.
218 221
@@ -373,7 +376,7 @@ usb_autopm_put_interface() in its close or release routine. But other
373patterns are possible. 376patterns are possible.
374 377
375The autosuspend attempts mentioned above will often fail for one 378The autosuspend attempts mentioned above will often fail for one
376reason or another. For example, the power/level attribute might be 379reason or another. For example, the power/control attribute might be
377set to "on", or another interface in the same device might not be 380set to "on", or another interface in the same device might not be
378idle. This is perfectly normal. If the reason for failure was that 381idle. This is perfectly normal. If the reason for failure was that
379the device hasn't been idle for long enough, a timer is scheduled to 382the device hasn't been idle for long enough, a timer is scheduled to
@@ -394,12 +397,12 @@ Drivers can enable autosuspend for their devices by calling
394 397
395in their probe() routine, if they know that the device is capable of 398in their probe() routine, if they know that the device is capable of
396suspending and resuming correctly. This is exactly equivalent to 399suspending and resuming correctly. This is exactly equivalent to
397writing "auto" to the device's power/level attribute. Likewise, 400writing "auto" to the device's power/control attribute. Likewise,
398drivers can disable autosuspend by calling 401drivers can disable autosuspend by calling
399 402
400 usb_disable_autosuspend(struct usb_device *udev); 403 usb_disable_autosuspend(struct usb_device *udev);
401 404
402This is exactly the same as writing "on" to the power/level attribute. 405This is exactly the same as writing "on" to the power/control attribute.
403 406
404Sometimes a driver needs to make sure that remote wakeup is enabled 407Sometimes a driver needs to make sure that remote wakeup is enabled
405during autosuspend. For example, there's not much point 408during autosuspend. For example, there's not much point
diff --git a/Documentation/usb/usb-serial.txt b/Documentation/usb/usb-serial.txt
index ff2c1ff57ba2..f4d214510259 100644
--- a/Documentation/usb/usb-serial.txt
+++ b/Documentation/usb/usb-serial.txt
@@ -194,6 +194,10 @@ FTDI Single Port Serial Driver
194 194
195 This is a single port DB-25 serial adapter. 195 This is a single port DB-25 serial adapter.
196 196
197 Devices supported include:
198 -TripNav TN-200 USB GPS
199 -Navis Engineering Bureau CH-4711 USB GPS
200
197 For any questions or problems with this driver, please contact Bill Ryder. 201 For any questions or problems with this driver, please contact Bill Ryder.
198 202
199 203
@@ -216,7 +220,7 @@ Cypress M8 CY4601 Family Serial Driver
216 220
217 Devices supported: 221 Devices supported:
218 222
219 -DeLorme's USB Earthmate (SiRF Star II lp arch) 223 -DeLorme's USB Earthmate GPS (SiRF Star II lp arch)
220 -Cypress HID->COM RS232 adapter 224 -Cypress HID->COM RS232 adapter
221 225
222 Note: Cypress Semiconductor claims no affiliation with the 226 Note: Cypress Semiconductor claims no affiliation with the
@@ -392,9 +396,10 @@ REINER SCT cyberJack pinpad/e-com USB chipcard reader
392Prolific PL2303 Driver 396Prolific PL2303 Driver
393 397
394 This driver supports any device that has the PL2303 chip from Prolific 398 This driver supports any device that has the PL2303 chip from Prolific
395 in it. This includes a number of single port USB to serial 399 in it. This includes a number of single port USB to serial converters,
396 converters and USB GPS devices. Devices from Aten (the UC-232) and 400 more than 70% of USB GPS devices (in 2010), and some USB UPSes. Devices
397 IO-Data work with this driver, as does the DCU-11 mobile-phone cable. 401 from Aten (the UC-232) and IO-Data work with this driver, as does
402 the DCU-11 mobile-phone cable.
398 403
399 For any questions or problems with this driver, please contact Greg 404 For any questions or problems with this driver, please contact Greg
400 Kroah-Hartman at greg@kroah.com 405 Kroah-Hartman at greg@kroah.com
@@ -435,6 +440,22 @@ Winchiphead CH341 Driver
435 For any questions or problems with this driver, please contact 440 For any questions or problems with this driver, please contact
436 frank@kingswood-consulting.co.uk. 441 frank@kingswood-consulting.co.uk.
437 442
443Moschip MCS7720, MCS7715 driver
444
445 These chips are present in devices sold by various manufacturers, such as Syba
446 and Cables Unlimited. There may be others. The 7720 provides two serial
447 ports, and the 7715 provides one serial and one standard PC parallel port.
448 Support for the 7715's parallel port is enabled by a separate option, which
449 will not appear unless parallel port support is first enabled at the top-level
450 of the Device Drivers config menu. Currently only compatibility mode is
451 supported on the parallel port (no ECP/EPP).
452
453 TODO:
454 - Implement ECP/EPP modes for the parallel port.
455 - Baud rates higher than 115200 are currently broken.
456 - Devices with a single serial port based on the Moschip MCS7703 may work
457 with this driver with a simple addition to the usb_device_id table. I
458 don't have one of these devices, so I can't say for sure.
438 459
439Generic Serial driver 460Generic Serial driver
440 461
diff --git a/Documentation/video4linux/CARDLIST.bttv b/Documentation/video4linux/CARDLIST.bttv
index f11c583295e9..4739d5684305 100644
--- a/Documentation/video4linux/CARDLIST.bttv
+++ b/Documentation/video4linux/CARDLIST.bttv
@@ -100,7 +100,7 @@
100 99 -> AD-TVK503 100 99 -> AD-TVK503
101100 -> Hercules Smart TV Stereo 101100 -> Hercules Smart TV Stereo
102101 -> Pace TV & Radio Card 102101 -> Pace TV & Radio Card
103102 -> IVC-200 [0000:a155,0001:a155,0002:a155,0003:a155,0100:a155,0101:a155,0102:a155,0103:a155] 103102 -> IVC-200 [0000:a155,0001:a155,0002:a155,0003:a155,0100:a155,0101:a155,0102:a155,0103:a155,0800:a155,0801:a155,0802:a155,0803:a155]
104103 -> Grand X-Guard / Trust 814PCI [0304:0102] 104103 -> Grand X-Guard / Trust 814PCI [0304:0102]
105104 -> Nebula Electronics DigiTV [0071:0101] 105104 -> Nebula Electronics DigiTV [0071:0101]
106105 -> ProVideo PV143 [aa00:1430,aa00:1431,aa00:1432,aa00:1433,aa03:1433] 106105 -> ProVideo PV143 [aa00:1430,aa00:1431,aa00:1432,aa00:1433,aa03:1433]
diff --git a/Documentation/video4linux/CARDLIST.cx88 b/Documentation/video4linux/CARDLIST.cx88
index 7ec3c4e4b60f..f2510541373b 100644
--- a/Documentation/video4linux/CARDLIST.cx88
+++ b/Documentation/video4linux/CARDLIST.cx88
@@ -82,3 +82,4 @@
82 81 -> Leadtek WinFast DTV1800 Hybrid [107d:6654] 82 81 -> Leadtek WinFast DTV1800 Hybrid [107d:6654]
83 82 -> WinFast DTV2000 H rev. J [107d:6f2b] 83 82 -> WinFast DTV2000 H rev. J [107d:6f2b]
84 83 -> Prof 7301 DVB-S/S2 [b034:3034] 84 83 -> Prof 7301 DVB-S/S2 [b034:3034]
85 84 -> Samsung SMT 7020 DVB-S [18ac:dc00,18ac:dccd]
diff --git a/Documentation/video4linux/CARDLIST.em28xx b/Documentation/video4linux/CARDLIST.em28xx
index 0c166ff003a0..3a623aaeae5f 100644
--- a/Documentation/video4linux/CARDLIST.em28xx
+++ b/Documentation/video4linux/CARDLIST.em28xx
@@ -1,5 +1,5 @@
1 0 -> Unknown EM2800 video grabber (em2800) [eb1a:2800] 1 0 -> Unknown EM2800 video grabber (em2800) [eb1a:2800]
2 1 -> Unknown EM2750/28xx video grabber (em2820/em2840) [eb1a:2710,eb1a:2820,eb1a:2821,eb1a:2860,eb1a:2861,eb1a:2862,eb1a:2870,eb1a:2881,eb1a:2883,eb1a:2868] 2 1 -> Unknown EM2750/28xx video grabber (em2820/em2840) [eb1a:2710,eb1a:2820,eb1a:2821,eb1a:2860,eb1a:2861,eb1a:2862,eb1a:2863,eb1a:2870,eb1a:2881,eb1a:2883,eb1a:2868]
3 2 -> Terratec Cinergy 250 USB (em2820/em2840) [0ccd:0036] 3 2 -> Terratec Cinergy 250 USB (em2820/em2840) [0ccd:0036]
4 3 -> Pinnacle PCTV USB 2 (em2820/em2840) [2304:0208] 4 3 -> Pinnacle PCTV USB 2 (em2820/em2840) [2304:0208]
5 4 -> Hauppauge WinTV USB 2 (em2820/em2840) [2040:4200,2040:4201] 5 4 -> Hauppauge WinTV USB 2 (em2820/em2840) [2040:4200,2040:4201]
@@ -27,6 +27,7 @@
27 26 -> Hercules Smart TV USB 2.0 (em2820/em2840) 27 26 -> Hercules Smart TV USB 2.0 (em2820/em2840)
28 27 -> Pinnacle PCTV USB 2 (Philips FM1216ME) (em2820/em2840) 28 27 -> Pinnacle PCTV USB 2 (Philips FM1216ME) (em2820/em2840)
29 28 -> Leadtek Winfast USB II Deluxe (em2820/em2840) 29 28 -> Leadtek Winfast USB II Deluxe (em2820/em2840)
30 29 -> EM2860/TVP5150 Reference Design (em2860)
30 30 -> Videology 20K14XUSB USB2.0 (em2820/em2840) 31 30 -> Videology 20K14XUSB USB2.0 (em2820/em2840)
31 31 -> Usbgear VD204v9 (em2821) 32 31 -> Usbgear VD204v9 (em2821)
32 32 -> Supercomp USB 2.0 TV (em2821) 33 32 -> Supercomp USB 2.0 TV (em2821)
@@ -70,3 +71,4 @@
70 72 -> Gadmei UTV330+ (em2861) 71 72 -> Gadmei UTV330+ (em2861)
71 73 -> Reddo DVB-C USB TV Box (em2870) 72 73 -> Reddo DVB-C USB TV Box (em2870)
72 74 -> Actionmaster/LinXcel/Digitus VC211A (em2800) 73 74 -> Actionmaster/LinXcel/Digitus VC211A (em2800)
74 75 -> Dikom DK300 (em2882)
diff --git a/Documentation/video4linux/CARDLIST.saa7134 b/Documentation/video4linux/CARDLIST.saa7134
index b4a767060ed7..070f2576707e 100644
--- a/Documentation/video4linux/CARDLIST.saa7134
+++ b/Documentation/video4linux/CARDLIST.saa7134
@@ -175,3 +175,6 @@
175174 -> Asus Europa Hybrid OEM [1043:4847] 175174 -> Asus Europa Hybrid OEM [1043:4847]
176175 -> Leadtek Winfast DTV1000S [107d:6655] 176175 -> Leadtek Winfast DTV1000S [107d:6655]
177176 -> Beholder BeholdTV 505 RDS [0000:5051] 177176 -> Beholder BeholdTV 505 RDS [0000:5051]
178177 -> Hawell HW-404M7
179179 -> Beholder BeholdTV H7 [5ace:7190]
180180 -> Beholder BeholdTV A7 [5ace:7090]
diff --git a/Documentation/video4linux/extract_xc3028.pl b/Documentation/video4linux/extract_xc3028.pl
index 2cb816047fc1..47877deae6d7 100644
--- a/Documentation/video4linux/extract_xc3028.pl
+++ b/Documentation/video4linux/extract_xc3028.pl
@@ -5,12 +5,18 @@
5# 5#
6# In order to use, you need to: 6# In order to use, you need to:
7# 1) Download the windows driver with something like: 7# 1) Download the windows driver with something like:
8# Version 2.4
9# wget http://www.twinhan.com/files/AW/BDA T/20080303_V1.0.6.7.zip
10# or wget http://www.stefanringel.de/pub/20080303_V1.0.6.7.zip
11# Version 2.7
8# wget http://www.steventoth.net/linux/xc5000/HVR-12x0-14x0-17x0_1_25_25271_WHQL.zip 12# wget http://www.steventoth.net/linux/xc5000/HVR-12x0-14x0-17x0_1_25_25271_WHQL.zip
9# 2) Extract the file hcw85bda.sys from the zip into the current dir: 13# 2) Extract the files from the zip into the current dir:
14# unzip -j 20080303_V1.0.6.7.zip 20080303_v1.0.6.7/UDXTTM6000.sys
10# unzip -j HVR-12x0-14x0-17x0_1_25_25271_WHQL.zip Driver85/hcw85bda.sys 15# unzip -j HVR-12x0-14x0-17x0_1_25_25271_WHQL.zip Driver85/hcw85bda.sys
11# 3) run the script: 16# 3) run the script:
12# ./extract_xc3028.pl 17# ./extract_xc3028.pl
13# 4) copy the generated file: 18# 4) copy the generated files:
19# cp xc3028-v24.fw /lib/firmware
14# cp xc3028-v27.fw /lib/firmware 20# cp xc3028-v27.fw /lib/firmware
15 21
16#use strict; 22#use strict;
@@ -135,7 +141,7 @@ sub write_hunk_fix_endian($$)
135 } 141 }
136} 142}
137 143
138sub main_firmware($$$$) 144sub main_firmware_24($$$$)
139{ 145{
140 my $out; 146 my $out;
141 my $j=0; 147 my $j=0;
@@ -146,8 +152,774 @@ sub main_firmware($$$$)
146 152
147 for ($j = length($name); $j <32; $j++) { 153 for ($j = length($name); $j <32; $j++) {
148 $name = $name.chr(0); 154 $name = $name.chr(0);
155 }
156
157 open OUTFILE, ">$outfile";
158 syswrite(OUTFILE, $name);
159 write_le16($version);
160 write_le16($nr_desc);
161
162 #
163 # Firmware 0, type: BASE FW F8MHZ (0x00000003), id: (0000000000000000), size: 6635
164 #
165
166 write_le32(0x00000003); # Type
167 write_le64(0x00000000, 0x00000000); # ID
168 write_le32(6635); # Size
169 write_hunk_fix_endian(257752, 6635);
170
171 #
172 # Firmware 1, type: BASE FW F8MHZ MTS (0x00000007), id: (0000000000000000), size: 6635
173 #
174
175 write_le32(0x00000007); # Type
176 write_le64(0x00000000, 0x00000000); # ID
177 write_le32(6635); # Size
178 write_hunk_fix_endian(264392, 6635);
179
180 #
181 # Firmware 2, type: BASE FW FM (0x00000401), id: (0000000000000000), size: 6525
182 #
183
184 write_le32(0x00000401); # Type
185 write_le64(0x00000000, 0x00000000); # ID
186 write_le32(6525); # Size
187 write_hunk_fix_endian(271040, 6525);
188
189 #
190 # Firmware 3, type: BASE FW FM INPUT1 (0x00000c01), id: (0000000000000000), size: 6539
191 #
192
193 write_le32(0x00000c01); # Type
194 write_le64(0x00000000, 0x00000000); # ID
195 write_le32(6539); # Size
196 write_hunk_fix_endian(277568, 6539);
197
198 #
199 # Firmware 4, type: BASE FW (0x00000001), id: (0000000000000000), size: 6633
200 #
201
202 write_le32(0x00000001); # Type
203 write_le64(0x00000000, 0x00000000); # ID
204 write_le32(6633); # Size
205 write_hunk_fix_endian(284120, 6633);
206
207 #
208 # Firmware 5, type: BASE FW MTS (0x00000005), id: (0000000000000000), size: 6617
209 #
210
211 write_le32(0x00000005); # Type
212 write_le64(0x00000000, 0x00000000); # ID
213 write_le32(6617); # Size
214 write_hunk_fix_endian(290760, 6617);
215
216 #
217 # Firmware 6, type: STD FW (0x00000000), id: PAL/BG A2/A (0000000100000007), size: 161
218 #
219
220 write_le32(0x00000000); # Type
221 write_le64(0x00000001, 0x00000007); # ID
222 write_le32(161); # Size
223 write_hunk_fix_endian(297384, 161);
224
225 #
226 # Firmware 7, type: STD FW MTS (0x00000004), id: PAL/BG A2/A (0000000100000007), size: 169
227 #
228
229 write_le32(0x00000004); # Type
230 write_le64(0x00000001, 0x00000007); # ID
231 write_le32(169); # Size
232 write_hunk_fix_endian(297552, 169);
233
234 #
235 # Firmware 8, type: STD FW (0x00000000), id: PAL/BG A2/B (0000000200000007), size: 161
236 #
237
238 write_le32(0x00000000); # Type
239 write_le64(0x00000002, 0x00000007); # ID
240 write_le32(161); # Size
241 write_hunk_fix_endian(297728, 161);
242
243 #
244 # Firmware 9, type: STD FW MTS (0x00000004), id: PAL/BG A2/B (0000000200000007), size: 169
245 #
246
247 write_le32(0x00000004); # Type
248 write_le64(0x00000002, 0x00000007); # ID
249 write_le32(169); # Size
250 write_hunk_fix_endian(297896, 169);
251
252 #
253 # Firmware 10, type: STD FW (0x00000000), id: PAL/BG NICAM/A (0000000400000007), size: 161
254 #
255
256 write_le32(0x00000000); # Type
257 write_le64(0x00000004, 0x00000007); # ID
258 write_le32(161); # Size
259 write_hunk_fix_endian(298072, 161);
260
261 #
262 # Firmware 11, type: STD FW MTS (0x00000004), id: PAL/BG NICAM/A (0000000400000007), size: 169
263 #
264
265 write_le32(0x00000004); # Type
266 write_le64(0x00000004, 0x00000007); # ID
267 write_le32(169); # Size
268 write_hunk_fix_endian(298240, 169);
269
270 #
271 # Firmware 12, type: STD FW (0x00000000), id: PAL/BG NICAM/B (0000000800000007), size: 161
272 #
273
274 write_le32(0x00000000); # Type
275 write_le64(0x00000008, 0x00000007); # ID
276 write_le32(161); # Size
277 write_hunk_fix_endian(298416, 161);
278
279 #
280 # Firmware 13, type: STD FW MTS (0x00000004), id: PAL/BG NICAM/B (0000000800000007), size: 169
281 #
282
283 write_le32(0x00000004); # Type
284 write_le64(0x00000008, 0x00000007); # ID
285 write_le32(169); # Size
286 write_hunk_fix_endian(298584, 169);
287
288 #
289 # Firmware 14, type: STD FW (0x00000000), id: PAL/DK A2 (00000003000000e0), size: 161
290 #
291
292 write_le32(0x00000000); # Type
293 write_le64(0x00000003, 0x000000e0); # ID
294 write_le32(161); # Size
295 write_hunk_fix_endian(298760, 161);
296
297 #
298 # Firmware 15, type: STD FW MTS (0x00000004), id: PAL/DK A2 (00000003000000e0), size: 169
299 #
300
301 write_le32(0x00000004); # Type
302 write_le64(0x00000003, 0x000000e0); # ID
303 write_le32(169); # Size
304 write_hunk_fix_endian(298928, 169);
305
306 #
307 # Firmware 16, type: STD FW (0x00000000), id: PAL/DK NICAM (0000000c000000e0), size: 161
308 #
309
310 write_le32(0x00000000); # Type
311 write_le64(0x0000000c, 0x000000e0); # ID
312 write_le32(161); # Size
313 write_hunk_fix_endian(299104, 161);
314
315 #
316 # Firmware 17, type: STD FW MTS (0x00000004), id: PAL/DK NICAM (0000000c000000e0), size: 169
317 #
318
319 write_le32(0x00000004); # Type
320 write_le64(0x0000000c, 0x000000e0); # ID
321 write_le32(169); # Size
322 write_hunk_fix_endian(299272, 169);
323
324 #
325 # Firmware 18, type: STD FW (0x00000000), id: SECAM/K1 (0000000000200000), size: 161
326 #
327
328 write_le32(0x00000000); # Type
329 write_le64(0x00000000, 0x00200000); # ID
330 write_le32(161); # Size
331 write_hunk_fix_endian(299448, 161);
332
333 #
334 # Firmware 19, type: STD FW MTS (0x00000004), id: SECAM/K1 (0000000000200000), size: 169
335 #
336
337 write_le32(0x00000004); # Type
338 write_le64(0x00000000, 0x00200000); # ID
339 write_le32(169); # Size
340 write_hunk_fix_endian(299616, 169);
341
342 #
343 # Firmware 20, type: STD FW (0x00000000), id: SECAM/K3 (0000000004000000), size: 161
344 #
345
346 write_le32(0x00000000); # Type
347 write_le64(0x00000000, 0x04000000); # ID
348 write_le32(161); # Size
349 write_hunk_fix_endian(299792, 161);
350
351 #
352 # Firmware 21, type: STD FW MTS (0x00000004), id: SECAM/K3 (0000000004000000), size: 169
353 #
354
355 write_le32(0x00000004); # Type
356 write_le64(0x00000000, 0x04000000); # ID
357 write_le32(169); # Size
358 write_hunk_fix_endian(299960, 169);
359
360 #
361 # Firmware 22, type: STD FW D2633 DTV6 ATSC (0x00010030), id: (0000000000000000), size: 149
362 #
363
364 write_le32(0x00010030); # Type
365 write_le64(0x00000000, 0x00000000); # ID
366 write_le32(149); # Size
367 write_hunk_fix_endian(300136, 149);
368
369 #
370 # Firmware 23, type: STD FW D2620 DTV6 QAM (0x00000068), id: (0000000000000000), size: 149
371 #
372
373 write_le32(0x00000068); # Type
374 write_le64(0x00000000, 0x00000000); # ID
375 write_le32(149); # Size
376 write_hunk_fix_endian(300296, 149);
377
378 #
379 # Firmware 24, type: STD FW D2633 DTV6 QAM (0x00000070), id: (0000000000000000), size: 149
380 #
381
382 write_le32(0x00000070); # Type
383 write_le64(0x00000000, 0x00000000); # ID
384 write_le32(149); # Size
385 write_hunk_fix_endian(300448, 149);
386
387 #
388 # Firmware 25, type: STD FW D2620 DTV7 (0x00000088), id: (0000000000000000), size: 149
389 #
390
391 write_le32(0x00000088); # Type
392 write_le64(0x00000000, 0x00000000); # ID
393 write_le32(149); # Size
394 write_hunk_fix_endian(300608, 149);
395
396 #
397 # Firmware 26, type: STD FW D2633 DTV7 (0x00000090), id: (0000000000000000), size: 149
398 #
399
400 write_le32(0x00000090); # Type
401 write_le64(0x00000000, 0x00000000); # ID
402 write_le32(149); # Size
403 write_hunk_fix_endian(300760, 149);
404
405 #
406 # Firmware 27, type: STD FW D2620 DTV78 (0x00000108), id: (0000000000000000), size: 149
407 #
408
409 write_le32(0x00000108); # Type
410 write_le64(0x00000000, 0x00000000); # ID
411 write_le32(149); # Size
412 write_hunk_fix_endian(300920, 149);
413
414 #
415 # Firmware 28, type: STD FW D2633 DTV78 (0x00000110), id: (0000000000000000), size: 149
416 #
417
418 write_le32(0x00000110); # Type
419 write_le64(0x00000000, 0x00000000); # ID
420 write_le32(149); # Size
421 write_hunk_fix_endian(301072, 149);
422
423 #
424 # Firmware 29, type: STD FW D2620 DTV8 (0x00000208), id: (0000000000000000), size: 149
425 #
426
427 write_le32(0x00000208); # Type
428 write_le64(0x00000000, 0x00000000); # ID
429 write_le32(149); # Size
430 write_hunk_fix_endian(301232, 149);
431
432 #
433 # Firmware 30, type: STD FW D2633 DTV8 (0x00000210), id: (0000000000000000), size: 149
434 #
435
436 write_le32(0x00000210); # Type
437 write_le64(0x00000000, 0x00000000); # ID
438 write_le32(149); # Size
439 write_hunk_fix_endian(301384, 149);
440
441 #
442 # Firmware 31, type: STD FW FM (0x00000400), id: (0000000000000000), size: 135
443 #
444
445 write_le32(0x00000400); # Type
446 write_le64(0x00000000, 0x00000000); # ID
447 write_le32(135); # Size
448 write_hunk_fix_endian(301554, 135);
449
450 #
451 # Firmware 32, type: STD FW (0x00000000), id: PAL/I (0000000000000010), size: 161
452 #
453
454 write_le32(0x00000000); # Type
455 write_le64(0x00000000, 0x00000010); # ID
456 write_le32(161); # Size
457 write_hunk_fix_endian(301688, 161);
458
459 #
460 # Firmware 33, type: STD FW MTS (0x00000004), id: PAL/I (0000000000000010), size: 169
461 #
462
463 write_le32(0x00000004); # Type
464 write_le64(0x00000000, 0x00000010); # ID
465 write_le32(169); # Size
466 write_hunk_fix_endian(301856, 169);
467
468 #
469 # Firmware 34, type: STD FW (0x00000000), id: SECAM/L AM (0000001000400000), size: 169
470 #
471
472 #
473 # Firmware 35, type: STD FW (0x00000000), id: SECAM/L NICAM (0000000c00400000), size: 161
474 #
475
476 write_le32(0x00000000); # Type
477 write_le64(0x0000000c, 0x00400000); # ID
478 write_le32(161); # Size
479 write_hunk_fix_endian(302032, 161);
480
481 #
482 # Firmware 36, type: STD FW (0x00000000), id: SECAM/Lc (0000000000800000), size: 161
483 #
484
485 write_le32(0x00000000); # Type
486 write_le64(0x00000000, 0x00800000); # ID
487 write_le32(161); # Size
488 write_hunk_fix_endian(302200, 161);
489
490 #
491 # Firmware 37, type: STD FW (0x00000000), id: NTSC/M Kr (0000000000008000), size: 161
492 #
493
494 write_le32(0x00000000); # Type
495 write_le64(0x00000000, 0x00008000); # ID
496 write_le32(161); # Size
497 write_hunk_fix_endian(302368, 161);
498
499 #
500 # Firmware 38, type: STD FW LCD (0x00001000), id: NTSC/M Kr (0000000000008000), size: 161
501 #
502
503 write_le32(0x00001000); # Type
504 write_le64(0x00000000, 0x00008000); # ID
505 write_le32(161); # Size
506 write_hunk_fix_endian(302536, 161);
507
508 #
509 # Firmware 39, type: STD FW LCD NOGD (0x00003000), id: NTSC/M Kr (0000000000008000), size: 161
510 #
511
512 write_le32(0x00003000); # Type
513 write_le64(0x00000000, 0x00008000); # ID
514 write_le32(161); # Size
515 write_hunk_fix_endian(302704, 161);
516
517 #
518 # Firmware 40, type: STD FW MTS (0x00000004), id: NTSC/M Kr (0000000000008000), size: 169
519 #
520
521 write_le32(0x00000004); # Type
522 write_le64(0x00000000, 0x00008000); # ID
523 write_le32(169); # Size
524 write_hunk_fix_endian(302872, 169);
525
526 #
527 # Firmware 41, type: STD FW (0x00000000), id: NTSC PAL/M PAL/N (000000000000b700), size: 161
528 #
529
530 write_le32(0x00000000); # Type
531 write_le64(0x00000000, 0x0000b700); # ID
532 write_le32(161); # Size
533 write_hunk_fix_endian(303048, 161);
534
535 #
536 # Firmware 42, type: STD FW LCD (0x00001000), id: NTSC PAL/M PAL/N (000000000000b700), size: 161
537 #
538
539 write_le32(0x00001000); # Type
540 write_le64(0x00000000, 0x0000b700); # ID
541 write_le32(161); # Size
542 write_hunk_fix_endian(303216, 161);
543
544 #
545 # Firmware 43, type: STD FW LCD NOGD (0x00003000), id: NTSC PAL/M PAL/N (000000000000b700), size: 161
546 #
547
548 write_le32(0x00003000); # Type
549 write_le64(0x00000000, 0x0000b700); # ID
550 write_le32(161); # Size
551 write_hunk_fix_endian(303384, 161);
552
553 #
554 # Firmware 44, type: STD FW (0x00000000), id: NTSC/M Jp (0000000000002000), size: 161
555 #
556
557 write_le32(0x00000000); # Type
558 write_le64(0x00000000, 0x00002000); # ID
559 write_le32(161); # Size
560 write_hunk_fix_endian(303552, 161);
561
562 #
563 # Firmware 45, type: STD FW MTS (0x00000004), id: NTSC PAL/M PAL/N (000000000000b700), size: 169
564 #
565
566 write_le32(0x00000004); # Type
567 write_le64(0x00000000, 0x0000b700); # ID
568 write_le32(169); # Size
569 write_hunk_fix_endian(303720, 169);
570
571 #
572 # Firmware 46, type: STD FW MTS LCD (0x00001004), id: NTSC PAL/M PAL/N (000000000000b700), size: 169
573 #
574
575 write_le32(0x00001004); # Type
576 write_le64(0x00000000, 0x0000b700); # ID
577 write_le32(169); # Size
578 write_hunk_fix_endian(303896, 169);
579
580 #
581 # Firmware 47, type: STD FW MTS LCD NOGD (0x00003004), id: NTSC PAL/M PAL/N (000000000000b700), size: 169
582 #
583
584 write_le32(0x00003004); # Type
585 write_le64(0x00000000, 0x0000b700); # ID
586 write_le32(169); # Size
587 write_hunk_fix_endian(304072, 169);
588
589 #
590 # Firmware 48, type: SCODE FW HAS IF (0x60000000), IF = 3.28 MHz id: (0000000000000000), size: 192
591 #
592
593 write_le32(0x60000000); # Type
594 write_le64(0x00000000, 0x00000000); # ID
595 write_le16(3280); # IF
596 write_le32(192); # Size
597 write_hunk(309048, 192);
598
599 #
600 # Firmware 49, type: SCODE FW HAS IF (0x60000000), IF = 3.30 MHz id: (0000000000000000), size: 192
601 #
602
603# write_le32(0x60000000); # Type
604# write_le64(0x00000000, 0x00000000); # ID
605# write_le16(3300); # IF
606# write_le32(192); # Size
607# write_hunk(304440, 192);
608
609 #
610 # Firmware 50, type: SCODE FW HAS IF (0x60000000), IF = 3.44 MHz id: (0000000000000000), size: 192
611 #
612
613 write_le32(0x60000000); # Type
614 write_le64(0x00000000, 0x00000000); # ID
615 write_le16(3440); # IF
616 write_le32(192); # Size
617 write_hunk(309432, 192);
618
619 #
620 # Firmware 51, type: SCODE FW HAS IF (0x60000000), IF = 3.46 MHz id: (0000000000000000), size: 192
621 #
622
623 write_le32(0x60000000); # Type
624 write_le64(0x00000000, 0x00000000); # ID
625 write_le16(3460); # IF
626 write_le32(192); # Size
627 write_hunk(309624, 192);
628
629 #
630 # Firmware 52, type: SCODE FW DTV6 ATSC OREN36 HAS IF (0x60210020), IF = 3.80 MHz id: (0000000000000000), size: 192
631 #
632
633 write_le32(0x60210020); # Type
634 write_le64(0x00000000, 0x00000000); # ID
635 write_le16(3800); # IF
636 write_le32(192); # Size
637 write_hunk(306936, 192);
638
639 #
640 # Firmware 53, type: SCODE FW HAS IF (0x60000000), IF = 4.00 MHz id: (0000000000000000), size: 192
641 #
642
643 write_le32(0x60000000); # Type
644 write_le64(0x00000000, 0x00000000); # ID
645 write_le16(4000); # IF
646 write_le32(192); # Size
647 write_hunk(309240, 192);
648
649 #
650 # Firmware 54, type: SCODE FW DTV6 ATSC TOYOTA388 HAS IF (0x60410020), IF = 4.08 MHz id: (0000000000000000), size: 192
651 #
652
653 write_le32(0x60410020); # Type
654 write_le64(0x00000000, 0x00000000); # ID
655 write_le16(4080); # IF
656 write_le32(192); # Size
657 write_hunk(307128, 192);
658
659 #
660 # Firmware 55, type: SCODE FW HAS IF (0x60000000), IF = 4.20 MHz id: (0000000000000000), size: 192
661 #
662
663 write_le32(0x60000000); # Type
664 write_le64(0x00000000, 0x00000000); # ID
665 write_le16(4200); # IF
666 write_le32(192); # Size
667 write_hunk(308856, 192);
668
669 #
670 # Firmware 56, type: SCODE FW MONO HAS IF (0x60008000), IF = 4.32 MHz id: NTSC/M Kr (0000000000008000), size: 192
671 #
672
673 write_le32(0x60008000); # Type
674 write_le64(0x00000000, 0x00008000); # ID
675 write_le16(4320); # IF
676 write_le32(192); # Size
677 write_hunk(305208, 192);
678
679 #
680 # Firmware 57, type: SCODE FW HAS IF (0x60000000), IF = 4.45 MHz id: (0000000000000000), size: 192
681 #
682
683 write_le32(0x60000000); # Type
684 write_le64(0x00000000, 0x00000000); # ID
685 write_le16(4450); # IF
686 write_le32(192); # Size
687 write_hunk(309816, 192);
688
689 #
690 # Firmware 58, type: SCODE FW MTS LCD NOGD MONO IF HAS IF (0x6002b004), IF = 4.50 MHz id: NTSC PAL/M PAL/N (000000000000b700), size: 192
691 #
692
693 write_le32(0x6002b004); # Type
694 write_le64(0x00000000, 0x0000b700); # ID
695 write_le16(4500); # IF
696 write_le32(192); # Size
697 write_hunk(304824, 192);
698
699 #
700 # Firmware 59, type: SCODE FW LCD NOGD IF HAS IF (0x60023000), IF = 4.60 MHz id: NTSC/M Kr (0000000000008000), size: 192
701 #
702
703 write_le32(0x60023000); # Type
704 write_le64(0x00000000, 0x00008000); # ID
705 write_le16(4600); # IF
706 write_le32(192); # Size
707 write_hunk(305016, 192);
708
709 #
710 # Firmware 60, type: SCODE FW DTV6 QAM DTV7 DTV78 DTV8 ZARLINK456 HAS IF (0x620003e0), IF = 4.76 MHz id: (0000000000000000), size: 192
711 #
712
713 write_le32(0x620003e0); # Type
714 write_le64(0x00000000, 0x00000000); # ID
715 write_le16(4760); # IF
716 write_le32(192); # Size
717 write_hunk(304440, 192);
718
719 #
720 # Firmware 61, type: SCODE FW HAS IF (0x60000000), IF = 4.94 MHz id: (0000000000000000), size: 192
721 #
722
723 write_le32(0x60000000); # Type
724 write_le64(0x00000000, 0x00000000); # ID
725 write_le16(4940); # IF
726 write_le32(192); # Size
727 write_hunk(308664, 192);
728
729 #
730 # Firmware 62, type: SCODE FW HAS IF (0x60000000), IF = 5.26 MHz id: (0000000000000000), size: 192
731 #
732
733 write_le32(0x60000000); # Type
734 write_le64(0x00000000, 0x00000000); # ID
735 write_le16(5260); # IF
736 write_le32(192); # Size
737 write_hunk(307704, 192);
738
739 #
740 # Firmware 63, type: SCODE FW MONO HAS IF (0x60008000), IF = 5.32 MHz id: PAL/BG A2 NICAM (0000000f00000007), size: 192
741 #
742
743 write_le32(0x60008000); # Type
744 write_le64(0x0000000f, 0x00000007); # ID
745 write_le16(5320); # IF
746 write_le32(192); # Size
747 write_hunk(307896, 192);
748
749 #
750 # Firmware 64, type: SCODE FW DTV7 DTV78 DTV8 DIBCOM52 CHINA HAS IF (0x65000380), IF = 5.40 MHz id: (0000000000000000), size: 192
751 #
752
753 write_le32(0x65000380); # Type
754 write_le64(0x00000000, 0x00000000); # ID
755 write_le16(5400); # IF
756 write_le32(192); # Size
757 write_hunk(304248, 192);
758
759 #
760 # Firmware 65, type: SCODE FW DTV6 ATSC OREN538 HAS IF (0x60110020), IF = 5.58 MHz id: (0000000000000000), size: 192
761 #
762
763 write_le32(0x60110020); # Type
764 write_le64(0x00000000, 0x00000000); # ID
765 write_le16(5580); # IF
766 write_le32(192); # Size
767 write_hunk(306744, 192);
768
769 #
770 # Firmware 66, type: SCODE FW HAS IF (0x60000000), IF = 5.64 MHz id: PAL/BG A2 (0000000300000007), size: 192
771 #
772
773 write_le32(0x60000000); # Type
774 write_le64(0x00000003, 0x00000007); # ID
775 write_le16(5640); # IF
776 write_le32(192); # Size
777 write_hunk(305592, 192);
778
779 #
780 # Firmware 67, type: SCODE FW HAS IF (0x60000000), IF = 5.74 MHz id: PAL/BG NICAM (0000000c00000007), size: 192
781 #
782
783 write_le32(0x60000000); # Type
784 write_le64(0x0000000c, 0x00000007); # ID
785 write_le16(5740); # IF
786 write_le32(192); # Size
787 write_hunk(305784, 192);
788
789 #
790 # Firmware 68, type: SCODE FW HAS IF (0x60000000), IF = 5.90 MHz id: (0000000000000000), size: 192
791 #
792
793 write_le32(0x60000000); # Type
794 write_le64(0x00000000, 0x00000000); # ID
795 write_le16(5900); # IF
796 write_le32(192); # Size
797 write_hunk(307512, 192);
798
799 #
800 # Firmware 69, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.00 MHz id: PAL/DK PAL/I SECAM/K3 SECAM/L SECAM/Lc NICAM (0000000c04c000f0), size: 192
801 #
802
803 write_le32(0x60008000); # Type
804 write_le64(0x0000000c, 0x04c000f0); # ID
805 write_le16(6000); # IF
806 write_le32(192); # Size
807 write_hunk(305576, 192);
808
809 #
810 # Firmware 70, type: SCODE FW DTV6 QAM ATSC LG60 F6MHZ HAS IF (0x68050060), IF = 6.20 MHz id: (0000000000000000), size: 192
811 #
812
813 write_le32(0x68050060); # Type
814 write_le64(0x00000000, 0x00000000); # ID
815 write_le16(6200); # IF
816 write_le32(192); # Size
817 write_hunk(306552, 192);
818
819 #
820 # Firmware 71, type: SCODE FW HAS IF (0x60000000), IF = 6.24 MHz id: PAL/I (0000000000000010), size: 192
821 #
822
823 write_le32(0x60000000); # Type
824 write_le64(0x00000000, 0x00000010); # ID
825 write_le16(6240); # IF
826 write_le32(192); # Size
827 write_hunk(305400, 192);
828
829 #
830 # Firmware 72, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.32 MHz id: SECAM/K1 (0000000000200000), size: 192
831 #
832
833 write_le32(0x60008000); # Type
834 write_le64(0x00000000, 0x00200000); # ID
835 write_le16(6320); # IF
836 write_le32(192); # Size
837 write_hunk(308472, 192);
838
839 #
840 # Firmware 73, type: SCODE FW HAS IF (0x60000000), IF = 6.34 MHz id: SECAM/K1 (0000000000200000), size: 192
841 #
842
843 write_le32(0x60000000); # Type
844 write_le64(0x00000000, 0x00200000); # ID
845 write_le16(6340); # IF
846 write_le32(192); # Size
847 write_hunk(306360, 192);
848
849 #
850 # Firmware 74, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.50 MHz id: PAL/DK SECAM/K3 SECAM/L NICAM (0000000c044000e0), size: 192
851 #
852
853 write_le32(0x60008000); # Type
854 write_le64(0x0000000c, 0x044000e0); # ID
855 write_le16(6500); # IF
856 write_le32(192); # Size
857 write_hunk(308280, 192);
858
859 #
860 # Firmware 75, type: SCODE FW DTV6 ATSC ATI638 HAS IF (0x60090020), IF = 6.58 MHz id: (0000000000000000), size: 192
861 #
862
863 write_le32(0x60090020); # Type
864 write_le64(0x00000000, 0x00000000); # ID
865 write_le16(6580); # IF
866 write_le32(192); # Size
867 write_hunk(304632, 192);
868
869 #
870 # Firmware 76, type: SCODE FW HAS IF (0x60000000), IF = 6.60 MHz id: PAL/DK A2 (00000003000000e0), size: 192
871 #
872
873 write_le32(0x60000000); # Type
874 write_le64(0x00000003, 0x000000e0); # ID
875 write_le16(6600); # IF
876 write_le32(192); # Size
877 write_hunk(306168, 192);
878
879 #
880 # Firmware 77, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.68 MHz id: PAL/DK A2 (00000003000000e0), size: 192
881 #
882
883 write_le32(0x60008000); # Type
884 write_le64(0x00000003, 0x000000e0); # ID
885 write_le16(6680); # IF
886 write_le32(192); # Size
887 write_hunk(308088, 192);
888
889 #
890 # Firmware 78, type: SCODE FW DTV6 ATSC TOYOTA794 HAS IF (0x60810020), IF = 8.14 MHz id: (0000000000000000), size: 192
891 #
892
893 write_le32(0x60810020); # Type
894 write_le64(0x00000000, 0x00000000); # ID
895 write_le16(8140); # IF
896 write_le32(192); # Size
897 write_hunk(307320, 192);
898
899 #
900 # Firmware 79, type: SCODE FW HAS IF (0x60000000), IF = 8.20 MHz id: (0000000000000000), size: 192
901 #
902
903# write_le32(0x60000000); # Type
904# write_le64(0x00000000, 0x00000000); # ID
905# write_le16(8200); # IF
906# write_le32(192); # Size
907# write_hunk(308088, 192);
149} 908}
150 909
910sub main_firmware_27($$$$)
911{
912 my $out;
913 my $j=0;
914 my $outfile = shift;
915 my $name = shift;
916 my $version = shift;
917 my $nr_desc = shift;
918
919 for ($j = length($name); $j <32; $j++) {
920 $name = $name.chr(0);
921 }
922
151 open OUTFILE, ">$outfile"; 923 open OUTFILE, ">$outfile";
152 syswrite(OUTFILE, $name); 924 syswrite(OUTFILE, $name);
153 write_le16($version); 925 write_le16($version);
@@ -906,20 +1678,39 @@ sub main_firmware($$$$)
906 write_hunk(812856, 192); 1678 write_hunk(812856, 192);
907} 1679}
908 1680
1681
909sub extract_firmware { 1682sub extract_firmware {
910 my $sourcefile = "hcw85bda.sys"; 1683 my $sourcefile_24 = "UDXTTM6000.sys";
911 my $hash = "0e44dbf63bb0169d57446aec21881ff2"; 1684 my $hash_24 = "cb9deb5508a5e150af2880f5b0066d78";
912 my $outfile = "xc3028-v27.fw"; 1685 my $outfile_24 = "xc3028-v24.fw";
913 my $name = "xc2028 firmware"; 1686 my $name_24 = "xc2028 firmware";
914 my $version = 519; 1687 my $version_24 = 516;
915 my $nr_desc = 80; 1688 my $nr_desc_24 = 77;
1689 my $out;
1690
1691 my $sourcefile_27 = "hcw85bda.sys";
1692 my $hash_27 = "0e44dbf63bb0169d57446aec21881ff2";
1693 my $outfile_27 = "xc3028-v27.fw";
1694 my $name_27 = "xc2028 firmware";
1695 my $version_27 = 519;
1696 my $nr_desc_27 = 80;
916 my $out; 1697 my $out;
917 1698
918 verify($sourcefile, $hash); 1699 if (-e $sourcefile_24) {
1700 verify($sourcefile_24, $hash_24);
1701
1702 open INFILE, "<$sourcefile_24";
1703 main_firmware_24($outfile_24, $name_24, $version_24, $nr_desc_24);
1704 close INFILE;
1705 }
919 1706
920 open INFILE, "<$sourcefile"; 1707 if (-e $sourcefile_27) {
921 main_firmware($outfile, $name, $version, $nr_desc); 1708 verify($sourcefile_27, $hash_27);
922 close INFILE; 1709
1710 open INFILE, "<$sourcefile_27";
1711 main_firmware_27($outfile_27, $name_27, $version_27, $nr_desc_27);
1712 close INFILE;
1713 }
923} 1714}
924 1715
925extract_firmware; 1716extract_firmware;
diff --git a/Documentation/video4linux/gspca.txt b/Documentation/video4linux/gspca.txt
index 181b9e6fd984..8f3f5d33327c 100644
--- a/Documentation/video4linux/gspca.txt
+++ b/Documentation/video4linux/gspca.txt
@@ -50,6 +50,8 @@ zc3xx 0458:700f Genius VideoCam Web V2
50sonixj 0458:7025 Genius Eye 311Q 50sonixj 0458:7025 Genius Eye 311Q
51sn9c20x 0458:7029 Genius Look 320s 51sn9c20x 0458:7029 Genius Look 320s
52sonixj 0458:702e Genius Slim 310 NB 52sonixj 0458:702e Genius Slim 310 NB
53sn9c20x 0458:704a Genius Slim 1320
54sn9c20x 0458:704c Genius i-Look 1321
53sn9c20x 045e:00f4 LifeCam VX-6000 (SN9C20x + OV9650) 55sn9c20x 045e:00f4 LifeCam VX-6000 (SN9C20x + OV9650)
54sonixj 045e:00f5 MicroSoft VX3000 56sonixj 045e:00f5 MicroSoft VX3000
55sonixj 045e:00f7 MicroSoft VX1000 57sonixj 045e:00f7 MicroSoft VX1000
@@ -305,12 +307,14 @@ sonixj 0c45:6138 Sn9c120 Mo4000
305sonixj 0c45:613a Microdia Sonix PC Camera 307sonixj 0c45:613a Microdia Sonix PC Camera
306sonixj 0c45:613b Surfer SN-206 308sonixj 0c45:613b Surfer SN-206
307sonixj 0c45:613c Sonix Pccam168 309sonixj 0c45:613c Sonix Pccam168
310sonixj 0c45:6142 Hama PC-Webcam AC-150
308sonixj 0c45:6143 Sonix Pccam168 311sonixj 0c45:6143 Sonix Pccam168
309sonixj 0c45:6148 Digitus DA-70811/ZSMC USB PC Camera ZS211/Microdia 312sonixj 0c45:6148 Digitus DA-70811/ZSMC USB PC Camera ZS211/Microdia
310sonixj 0c45:614a Frontech E-Ccam (JIL-2225) 313sonixj 0c45:614a Frontech E-Ccam (JIL-2225)
311sn9c20x 0c45:6240 PC Camera (SN9C201 + MT9M001) 314sn9c20x 0c45:6240 PC Camera (SN9C201 + MT9M001)
312sn9c20x 0c45:6242 PC Camera (SN9C201 + MT9M111) 315sn9c20x 0c45:6242 PC Camera (SN9C201 + MT9M111)
313sn9c20x 0c45:6248 PC Camera (SN9C201 + OV9655) 316sn9c20x 0c45:6248 PC Camera (SN9C201 + OV9655)
317sn9c20x 0c45:624c PC Camera (SN9C201 + MT9M112)
314sn9c20x 0c45:624e PC Camera (SN9C201 + SOI968) 318sn9c20x 0c45:624e PC Camera (SN9C201 + SOI968)
315sn9c20x 0c45:624f PC Camera (SN9C201 + OV9650) 319sn9c20x 0c45:624f PC Camera (SN9C201 + OV9650)
316sn9c20x 0c45:6251 PC Camera (SN9C201 + OV9650) 320sn9c20x 0c45:6251 PC Camera (SN9C201 + OV9650)
@@ -323,6 +327,7 @@ sn9c20x 0c45:627f PC Camera (SN9C201 + OV9650)
323sn9c20x 0c45:6280 PC Camera (SN9C202 + MT9M001) 327sn9c20x 0c45:6280 PC Camera (SN9C202 + MT9M001)
324sn9c20x 0c45:6282 PC Camera (SN9C202 + MT9M111) 328sn9c20x 0c45:6282 PC Camera (SN9C202 + MT9M111)
325sn9c20x 0c45:6288 PC Camera (SN9C202 + OV9655) 329sn9c20x 0c45:6288 PC Camera (SN9C202 + OV9655)
330sn9c20x 0c45:628c PC Camera (SN9C201 + MT9M112)
326sn9c20x 0c45:628e PC Camera (SN9C202 + SOI968) 331sn9c20x 0c45:628e PC Camera (SN9C202 + SOI968)
327sn9c20x 0c45:628f PC Camera (SN9C202 + OV9650) 332sn9c20x 0c45:628f PC Camera (SN9C202 + OV9650)
328sn9c20x 0c45:62a0 PC Camera (SN9C202 + OV7670) 333sn9c20x 0c45:62a0 PC Camera (SN9C202 + OV7670)
diff --git a/Documentation/video4linux/sh_mobile_ceu_camera.txt b/Documentation/video4linux/sh_mobile_ceu_camera.txt
index 2ae16349a78d..cb47e723af74 100644
--- a/Documentation/video4linux/sh_mobile_ceu_camera.txt
+++ b/Documentation/video4linux/sh_mobile_ceu_camera.txt
@@ -17,18 +17,18 @@ Generic scaling / cropping scheme
17-2-- -\ 17-2-- -\
18| --\ 18| --\
19| --\ 19| --\
20+-5-- -\ -- -3-- 20+-5-- . -- -3-- -\
21| ---\ 21| `... -\
22| --- -4-- -\ 22| `... -4-- . - -7..
23| -\ 23| `.
24| - -6-- 24| `. .6--
25| 25|
26| - -6'- 26| . .6'-
27| -/ 27| .´
28| --- -4'- -/ 28| ... -4'- .´
29| ---/ 29| ...´ - -7'.
30+-5'- -/ 30+-5'- .´ -/
31| -- -3'- 31| -- -3'- -/
32| --/ 32| --/
33| --/ 33| --/
34-2'- -/ 34-2'- -/
@@ -36,7 +36,11 @@ Generic scaling / cropping scheme
36| 36|
37-1'- 37-1'-
38 38
39Produced by user requests: 39In the above chart minuses and slashes represent "real" data amounts, points and
40accents represent "useful" data, basically, CEU scaled amd cropped output,
41mapped back onto the client's source plane.
42
43Such a configuration can be produced by user requests:
40 44
41S_CROP(left / top = (5) - (1), width / height = (5') - (5)) 45S_CROP(left / top = (5) - (1), width / height = (5') - (5))
42S_FMT(width / height = (6') - (6)) 46S_FMT(width / height = (6') - (6))
@@ -106,52 +110,30 @@ window:
106S_CROP 110S_CROP
107------ 111------
108 112
109If old scale applied to new crop is invalid produce nearest new scale possible 113The API at http://v4l2spec.bytesex.org/spec/x1904.htm says:
110
1111. Calculate current combined scales.
112
113 scale_comb = (((4') - (4)) / ((6') - (6))) * (((2') - (2)) / ((3') - (3)))
114
1152. Apply iterative sensor S_CROP for new input window.
116
1173. If old combined scales applied to new crop produce an impossible user window,
118adjust scales to produce nearest possible window.
119
120 width_u_out = ((5') - (5)) / scale_comb
121 114
122 if (width_u_out > max) 115"...specification does not define an origin or units. However by convention
123 scale_comb = ((5') - (5)) / max; 116drivers should horizontally count unscaled samples relative to 0H."
124 else if (width_u_out < min)
125 scale_comb = ((5') - (5)) / min;
126 117
1274. Issue G_CROP to retrieve actual input window. 118We choose to follow the advise and interpret cropping units as client input
119pixels.
128 120
1295. Using actual input window and calculated combined scales calculate sensor 121Cropping is performed in the following 6 steps:
130target output window.
131
132 width_s_out = ((3') - (3)) = ((2') - (2)) / scale_comb
133
1346. Apply iterative S_FMT for new sensor target output window.
135
1367. Issue G_FMT to retrieve the actual sensor output window.
137
1388. Calculate sensor scales.
139
140 scale_s = ((3') - (3)) / ((2') - (2))
141 122
1429. Calculate sensor output subwindow to be cropped on CEU by applying sensor 1231. Request exactly user rectangle from the sensor.
143scales to the requested window.
144 124
145 width_ceu = ((5') - (5)) / scale_s 1252. If smaller - iterate until a larger one is obtained. Result: sensor cropped
126 to 2 : 2', target crop 5 : 5', current output format 6' - 6.
146 127
14710. Use CEU cropping for above calculated window. 1283. In the previous step the sensor has tried to preserve its output frame as
129 good as possible, but it could have changed. Retrieve it again.
148 130
14911. Calculate CEU scales from sensor scales from results of (10) and user window 1314. Sensor scaled to 3 : 3'. Sensor's scale is (2' - 2) / (3' - 3). Calculate
150from (3) 132 intermediate window: 4' - 4 = (5' - 5) * (3' - 3) / (2' - 2)
151 133
152 scale_ceu = calc_scale(((5') - (5)), &width_u_out) 1345. Calculate and apply host scale = (6' - 6) / (4' - 4)
153 135
15412. Apply CEU scales. 1366. Calculate and apply host crop: 6 - 7 = (5 - 2) * (6' - 6) / (5' - 5)
155 137
156-- 138--
157Author: Guennadi Liakhovetski <g.liakhovetski@gmx.de> 139Author: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
diff --git a/Documentation/video4linux/v4l2-framework.txt b/Documentation/video4linux/v4l2-framework.txt
index 5155700c206b..e831aaca66f8 100644
--- a/Documentation/video4linux/v4l2-framework.txt
+++ b/Documentation/video4linux/v4l2-framework.txt
@@ -545,12 +545,11 @@ unregister them:
545This will remove the device nodes from sysfs (causing udev to remove them 545This will remove the device nodes from sysfs (causing udev to remove them
546from /dev). 546from /dev).
547 547
548After video_unregister_device() returns no new opens can be done. 548After video_unregister_device() returns no new opens can be done. However,
549 549in the case of USB devices some application might still have one of these
550However, in the case of USB devices some application might still have one 550device nodes open. So after the unregister all file operations will return
551of these device nodes open. You should block all new accesses to read, 551an error as well, except for the ioctl and unlocked_ioctl file operations:
552write, poll, etc. except possibly for certain ioctl operations like 552those will still be passed on since some buffer ioctls may still be needed.
553queueing buffers.
554 553
555When the last user of the video device node exits, then the vdev->release() 554When the last user of the video device node exits, then the vdev->release()
556callback is called and you can do the final cleanup there. 555callback is called and you can do the final cleanup there.
@@ -609,3 +608,135 @@ scatter/gather method (videobuf-dma-sg), DMA with linear access
609 608
610Please see Documentation/video4linux/videobuf for more information on how 609Please see Documentation/video4linux/videobuf for more information on how
611to use the videobuf layer. 610to use the videobuf layer.
611
612struct v4l2_fh
613--------------
614
615struct v4l2_fh provides a way to easily keep file handle specific data
616that is used by the V4L2 framework. Using v4l2_fh is optional for
617drivers.
618
619The users of v4l2_fh (in the V4L2 framework, not the driver) know
620whether a driver uses v4l2_fh as its file->private_data pointer by
621testing the V4L2_FL_USES_V4L2_FH bit in video_device->flags.
622
623Useful functions:
624
625- v4l2_fh_init()
626
627 Initialise the file handle. This *MUST* be performed in the driver's
628 v4l2_file_operations->open() handler.
629
630- v4l2_fh_add()
631
632 Add a v4l2_fh to video_device file handle list. May be called after
633 initialising the file handle.
634
635- v4l2_fh_del()
636
637 Unassociate the file handle from video_device(). The file handle
638 exit function may now be called.
639
640- v4l2_fh_exit()
641
642 Uninitialise the file handle. After uninitialisation the v4l2_fh
643 memory can be freed.
644
645struct v4l2_fh is allocated as a part of the driver's own file handle
646structure and is set to file->private_data in the driver's open
647function by the driver. Drivers can extract their own file handle
648structure by using the container_of macro. Example:
649
650struct my_fh {
651 int blah;
652 struct v4l2_fh fh;
653};
654
655...
656
657int my_open(struct file *file)
658{
659 struct my_fh *my_fh;
660 struct video_device *vfd;
661 int ret;
662
663 ...
664
665 ret = v4l2_fh_init(&my_fh->fh, vfd);
666 if (ret)
667 return ret;
668
669 v4l2_fh_add(&my_fh->fh);
670
671 file->private_data = &my_fh->fh;
672
673 ...
674}
675
676int my_release(struct file *file)
677{
678 struct v4l2_fh *fh = file->private_data;
679 struct my_fh *my_fh = container_of(fh, struct my_fh, fh);
680
681 ...
682}
683
684V4L2 events
685-----------
686
687The V4L2 events provide a generic way to pass events to user space.
688The driver must use v4l2_fh to be able to support V4L2 events.
689
690Useful functions:
691
692- v4l2_event_alloc()
693
694 To use events, the driver must allocate events for the file handle. By
695 calling the function more than once, the driver may assure that at least n
696 events in total have been allocated. The function may not be called in
697 atomic context.
698
699- v4l2_event_queue()
700
701 Queue events to video device. The driver's only responsibility is to fill
702 in the type and the data fields. The other fields will be filled in by
703 V4L2.
704
705- v4l2_event_subscribe()
706
707 The video_device->ioctl_ops->vidioc_subscribe_event must check the driver
708 is able to produce events with specified event id. Then it calls
709 v4l2_event_subscribe() to subscribe the event.
710
711- v4l2_event_unsubscribe()
712
713 vidioc_unsubscribe_event in struct v4l2_ioctl_ops. A driver may use
714 v4l2_event_unsubscribe() directly unless it wants to be involved in
715 unsubscription process.
716
717 The special type V4L2_EVENT_ALL may be used to unsubscribe all events. The
718 drivers may want to handle this in a special way.
719
720- v4l2_event_pending()
721
722 Returns the number of pending events. Useful when implementing poll.
723
724Drivers do not initialise events directly. The events are initialised
725through v4l2_fh_init() if video_device->ioctl_ops->vidioc_subscribe_event is
726non-NULL. This *MUST* be performed in the driver's
727v4l2_file_operations->open() handler.
728
729Events are delivered to user space through the poll system call. The driver
730can use v4l2_fh->events->wait wait_queue_head_t as the argument for
731poll_wait().
732
733There are standard and private events. New standard events must use the
734smallest available event type. The drivers must allocate their events from
735their own class starting from class base. Class base is
736V4L2_EVENT_PRIVATE_START + n * 1000 where n is the lowest available number.
737The first event type in the class is reserved for future use, so the first
738available event type is 'class base + 1'.
739
740An example on how the V4L2 events may be used can be found in the OMAP
7413 ISP driver available at <URL:http://gitorious.org/omap3camera> as of
742writing this.
diff --git a/Documentation/vm/map_hugetlb.c b/Documentation/vm/map_hugetlb.c
index 9969c7d9f985..eda1a6d3578a 100644
--- a/Documentation/vm/map_hugetlb.c
+++ b/Documentation/vm/map_hugetlb.c
@@ -19,7 +19,7 @@
19#define PROTECTION (PROT_READ | PROT_WRITE) 19#define PROTECTION (PROT_READ | PROT_WRITE)
20 20
21#ifndef MAP_HUGETLB 21#ifndef MAP_HUGETLB
22#define MAP_HUGETLB 0x40 22#define MAP_HUGETLB 0x40000 /* arch specific */
23#endif 23#endif
24 24
25/* Only ia64 requires this */ 25/* Only ia64 requires this */
diff --git a/Documentation/vm/numa b/Documentation/vm/numa
index e93ad9425e2a..a200a386429d 100644
--- a/Documentation/vm/numa
+++ b/Documentation/vm/numa
@@ -1,41 +1,149 @@
1Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com> 1Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com>
2 2
3The intent of this file is to have an uptodate, running commentary 3What is NUMA?
4from different people about NUMA specific code in the Linux vm. 4
5 5This question can be answered from a couple of perspectives: the
6What is NUMA? It is an architecture where the memory access times 6hardware view and the Linux software view.
7for different regions of memory from a given processor varies 7
8according to the "distance" of the memory region from the processor. 8From the hardware perspective, a NUMA system is a computer platform that
9Each region of memory to which access times are the same from any 9comprises multiple components or assemblies each of which may contain 0
10cpu, is called a node. On such architectures, it is beneficial if 10or more CPUs, local memory, and/or IO buses. For brevity and to
11the kernel tries to minimize inter node communications. Schemes 11disambiguate the hardware view of these physical components/assemblies
12for this range from kernel text and read-only data replication 12from the software abstraction thereof, we'll call the components/assemblies
13across nodes, and trying to house all the data structures that 13'cells' in this document.
14key components of the kernel need on memory on that node. 14
15 15Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset
16Currently, all the numa support is to provide efficient handling 16of the system--although some components necessary for a stand-alone SMP system
17of widely discontiguous physical memory, so architectures which 17may not be populated on any given cell. The cells of the NUMA system are
18are not NUMA but can have huge holes in the physical address space 18connected together with some sort of system interconnect--e.g., a crossbar or
19can use the same code. All this code is bracketed by CONFIG_DISCONTIGMEM. 19point-to-point link are common types of NUMA system interconnects. Both of
20 20these types of interconnects can be aggregated to create NUMA platforms with
21The initial port includes NUMAizing the bootmem allocator code by 21cells at multiple distances from other cells.
22encapsulating all the pieces of information into a bootmem_data_t 22
23structure. Node specific calls have been added to the allocator. 23For Linux, the NUMA platforms of interest are primarily what is known as Cache
24In theory, any platform which uses the bootmem allocator should 24Coherent NUMA or ccNUMA systems. With ccNUMA systems, all memory is visible
25be able to put the bootmem and mem_map data structures anywhere 25to and accessible from any CPU attached to any cell and cache coherency
26it deems best. 26is handled in hardware by the processor caches and/or the system interconnect.
27 27
28Each node's page allocation data structures have also been encapsulated 28Memory access time and effective memory bandwidth varies depending on how far
29into a pg_data_t. The bootmem_data_t is just one part of this. To 29away the cell containing the CPU or IO bus making the memory access is from the
30make the code look uniform between NUMA and regular UMA platforms, 30cell containing the target memory. For example, access to memory by CPUs
31UMA platforms have a statically allocated pg_data_t too (contig_page_data). 31attached to the same cell will experience faster access times and higher
32For the sake of uniformity, the function num_online_nodes() is also defined 32bandwidths than accesses to memory on other, remote cells. NUMA platforms
33for all platforms. As we run benchmarks, we might decide to NUMAize 33can have cells at multiple remote distances from any given cell.
34more variables like low_on_memory, nr_free_pages etc into the pg_data_t. 34
35 35Platform vendors don't build NUMA systems just to make software developers'
36The NUMA aware page allocation code currently tries to allocate pages 36lives interesting. Rather, this architecture is a means to provide scalable
37from different nodes in a round robin manner. This will be changed to 37memory bandwidth. However, to achieve scalable memory bandwidth, system and
38do concentratic circle search, starting from current node, once the 38application software must arrange for a large majority of the memory references
39NUMA port achieves more maturity. The call alloc_pages_node has been 39[cache misses] to be to "local" memory--memory on the same cell, if any--or
40added, so that drivers can make the call and not worry about whether 40to the closest cell with memory.
41it is running on a NUMA or UMA platform. 41
42This leads to the Linux software view of a NUMA system:
43
44Linux divides the system's hardware resources into multiple software
45abstractions called "nodes". Linux maps the nodes onto the physical cells
46of the hardware platform, abstracting away some of the details for some
47architectures. As with physical cells, software nodes may contain 0 or more
48CPUs, memory and/or IO buses. And, again, memory accesses to memory on
49"closer" nodes--nodes that map to closer cells--will generally experience
50faster access times and higher effective bandwidth than accesses to more
51remote cells.
52
53For some architectures, such as x86, Linux will "hide" any node representing a
54physical cell that has no memory attached, and reassign any CPUs attached to
55that cell to a node representing a cell that does have memory. Thus, on
56these architectures, one cannot assume that all CPUs that Linux associates with
57a given node will see the same local memory access times and bandwidth.
58
59In addition, for some architectures, again x86 is an example, Linux supports
60the emulation of additional nodes. For NUMA emulation, linux will carve up
61the existing nodes--or the system memory for non-NUMA platforms--into multiple
62nodes. Each emulated node will manage a fraction of the underlying cells'
63physical memory. NUMA emluation is useful for testing NUMA kernel and
64application features on non-NUMA platforms, and as a sort of memory resource
65management mechanism when used together with cpusets.
66[see Documentation/cgroups/cpusets.txt]
67
68For each node with memory, Linux constructs an independent memory management
69subsystem, complete with its own free page lists, in-use page lists, usage
70statistics and locks to mediate access. In addition, Linux constructs for
71each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE],
72an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a
73selected zone/node cannot satisfy the allocation request. This situation,
74when a zone has no available memory to satisfy a request, is called
75"overflow" or "fallback".
76
77Because some nodes contain multiple zones containing different types of
78memory, Linux must decide whether to order the zonelists such that allocations
79fall back to the same zone type on a different node, or to a different zone
80type on the same node. This is an important consideration because some zones,
81such as DMA or DMA32, represent relatively scarce resources. Linux chooses
82a default zonelist order based on the sizes of the various zone types relative
83to the total memory of the node and the total memory of the system. The
84default zonelist order may be overridden using the numa_zonelist_order kernel
85boot parameter or sysctl. [see Documentation/kernel-parameters.txt and
86Documentation/sysctl/vm.txt]
87
88By default, Linux will attempt to satisfy memory allocation requests from the
89node to which the CPU that executes the request is assigned. Specifically,
90Linux will attempt to allocate from the first node in the appropriate zonelist
91for the node where the request originates. This is called "local allocation."
92If the "local" node cannot satisfy the request, the kernel will examine other
93nodes' zones in the selected zonelist looking for the first zone in the list
94that can satisfy the request.
95
96Local allocation will tend to keep subsequent access to the allocated memory
97"local" to the underlying physical resources and off the system interconnect--
98as long as the task on whose behalf the kernel allocated some memory does not
99later migrate away from that memory. The Linux scheduler is aware of the
100NUMA topology of the platform--embodied in the "scheduling domains" data
101structures [see Documentation/scheduler/sched-domains.txt]--and the scheduler
102attempts to minimize task migration to distant scheduling domains. However,
103the scheduler does not take a task's NUMA footprint into account directly.
104Thus, under sufficient imbalance, tasks can migrate between nodes, remote
105from their initial node and kernel data structures.
106
107System administrators and application designers can restrict a task's migration
108to improve NUMA locality using various CPU affinity command line interfaces,
109such as taskset(1) and numactl(1), and program interfaces such as
110sched_setaffinity(2). Further, one can modify the kernel's default local
111allocation behavior using Linux NUMA memory policy.
112[see Documentation/vm/numa_memory_policy.]
113
114System administrators can restrict the CPUs and nodes' memories that a non-
115privileged user can specify in the scheduling or NUMA commands and functions
116using control groups and CPUsets. [see Documentation/cgroups/CPUsets.txt]
117
118On architectures that do not hide memoryless nodes, Linux will include only
119zones [nodes] with memory in the zonelists. This means that for a memoryless
120node the "local memory node"--the node of the first zone in CPU's node's
121zonelist--will not be the node itself. Rather, it will be the node that the
122kernel selected as the nearest node with memory when it built the zonelists.
123So, default, local allocations will succeed with the kernel supplying the
124closest available memory. This is a consequence of the same mechanism that
125allows such allocations to fallback to other nearby nodes when a node that
126does contain memory overflows.
127
128Some kernel allocations do not want or cannot tolerate this allocation fallback
129behavior. Rather they want to be sure they get memory from the specified node
130or get notified that the node has no free memory. This is usually the case when
131a subsystem allocates per CPU memory resources, for example.
132
133A typical model for making such an allocation is to obtain the node id of the
134node to which the "current CPU" is attached using one of the kernel's
135numa_node_id() or CPU_to_node() functions and then request memory from only
136the node id returned. When such an allocation fails, the requesting subsystem
137may revert to its own fallback path. The slab kernel memory allocator is an
138example of this. Or, the subsystem may choose to disable or not to enable
139itself on allocation failure. The kernel profiling subsystem is an example of
140this.
141
142If the architecture supports--does not hide--memoryless nodes, then CPUs
143attached to memoryless nodes would always incur the fallback path overhead
144or some subsystems would fail to initialize if they attempted to allocated
145memory exclusively from a node without memory. To support such
146architectures transparently, kernel subsystems can use the numa_mem_id()
147or cpu_to_mem() function to locate the "local memory node" for the calling or
148specified CPU. Again, this is the same node from which default, local page
149allocations will be attempted.
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
index be45dbb9d7f2..6690fc34ef6d 100644
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -45,7 +45,7 @@ most general to most specific:
45 to establish the task policy for a child task exec()'d from an 45 to establish the task policy for a child task exec()'d from an
46 executable image that has no awareness of memory policy. See the 46 executable image that has no awareness of memory policy. See the
47 MEMORY POLICY APIS section, below, for an overview of the system call 47 MEMORY POLICY APIS section, below, for an overview of the system call
48 that a task may use to set/change it's task/process policy. 48 that a task may use to set/change its task/process policy.
49 49
50 In a multi-threaded task, task policies apply only to the thread 50 In a multi-threaded task, task policies apply only to the thread
51 [Linux kernel task] that installs the policy and any threads 51 [Linux kernel task] that installs the policy and any threads
@@ -301,7 +301,7 @@ decrement this reference count, respectively. mpol_put() will only free
301the structure back to the mempolicy kmem cache when the reference count 301the structure back to the mempolicy kmem cache when the reference count
302goes to zero. 302goes to zero.
303 303
304When a new memory policy is allocated, it's reference count is initialized 304When a new memory policy is allocated, its reference count is initialized
305to '1', representing the reference held by the task that is installing the 305to '1', representing the reference held by the task that is installing the
306new policy. When a pointer to a memory policy structure is stored in another 306new policy. When a pointer to a memory policy structure is stored in another
307structure, another reference is added, as the task's reference will be dropped 307structure, another reference is added, as the task's reference will be dropped
diff --git a/Documentation/w1/w1.generic b/Documentation/w1/w1.generic
index e3333eec4320..212f4ac31c01 100644
--- a/Documentation/w1/w1.generic
+++ b/Documentation/w1/w1.generic
@@ -25,7 +25,7 @@ When a w1 master driver registers with the w1 subsystem, the following occurs:
25 - sysfs entries for that w1 master are created 25 - sysfs entries for that w1 master are created
26 - the w1 bus is periodically searched for new slave devices 26 - the w1 bus is periodically searched for new slave devices
27 27
28When a device is found on the bus, w1 core checks if driver for it's family is 28When a device is found on the bus, w1 core checks if driver for its family is
29loaded. If so, the family driver is attached to the slave. 29loaded. If so, the family driver is attached to the slave.
30If there is no driver for the family, default one is assigned, which allows to perform 30If there is no driver for the family, default one is assigned, which allows to perform
31almost any kind of operations. Each logical operation is a transaction 31almost any kind of operations. Each logical operation is a transaction
diff --git a/Documentation/watchdog/00-INDEX b/Documentation/watchdog/00-INDEX
index c3ea47e507fe..ee994513a9b1 100644
--- a/Documentation/watchdog/00-INDEX
+++ b/Documentation/watchdog/00-INDEX
@@ -1,10 +1,15 @@
100-INDEX 100-INDEX
2 - this file. 2 - this file.
3hpwdt.txt
4 - information on the HP iLO2 NMI watchdog
3pcwd-watchdog.txt 5pcwd-watchdog.txt
4 - documentation for Berkshire Products PC Watchdog ISA cards. 6 - documentation for Berkshire Products PC Watchdog ISA cards.
5src/ 7src/
6 - directory holding watchdog related example programs. 8 - directory holding watchdog related example programs.
7watchdog-api.txt 9watchdog-api.txt
8 - description of the Linux Watchdog driver API. 10 - description of the Linux Watchdog driver API.
11watchdog-parameters.txt
12 - information on driver parameters (for drivers other than
13 the ones that have driver-specific files here)
9wdt.txt 14wdt.txt
10 - description of the Watchdog Timer Interfaces for Linux. 15 - description of the Watchdog Timer Interfaces for Linux.
diff --git a/Documentation/watchdog/watchdog-parameters.txt b/Documentation/watchdog/watchdog-parameters.txt
new file mode 100644
index 000000000000..41c95cc1dc1f
--- /dev/null
+++ b/Documentation/watchdog/watchdog-parameters.txt
@@ -0,0 +1,390 @@
1This file provides information on the module parameters of many of
2the Linux watchdog drivers. Watchdog driver parameter specs should
3be listed here unless the driver has its own driver-specific information
4file.
5
6
7See Documentation/kernel-parameters.txt for information on
8providing kernel parameters for builtin drivers versus loadable
9modules.
10
11
12-------------------------------------------------
13acquirewdt:
14wdt_stop: Acquire WDT 'stop' io port (default 0x43)
15wdt_start: Acquire WDT 'start' io port (default 0x443)
16nowayout: Watchdog cannot be stopped once started
17 (default=kernel config parameter)
18-------------------------------------------------
19advantechwdt:
20wdt_stop: Advantech WDT 'stop' io port (default 0x443)
21wdt_start: Advantech WDT 'start' io port (default 0x443)
22timeout: Watchdog timeout in seconds. 1<= timeout <=63, default=60.
23nowayout: Watchdog cannot be stopped once started
24 (default=kernel config parameter)
25-------------------------------------------------
26alim1535_wdt:
27timeout: Watchdog timeout in seconds. (0 < timeout < 18000, default=60
28nowayout: Watchdog cannot be stopped once started
29 (default=kernel config parameter)
30-------------------------------------------------
31alim7101_wdt:
32timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30
33use_gpio: Use the gpio watchdog (required by old cobalt boards).
34 default=0/off/no
35nowayout: Watchdog cannot be stopped once started
36 (default=kernel config parameter)
37-------------------------------------------------
38ar7_wdt:
39margin: Watchdog margin in seconds (default=60)
40nowayout: Disable watchdog shutdown on close
41 (default=kernel config parameter)
42-------------------------------------------------
43at32ap700x_wdt:
44timeout: Timeout value. Limited to be 1 or 2 seconds. (default=2)
45nowayout: Watchdog cannot be stopped once started
46 (default=kernel config parameter)
47-------------------------------------------------
48at91rm9200_wdt:
49wdt_time: Watchdog time in seconds. (default=5)
50nowayout: Watchdog cannot be stopped once started
51 (default=kernel config parameter)
52-------------------------------------------------
53at91sam9_wdt:
54heartbeat: Watchdog heartbeats in seconds. (default = 15)
55nowayout: Watchdog cannot be stopped once started
56 (default=kernel config parameter)
57-------------------------------------------------
58bcm47xx_wdt:
59wdt_time: Watchdog time in seconds. (default=30)
60nowayout: Watchdog cannot be stopped once started
61 (default=kernel config parameter)
62-------------------------------------------------
63bfin_wdt:
64timeout: Watchdog timeout in seconds. (1<=timeout<=((2^32)/SCLK), default=20)
65nowayout: Watchdog cannot be stopped once started
66 (default=kernel config parameter)
67-------------------------------------------------
68coh901327_wdt:
69margin: Watchdog margin in seconds (default 60s)
70-------------------------------------------------
71cpu5wdt:
72port: base address of watchdog card, default is 0x91
73verbose: be verbose, default is 0 (no)
74ticks: count down ticks, default is 10000
75-------------------------------------------------
76cpwd:
77wd0_timeout: Default watchdog0 timeout in 1/10secs
78wd1_timeout: Default watchdog1 timeout in 1/10secs
79wd2_timeout: Default watchdog2 timeout in 1/10secs
80-------------------------------------------------
81davinci_wdt:
82heartbeat: Watchdog heartbeat period in seconds from 1 to 600, default 60
83-------------------------------------------------
84ep93xx_wdt:
85nowayout: Watchdog cannot be stopped once started
86timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=TBD)
87-------------------------------------------------
88eurotechwdt:
89nowayout: Watchdog cannot be stopped once started
90 (default=kernel config parameter)
91io: Eurotech WDT io port (default=0x3f0)
92irq: Eurotech WDT irq (default=10)
93ev: Eurotech WDT event type (default is `int')
94-------------------------------------------------
95gef_wdt:
96nowayout: Watchdog cannot be stopped once started
97 (default=kernel config parameter)
98-------------------------------------------------
99geodewdt:
100timeout: Watchdog timeout in seconds. 1<= timeout <=131, default=60.
101nowayout: Watchdog cannot be stopped once started
102 (default=kernel config parameter)
103-------------------------------------------------
104i6300esb:
105heartbeat: Watchdog heartbeat in seconds. (1<heartbeat<2046, default=30)
106nowayout: Watchdog cannot be stopped once started
107 (default=kernel config parameter)
108-------------------------------------------------
109iTCO_wdt:
110heartbeat: Watchdog heartbeat in seconds.
111 (2<heartbeat<39 (TCO v1) or 613 (TCO v2), default=30)
112nowayout: Watchdog cannot be stopped once started
113 (default=kernel config parameter)
114-------------------------------------------------
115iTCO_vendor_support:
116vendorsupport: iTCO vendor specific support mode, default=0 (none),
117 1=SuperMicro Pent3, 2=SuperMicro Pent4+, 911=Broken SMI BIOS
118-------------------------------------------------
119ib700wdt:
120timeout: Watchdog timeout in seconds. 0<= timeout <=30, default=30.
121nowayout: Watchdog cannot be stopped once started
122 (default=kernel config parameter)
123-------------------------------------------------
124ibmasr:
125nowayout: Watchdog cannot be stopped once started
126 (default=kernel config parameter)
127-------------------------------------------------
128indydog:
129nowayout: Watchdog cannot be stopped once started
130 (default=kernel config parameter)
131-------------------------------------------------
132iop_wdt:
133nowayout: Watchdog cannot be stopped once started
134 (default=kernel config parameter)
135-------------------------------------------------
136it8712f_wdt:
137margin: Watchdog margin in seconds (default 60)
138nowayout: Disable watchdog shutdown on close
139 (default=kernel config parameter)
140-------------------------------------------------
141it87_wdt:
142nogameport: Forbid the activation of game port, default=0
143exclusive: Watchdog exclusive device open, default=1
144timeout: Watchdog timeout in seconds, default=60
145testmode: Watchdog test mode (1 = no reboot), default=0
146nowayout: Watchdog cannot be stopped once started
147 (default=kernel config parameter)
148-------------------------------------------------
149ixp2000_wdt:
150heartbeat: Watchdog heartbeat in seconds (default 60s)
151nowayout: Watchdog cannot be stopped once started
152 (default=kernel config parameter)
153-------------------------------------------------
154ixp4xx_wdt:
155heartbeat: Watchdog heartbeat in seconds (default 60s)
156nowayout: Watchdog cannot be stopped once started
157 (default=kernel config parameter)
158-------------------------------------------------
159ks8695_wdt:
160wdt_time: Watchdog time in seconds. (default=5)
161nowayout: Watchdog cannot be stopped once started
162 (default=kernel config parameter)
163-------------------------------------------------
164machzwd:
165nowayout: Watchdog cannot be stopped once started
166 (default=kernel config parameter)
167action: after watchdog resets, generate:
168 0 = RESET(*) 1 = SMI 2 = NMI 3 = SCI
169-------------------------------------------------
170max63xx_wdt:
171heartbeat: Watchdog heartbeat period in seconds from 1 to 60, default 60
172nowayout: Watchdog cannot be stopped once started
173 (default=kernel config parameter)
174nodelay: Force selection of a timeout setting without initial delay
175 (max6373/74 only, default=0)
176-------------------------------------------------
177mixcomwd:
178nowayout: Watchdog cannot be stopped once started
179 (default=kernel config parameter)
180-------------------------------------------------
181mpc8xxx_wdt:
182timeout: Watchdog timeout in ticks. (0<timeout<65536, default=65535)
183reset: Watchdog Interrupt/Reset Mode. 0 = interrupt, 1 = reset
184nowayout: Watchdog cannot be stopped once started
185 (default=kernel config parameter)
186-------------------------------------------------
187mpcore_wdt:
188mpcore_margin: MPcore timer margin in seconds.
189 (0 < mpcore_margin < 65536, default=60)
190nowayout: Watchdog cannot be stopped once started
191 (default=kernel config parameter)
192mpcore_noboot: MPcore watchdog action, set to 1 to ignore reboots,
193 0 to reboot (default=0
194-------------------------------------------------
195mv64x60_wdt:
196nowayout: Watchdog cannot be stopped once started
197 (default=kernel config parameter)
198-------------------------------------------------
199nuc900_wdt:
200heartbeat: Watchdog heartbeats in seconds.
201 (default = 15)
202nowayout: Watchdog cannot be stopped once started
203 (default=kernel config parameter)
204-------------------------------------------------
205omap_wdt:
206timer_margin: initial watchdog timeout (in seconds)
207-------------------------------------------------
208orion_wdt:
209heartbeat: Initial watchdog heartbeat in seconds
210nowayout: Watchdog cannot be stopped once started
211 (default=kernel config parameter)
212-------------------------------------------------
213pc87413_wdt:
214io: pc87413 WDT I/O port (default: io).
215timeout: Watchdog timeout in minutes (default=timeout).
216nowayout: Watchdog cannot be stopped once started
217 (default=kernel config parameter)
218-------------------------------------------------
219pika_wdt:
220heartbeat: Watchdog heartbeats in seconds. (default = 15)
221nowayout: Watchdog cannot be stopped once started
222 (default=kernel config parameter)
223-------------------------------------------------
224pnx4008_wdt:
225heartbeat: Watchdog heartbeat period in seconds from 1 to 60, default 19
226nowayout: Set to 1 to keep watchdog running after device release
227-------------------------------------------------
228pnx833x_wdt:
229timeout: Watchdog timeout in Mhz. (68Mhz clock), default=2040000000 (30 seconds)
230nowayout: Watchdog cannot be stopped once started
231 (default=kernel config parameter)
232start_enabled: Watchdog is started on module insertion (default=1)
233-------------------------------------------------
234rc32434_wdt:
235timeout: Watchdog timeout value, in seconds (default=20)
236nowayout: Watchdog cannot be stopped once started
237 (default=kernel config parameter)
238-------------------------------------------------
239riowd:
240riowd_timeout: Watchdog timeout in minutes (default=1)
241-------------------------------------------------
242s3c2410_wdt:
243tmr_margin: Watchdog tmr_margin in seconds. (default=15)
244tmr_atboot: Watchdog is started at boot time if set to 1, default=0
245nowayout: Watchdog cannot be stopped once started
246 (default=kernel config parameter)
247soft_noboot: Watchdog action, set to 1 to ignore reboots, 0 to reboot
248debug: Watchdog debug, set to >1 for debug, (default 0)
249-------------------------------------------------
250sa1100_wdt:
251margin: Watchdog margin in seconds (default 60s)
252-------------------------------------------------
253sb_wdog:
254timeout: Watchdog timeout in microseconds (max/default 8388607 or 8.3ish secs)
255-------------------------------------------------
256sbc60xxwdt:
257wdt_stop: SBC60xx WDT 'stop' io port (default 0x45)
258wdt_start: SBC60xx WDT 'start' io port (default 0x443)
259timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30)
260nowayout: Watchdog cannot be stopped once started
261 (default=kernel config parameter)
262-------------------------------------------------
263sbc7240_wdt:
264timeout: Watchdog timeout in seconds. (1<=timeout<=255, default=30)
265nowayout: Disable watchdog when closing device file
266-------------------------------------------------
267sbc8360:
268timeout: Index into timeout table (0-63) (default=27 (60s))
269nowayout: Watchdog cannot be stopped once started
270 (default=kernel config parameter)
271-------------------------------------------------
272sbc_epx_c3:
273nowayout: Watchdog cannot be stopped once started
274 (default=kernel config parameter)
275-------------------------------------------------
276sbc_fitpc2_wdt:
277margin: Watchdog margin in seconds (default 60s)
278nowayout: Watchdog cannot be stopped once started
279-------------------------------------------------
280sc1200wdt:
281isapnp: When set to 0 driver ISA PnP support will be disabled (default=1)
282io: io port
283timeout: range is 0-255 minutes, default is 1
284nowayout: Watchdog cannot be stopped once started
285 (default=kernel config parameter)
286-------------------------------------------------
287sc520_wdt:
288timeout: Watchdog timeout in seconds. (1 <= timeout <= 3600, default=30)
289nowayout: Watchdog cannot be stopped once started
290 (default=kernel config parameter)
291-------------------------------------------------
292sch311x_wdt:
293force_id: Override the detected device ID
294therm_trip: Should a ThermTrip trigger the reset generator
295timeout: Watchdog timeout in seconds. 1<= timeout <=15300, default=60
296nowayout: Watchdog cannot be stopped once started
297 (default=kernel config parameter)
298-------------------------------------------------
299scx200_wdt:
300margin: Watchdog margin in seconds
301nowayout: Disable watchdog shutdown on close
302-------------------------------------------------
303shwdt:
304clock_division_ratio: Clock division ratio. Valid ranges are from 0x5 (1.31ms)
305 to 0x7 (5.25ms). (default=7)
306heartbeat: Watchdog heartbeat in seconds. (1 <= heartbeat <= 3600, default=30
307nowayout: Watchdog cannot be stopped once started
308 (default=kernel config parameter)
309-------------------------------------------------
310smsc37b787_wdt:
311timeout: range is 1-255 units, default is 60
312nowayout: Watchdog cannot be stopped once started
313 (default=kernel config parameter)
314-------------------------------------------------
315softdog:
316soft_margin: Watchdog soft_margin in seconds.
317 (0 < soft_margin < 65536, default=60)
318nowayout: Watchdog cannot be stopped once started
319 (default=kernel config parameter)
320soft_noboot: Softdog action, set to 1 to ignore reboots, 0 to reboot
321 (default=0)
322-------------------------------------------------
323stmp3xxx_wdt:
324heartbeat: Watchdog heartbeat period in seconds from 1 to 4194304, default 19
325-------------------------------------------------
326ts72xx_wdt:
327timeout: Watchdog timeout in seconds. (1 <= timeout <= 8, default=8)
328nowayout: Disable watchdog shutdown on close
329-------------------------------------------------
330twl4030_wdt:
331nowayout: Watchdog cannot be stopped once started
332 (default=kernel config parameter)
333-------------------------------------------------
334txx9wdt:
335timeout: Watchdog timeout in seconds. (0<timeout<N, default=60)
336nowayout: Watchdog cannot be stopped once started
337 (default=kernel config parameter)
338-------------------------------------------------
339w83627hf_wdt:
340wdt_io: w83627hf/thf WDT io port (default 0x2E)
341timeout: Watchdog timeout in seconds. 1 <= timeout <= 255, default=60.
342nowayout: Watchdog cannot be stopped once started
343 (default=kernel config parameter)
344-------------------------------------------------
345w83697hf_wdt:
346wdt_io: w83697hf/hg WDT io port (default 0x2e, 0 = autodetect)
347timeout: Watchdog timeout in seconds. 1<= timeout <=255 (default=60)
348nowayout: Watchdog cannot be stopped once started
349 (default=kernel config parameter)
350early_disable: Watchdog gets disabled at boot time (default=1)
351-------------------------------------------------
352w83697ug_wdt:
353wdt_io: w83697ug/uf WDT io port (default 0x2e)
354timeout: Watchdog timeout in seconds. 1<= timeout <=255 (default=60)
355nowayout: Watchdog cannot be stopped once started
356 (default=kernel config parameter)
357-------------------------------------------------
358w83877f_wdt:
359timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30)
360nowayout: Watchdog cannot be stopped once started
361 (default=kernel config parameter)
362-------------------------------------------------
363w83977f_wdt:
364timeout: Watchdog timeout in seconds (15..7635), default=45)
365testmode: Watchdog testmode (1 = no reboot), default=0
366nowayout: Watchdog cannot be stopped once started
367 (default=kernel config parameter)
368-------------------------------------------------
369wafer5823wdt:
370timeout: Watchdog timeout in seconds. 1 <= timeout <= 255, default=60.
371nowayout: Watchdog cannot be stopped once started
372 (default=kernel config parameter)
373-------------------------------------------------
374wdt285:
375soft_margin: Watchdog timeout in seconds (default=60)
376-------------------------------------------------
377wdt977:
378timeout: Watchdog timeout in seconds (60..15300, default=60)
379testmode: Watchdog testmode (1 = no reboot), default=0
380nowayout: Watchdog cannot be stopped once started
381 (default=kernel config parameter)
382-------------------------------------------------
383wm831x_wdt:
384nowayout: Watchdog cannot be stopped once started
385 (default=kernel config parameter)
386-------------------------------------------------
387wm8350_wdt:
388nowayout: Watchdog cannot be stopped once started
389 (default=kernel config parameter)
390-------------------------------------------------
diff --git a/Documentation/watchdog/wdt.txt b/Documentation/watchdog/wdt.txt
index 03fd756d976d..061c2e35384f 100644
--- a/Documentation/watchdog/wdt.txt
+++ b/Documentation/watchdog/wdt.txt
@@ -14,14 +14,22 @@ reboot will depend on the state of the machines and interrupts. The hardware
14boards physically pull the machine down off their own onboard timers and 14boards physically pull the machine down off their own onboard timers and
15will reboot from almost anything. 15will reboot from almost anything.
16 16
17A second temperature monitoring interface is available on the WDT501P cards 17A second temperature monitoring interface is available on the WDT501P cards.
18This provides /dev/temperature. This is the machine internal temperature in 18This provides /dev/temperature. This is the machine internal temperature in
19degrees Fahrenheit. Each read returns a single byte giving the temperature. 19degrees Fahrenheit. Each read returns a single byte giving the temperature.
20 20
21The third interface logs kernel messages on additional alert events. 21The third interface logs kernel messages on additional alert events.
22 22
23The wdt card cannot be safely probed for. Instead you need to pass 23The ICS ISA-bus wdt card cannot be safely probed for. Instead you need to
24wdt=ioaddr,irq as a boot parameter - eg "wdt=0x240,11". 24pass IO address and IRQ boot parameters. E.g.:
25 wdt.io=0x240 wdt.irq=11
26
27Other "wdt" driver parameters are:
28 heartbeat Watchdog heartbeat in seconds (default 60)
29 nowayout Watchdog cannot be stopped once started (kernel
30 build parameter)
31 tachometer WDT501-P Fan Tachometer support (0=disable, default=0)
32 type WDT501-P Card type (500 or 501, default=500)
25 33
26Features 34Features
27-------- 35--------
@@ -40,4 +48,3 @@ Minor numbers are however allocated for it.
40 48
41 49
42Example Watchdog Driver: see Documentation/watchdog/src/watchdog-simple.c 50Example Watchdog Driver: see Documentation/watchdog/src/watchdog-simple.c
43