aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/00-INDEX2
-rw-r--r--Documentation/ABI/obsolete/sysfs-bus-usb31
-rw-r--r--Documentation/ABI/obsolete/sysfs-class-rfkill29
-rw-r--r--Documentation/ABI/stable/sysfs-class-rfkill67
-rw-r--r--Documentation/ABI/stable/sysfs-devices-node7
-rw-r--r--Documentation/ABI/testing/sysfs-block14
-rw-r--r--Documentation/ABI/testing/sysfs-bus-pci40
-rw-r--r--Documentation/ABI/testing/sysfs-bus-usb39
-rw-r--r--Documentation/ABI/testing/sysfs-class-power20
-rw-r--r--Documentation/ABI/testing/sysfs-devices-memory2
-rw-r--r--Documentation/ABI/testing/sysfs-devices-node7
-rw-r--r--Documentation/ABI/testing/sysfs-devices-platform-_UDC_-gadget9
-rw-r--r--Documentation/ABI/testing/sysfs-devices-power79
-rw-r--r--Documentation/ABI/testing/sysfs-driver-hid-picolcd43
-rw-r--r--Documentation/ABI/testing/sysfs-driver-hid-prodikeys29
-rw-r--r--Documentation/ABI/testing/sysfs-driver-hid-roccat-kone111
-rw-r--r--Documentation/ABI/testing/sysfs-platform-asus-laptop12
-rw-r--r--Documentation/ABI/testing/sysfs-platform-eeepc-laptop10
-rw-r--r--Documentation/ABI/testing/sysfs-power13
-rw-r--r--Documentation/ABI/testing/sysfs-wacom10
-rw-r--r--Documentation/Changes2
-rw-r--r--Documentation/DMA-API-HOWTO.txt (renamed from Documentation/PCI/PCI-DMA-mapping.txt)423
-rw-r--r--Documentation/DMA-API.txt122
-rw-r--r--Documentation/DocBook/Makefile2
-rw-r--r--Documentation/DocBook/device-drivers.tmpl2
-rw-r--r--Documentation/DocBook/deviceiobook.tmpl2
-rw-r--r--Documentation/DocBook/drm.tmpl839
-rw-r--r--Documentation/DocBook/kgdb.tmpl692
-rw-r--r--Documentation/DocBook/libata.tmpl65
-rw-r--r--Documentation/DocBook/mac80211.tmpl3
-rw-r--r--Documentation/DocBook/media-entities.tmpl11
-rw-r--r--Documentation/DocBook/mtdnand.tmpl8
-rw-r--r--Documentation/DocBook/sh.tmpl10
-rw-r--r--Documentation/DocBook/tracepoint.tmpl13
-rw-r--r--Documentation/DocBook/v4l/common.xml2
-rw-r--r--Documentation/DocBook/v4l/compat.xml126
-rw-r--r--Documentation/DocBook/v4l/controls.xml36
-rw-r--r--Documentation/DocBook/v4l/dev-event.xml31
-rw-r--r--Documentation/DocBook/v4l/io.xml21
-rw-r--r--Documentation/DocBook/v4l/pixfmt.xml12
-rw-r--r--Documentation/DocBook/v4l/v4l2.xml3
-rw-r--r--Documentation/DocBook/v4l/videodev2.h.xml10
-rw-r--r--Documentation/DocBook/v4l/vidioc-dqevent.xml131
-rw-r--r--Documentation/DocBook/v4l/vidioc-enuminput.xml2
-rw-r--r--Documentation/DocBook/v4l/vidioc-g-parm.xml2
-rw-r--r--Documentation/DocBook/v4l/vidioc-qbuf.xml54
-rw-r--r--Documentation/DocBook/v4l/vidioc-querybuf.xml7
-rw-r--r--Documentation/DocBook/v4l/vidioc-queryctrl.xml2
-rw-r--r--Documentation/DocBook/v4l/vidioc-reqbufs.xml36
-rw-r--r--Documentation/DocBook/v4l/vidioc-subscribe-event.xml133
-rw-r--r--Documentation/DocBook/writing-an-alsa-driver.tmpl27
-rw-r--r--Documentation/DocBook/writing_usb_driver.tmpl2
-rw-r--r--Documentation/HOWTO115
-rw-r--r--Documentation/IPMI.txt12
-rw-r--r--Documentation/Makefile4
-rw-r--r--Documentation/PCI/pci-error-recovery.txt4
-rw-r--r--Documentation/PCI/pcieaer-howto.txt29
-rw-r--r--Documentation/RCU/00-INDEX10
-rw-r--r--Documentation/RCU/NMI-RCU.txt39
-rw-r--r--Documentation/RCU/RTFP.txt61
-rw-r--r--Documentation/RCU/checklist.txt215
-rw-r--r--Documentation/RCU/lockdep.txt91
-rw-r--r--Documentation/RCU/rcu.txt48
-rw-r--r--Documentation/RCU/stallwarn.txt106
-rw-r--r--Documentation/RCU/torture.txt22
-rw-r--r--Documentation/RCU/trace.txt35
-rw-r--r--Documentation/RCU/whatisRCU.txt22
-rw-r--r--Documentation/Smack.txt2
-rw-r--r--Documentation/SubmitChecklist20
-rw-r--r--Documentation/SubmittingDrivers5
-rw-r--r--Documentation/arm/00-INDEX2
-rw-r--r--Documentation/arm/SA1100/ADSBitsy2
-rw-r--r--Documentation/arm/SPEAr/overview.txt60
-rw-r--r--Documentation/arm/Samsung-S3C24XX/CPUfreq.txt4
-rw-r--r--Documentation/arm/Samsung/Overview.txt86
-rwxr-xr-xDocumentation/arm/Samsung/clksrc-change-registers.awk167
-rw-r--r--Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen2
-rw-r--r--Documentation/arm/memory.txt6
-rw-r--r--Documentation/atomic_ops.txt2
-rw-r--r--Documentation/blackfin/bfin-gpio-notes.txt2
-rw-r--r--Documentation/block/biodoc.txt4
-rw-r--r--Documentation/block/queue-sysfs.txt10
-rw-r--r--Documentation/cachetlb.txt36
-rw-r--r--Documentation/cdrom/ide-cd39
-rw-r--r--Documentation/cgroups/blkio-controller.txt151
-rw-r--r--Documentation/cgroups/cgroup_event_listener.c110
-rw-r--r--Documentation/cgroups/cgroups.txt44
-rw-r--r--Documentation/cgroups/cpusets.txt161
-rw-r--r--Documentation/cgroups/memcg_test.txt49
-rw-r--r--Documentation/cgroups/memory.txt378
-rw-r--r--Documentation/circular-buffers.txt234
-rw-r--r--Documentation/connector/cn_test.c1
-rw-r--r--Documentation/connector/connector.txt2
-rw-r--r--Documentation/console/console.txt2
-rw-r--r--Documentation/cpu-freq/pcc-cpufreq.txt207
-rw-r--r--Documentation/credentials.txt14
-rw-r--r--Documentation/development-process/2.Process29
-rw-r--r--Documentation/development-process/7.AdvancedTopics2
-rw-r--r--Documentation/device-mapper/snapshot.txt44
-rw-r--r--Documentation/devices.txt2
-rw-r--r--Documentation/dontdiff1
-rw-r--r--Documentation/driver-model/platform.txt2
-rw-r--r--Documentation/dvb/ci.txt2
-rw-r--r--Documentation/dvb/contributors.txt2
-rw-r--r--Documentation/dvb/get_dvb_firmware23
-rw-r--r--Documentation/eisa.txt2
-rw-r--r--Documentation/email-clients.txt30
-rw-r--r--Documentation/fault-injection/provoke-crashes.txt38
-rw-r--r--Documentation/fb/efifb.txt (renamed from Documentation/fb/imacfb.txt)14
-rw-r--r--Documentation/feature-removal-schedule.txt214
-rw-r--r--Documentation/filesystems/00-INDEX6
-rw-r--r--Documentation/filesystems/9p.txt18
-rw-r--r--Documentation/filesystems/Locking25
-rw-r--r--Documentation/filesystems/Makefile8
-rw-r--r--Documentation/filesystems/autofs4-mount-control.txt2
-rw-r--r--Documentation/filesystems/ceph.txt140
-rw-r--r--Documentation/filesystems/dentry-locking.txt3
-rw-r--r--Documentation/filesystems/dlmfs.txt2
-rw-r--r--Documentation/filesystems/dnotify.txt39
-rw-r--r--Documentation/filesystems/dnotify_test.c34
-rw-r--r--Documentation/filesystems/ext3.txt15
-rw-r--r--Documentation/filesystems/fiemap.txt12
-rw-r--r--Documentation/filesystems/fuse.txt4
-rw-r--r--Documentation/filesystems/gfs2.txt12
-rw-r--r--Documentation/filesystems/hpfs.txt2
-rw-r--r--Documentation/filesystems/logfs.txt241
-rw-r--r--Documentation/filesystems/nfs/nfs41-server.txt7
-rw-r--r--Documentation/filesystems/nfs/rpc-cache.txt2
-rw-r--r--Documentation/filesystems/nilfs2.txt7
-rw-r--r--Documentation/filesystems/ocfs2.txt7
-rw-r--r--Documentation/filesystems/proc.txt68
-rw-r--r--Documentation/filesystems/sharedsubtree.txt16
-rw-r--r--Documentation/filesystems/smbfs.txt2
-rw-r--r--Documentation/filesystems/squashfs.txt32
-rw-r--r--Documentation/filesystems/sysfs-tagging.txt42
-rw-r--r--Documentation/filesystems/tmpfs.txt16
-rw-r--r--Documentation/filesystems/vfs.txt2
-rw-r--r--Documentation/filesystems/xfs-delayed-logging-design.txt816
-rw-r--r--Documentation/gpio.txt64
-rw-r--r--Documentation/hwmon/abituguru2
-rw-r--r--Documentation/hwmon/adt741142
-rw-r--r--Documentation/hwmon/adt747374
-rw-r--r--Documentation/hwmon/asc7621296
-rw-r--r--Documentation/hwmon/it8753
-rw-r--r--Documentation/hwmon/lm852
-rw-r--r--Documentation/hwmon/lm9022
-rw-r--r--Documentation/i2c/busses/i2c-i80111
-rw-r--r--Documentation/i2c/busses/i2c-parport3
-rw-r--r--Documentation/i2c/busses/i2c-parport-light11
-rw-r--r--Documentation/i2c/smbus-protocol16
-rw-r--r--Documentation/i2c/writing-clients10
-rw-r--r--Documentation/init.txt49
-rw-r--r--Documentation/input/elantech.txt8
-rw-r--r--Documentation/input/joystick.txt2
-rw-r--r--Documentation/input/multi-touch-protocol.txt23
-rw-r--r--Documentation/input/rotary-encoder.txt2
-rw-r--r--Documentation/input/sentelic.txt124
-rw-r--r--Documentation/intel_txt.txt18
-rw-r--r--Documentation/ioctl/ioctl-number.txt2
-rw-r--r--Documentation/isdn/INTERFACE.CAPI9
-rw-r--r--Documentation/isdn/README.gigaset10
-rw-r--r--Documentation/kbuild/kconfig-language.txt2
-rw-r--r--Documentation/kbuild/kconfig.txt2
-rw-r--r--Documentation/kernel-docs.txt10
-rw-r--r--Documentation/kernel-parameters.txt124
-rw-r--r--Documentation/kobject.txt62
-rw-r--r--Documentation/kprobes.txt203
-rw-r--r--Documentation/kvm/api.txt220
-rw-r--r--Documentation/kvm/cpuid.txt42
-rw-r--r--Documentation/kvm/mmu.txt304
-rw-r--r--Documentation/laptops/00-INDEX6
-rw-r--r--Documentation/laptops/Makefile8
-rw-r--r--Documentation/laptops/dslm.c166
-rw-r--r--Documentation/laptops/laptop-mode.txt172
-rw-r--r--Documentation/laptops/thinkpad-acpi.txt70
-rw-r--r--Documentation/lguest/lguest.c3
-rw-r--r--Documentation/md.txt2
-rw-r--r--Documentation/memory-barriers.txt20
-rw-r--r--Documentation/netlabel/lsm_interface.txt2
-rw-r--r--Documentation/networking/00-INDEX2
-rw-r--r--Documentation/networking/Makefile2
-rw-r--r--Documentation/networking/caif/Linux-CAIF.txt212
-rw-r--r--Documentation/networking/caif/README109
-rw-r--r--Documentation/networking/cxacru-cf.py48
-rw-r--r--Documentation/networking/cxacru.txt16
-rw-r--r--Documentation/networking/dccp.txt6
-rw-r--r--Documentation/networking/ifenslave.c2
-rw-r--r--Documentation/networking/ip-sysctl.txt89
-rwxr-xr-xDocumentation/networking/ixgbevf.txt90
-rw-r--r--Documentation/networking/l2tp.txt247
-rw-r--r--Documentation/networking/packet_mmap.txt12
-rw-r--r--Documentation/networking/regulatory.txt24
-rw-r--r--Documentation/networking/skfp.txt2
-rw-r--r--Documentation/networking/stmmac.txt143
-rw-r--r--Documentation/networking/tcp-thin.txt47
-rw-r--r--Documentation/networking/timestamping.txt76
-rw-r--r--Documentation/networking/timestamping/Makefile11
-rw-r--r--Documentation/networking/timestamping/timestamping.c12
-rw-r--r--Documentation/networking/x25-iface.txt16
-rw-r--r--Documentation/oops-tracing.txt4
-rw-r--r--Documentation/padata.txt107
-rw-r--r--Documentation/pcmcia/driver-changes.txt13
-rw-r--r--Documentation/pcmcia/locking.txt118
-rw-r--r--Documentation/pnp.txt13
-rw-r--r--Documentation/power/devices.txt847
-rw-r--r--Documentation/power/pci.txt1258
-rw-r--r--Documentation/power/pm_qos_interface.txt48
-rw-r--r--Documentation/power/regulator/consumer.txt10
-rw-r--r--Documentation/power/regulator/machine.txt2
-rw-r--r--Documentation/power/regulator/overview.txt6
-rw-r--r--Documentation/power/runtime_pm.txt95
-rw-r--r--Documentation/power/userland-swsusp.txt4
-rw-r--r--Documentation/powerpc/booting-without-of.txt2
-rw-r--r--Documentation/powerpc/dts-bindings/4xx/reboot.txt18
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt22
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/can.txt53
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt54
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/dma.txt8
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/i2c.txt30
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/mpc5121-psc.txt70
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/mpc5200.txt9
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/spi.txt7
-rw-r--r--Documentation/powerpc/dts-bindings/xilinx.txt2
-rw-r--r--Documentation/powerpc/phyp-assisted-dump.txt2
-rw-r--r--Documentation/powerpc/ptrace.txt134
-rw-r--r--Documentation/rbtree.txt58
-rw-r--r--Documentation/rfkill.txt44
-rw-r--r--Documentation/rt-mutex-design.txt2
-rw-r--r--Documentation/s390/CommonIO6
-rw-r--r--Documentation/s390/driver-model.txt4
-rw-r--r--Documentation/s390/kvm.txt2
-rw-r--r--Documentation/scheduler/sched-design-CFS.txt54
-rw-r--r--Documentation/scheduler/sched-rt-group.txt20
-rw-r--r--Documentation/scsi/ChangeLog.lpfc14
-rw-r--r--Documentation/scsi/ChangeLog.megaraid_sas16
-rw-r--r--Documentation/scsi/FlashPoint.txt2
-rw-r--r--Documentation/scsi/dtc3x80.txt2
-rw-r--r--Documentation/scsi/ncr53c8xx.txt2
-rw-r--r--Documentation/scsi/osst.txt2
-rw-r--r--Documentation/scsi/scsi_fc_transport.txt4
-rw-r--r--Documentation/scsi/sym53c8xx_2.txt2
-rw-r--r--Documentation/serial/tty.txt4
-rw-r--r--Documentation/sound/alsa/ALSA-Configuration.txt58
-rw-r--r--Documentation/sound/alsa/HD-Audio-Models.txt5
-rw-r--r--Documentation/sound/alsa/HD-Audio.txt47
-rw-r--r--Documentation/sound/alsa/soc/dapm.txt4
-rw-r--r--Documentation/sound/alsa/soc/machine.txt2
-rw-r--r--Documentation/sound/alsa/soc/overview.txt2
-rw-r--r--Documentation/sparse.txt4
-rw-r--r--Documentation/spi/ep93xx_spi95
-rw-r--r--Documentation/spi/spidev_fdx.c4
-rw-r--r--Documentation/spi/spidev_test.c2
-rw-r--r--Documentation/stable_kernel_rules.txt9
-rw-r--r--Documentation/sysctl/net.txt10
-rw-r--r--Documentation/sysctl/vm.txt30
-rw-r--r--Documentation/sysfs-rules.txt2
-rw-r--r--Documentation/sysrq.txt14
-rw-r--r--Documentation/timers/00-INDEX2
-rw-r--r--Documentation/timers/Makefile8
-rw-r--r--Documentation/timers/hpet.txt273
-rw-r--r--Documentation/timers/hpet_example.c267
-rw-r--r--Documentation/trace/events.txt11
-rw-r--r--Documentation/trace/ftrace-design.txt5
-rw-r--r--Documentation/trace/ftrace.txt52
-rw-r--r--Documentation/trace/kprobetrace.txt61
-rw-r--r--Documentation/usb/WUSB-Design-overview.txt2
-rw-r--r--Documentation/usb/bulk-streams.txt78
-rw-r--r--Documentation/usb/dma.txt22
-rw-r--r--Documentation/usb/error-codes.txt6
-rw-r--r--Documentation/usb/gadget_hid.txt445
-rw-r--r--Documentation/usb/power-management.txt250
-rw-r--r--Documentation/usb/usb-serial.txt29
-rw-r--r--Documentation/video4linux/CARDLIST.bttv2
-rw-r--r--Documentation/video4linux/CARDLIST.cx238851
-rw-r--r--Documentation/video4linux/CARDLIST.cx881
-rw-r--r--Documentation/video4linux/CARDLIST.em28xx4
-rw-r--r--Documentation/video4linux/CARDLIST.saa71344
-rw-r--r--Documentation/video4linux/CARDLIST.tuner1
-rw-r--r--Documentation/video4linux/README.tlg230047
-rw-r--r--Documentation/video4linux/extract_xc3028.pl817
-rw-r--r--Documentation/video4linux/gspca.txt30
-rw-r--r--Documentation/video4linux/sh_mobile_ceu_camera.txt80
-rw-r--r--Documentation/video4linux/v4l2-framework.txt249
-rw-r--r--Documentation/video4linux/videobuf360
-rw-r--r--Documentation/vm/00-INDEX16
-rw-r--r--Documentation/vm/Makefile2
-rw-r--r--Documentation/vm/hugepage-mmap.c91
-rw-r--r--Documentation/vm/hugepage-shm.c98
-rw-r--r--Documentation/vm/hugetlbpage.txt169
-rw-r--r--Documentation/vm/map_hugetlb.c8
-rw-r--r--Documentation/vm/numa186
-rw-r--r--Documentation/vm/numa_memory_policy.txt4
-rw-r--r--Documentation/vm/slub.txt1
-rw-r--r--Documentation/volatile-considered-harmful.txt6
-rw-r--r--Documentation/voyager.txt95
-rw-r--r--Documentation/w1/w1.generic2
-rw-r--r--Documentation/watchdog/00-INDEX5
-rw-r--r--Documentation/watchdog/src/watchdog-simple.c3
-rw-r--r--Documentation/watchdog/src/watchdog-test.c8
-rw-r--r--Documentation/watchdog/watchdog-api.txt5
-rw-r--r--Documentation/watchdog/watchdog-parameters.txt390
-rw-r--r--Documentation/watchdog/wdt.txt15
-rw-r--r--Documentation/x86/x86_64/boot-options.txt20
303 files changed, 15379 insertions, 3756 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 06b982affe76..dd10b51b4e65 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -250,6 +250,8 @@ numastat.txt
250 - info on how to read Numa policy hit/miss statistics in sysfs. 250 - info on how to read Numa policy hit/miss statistics in sysfs.
251oops-tracing.txt 251oops-tracing.txt
252 - how to decode those nasty internal kernel error dump messages. 252 - how to decode those nasty internal kernel error dump messages.
253padata.txt
254 - An introduction to the "padata" parallel execution API
253parisc/ 255parisc/
254 - directory with info on using Linux on PA-RISC architecture. 256 - directory with info on using Linux on PA-RISC architecture.
255parport.txt 257parport.txt
diff --git a/Documentation/ABI/obsolete/sysfs-bus-usb b/Documentation/ABI/obsolete/sysfs-bus-usb
new file mode 100644
index 000000000000..bd096d33fbc7
--- /dev/null
+++ b/Documentation/ABI/obsolete/sysfs-bus-usb
@@ -0,0 +1,31 @@
1What: /sys/bus/usb/devices/.../power/level
2Date: March 2007
3KernelVersion: 2.6.21
4Contact: Alan Stern <stern@rowland.harvard.edu>
5Description:
6 Each USB device directory will contain a file named
7 power/level. This file holds a power-level setting for
8 the device, either "on" or "auto".
9
10 "on" means that the device is not allowed to autosuspend,
11 although normal suspends for system sleep will still
12 be honored. "auto" means the device will autosuspend
13 and autoresume in the usual manner, according to the
14 capabilities of its driver.
15
16 During normal use, devices should be left in the "auto"
17 level. The "on" level is meant for administrative uses.
18 If you want to suspend a device immediately but leave it
19 free to wake up in response to I/O requests, you should
20 write "0" to power/autosuspend.
21
22 Device not capable of proper suspend and resume should be
23 left in the "on" level. Although the USB spec requires
24 devices to support suspend/resume, many of them do not.
25 In fact so many don't that by default, the USB core
26 initializes all non-hub devices in the "on" level. Some
27 drivers may change this setting when they are bound.
28
29 This file is deprecated and will be removed after 2010.
30 Use the power/control file instead; it does exactly the
31 same thing.
diff --git a/Documentation/ABI/obsolete/sysfs-class-rfkill b/Documentation/ABI/obsolete/sysfs-class-rfkill
new file mode 100644
index 000000000000..4201d5b05515
--- /dev/null
+++ b/Documentation/ABI/obsolete/sysfs-class-rfkill
@@ -0,0 +1,29 @@
1rfkill - radio frequency (RF) connector kill switch support
2
3For details to this subsystem look at Documentation/rfkill.txt.
4
5What: /sys/class/rfkill/rfkill[0-9]+/state
6Date: 09-Jul-2007
7KernelVersion v2.6.22
8Contact: linux-wireless@vger.kernel.org
9Description: Current state of the transmitter.
10 This file is deprecated and sheduled to be removed in 2014,
11 because its not possible to express the 'soft and hard block'
12 state of the rfkill driver.
13Values: A numeric value.
14 0: RFKILL_STATE_SOFT_BLOCKED
15 transmitter is turned off by software
16 1: RFKILL_STATE_UNBLOCKED
17 transmitter is (potentially) active
18 2: RFKILL_STATE_HARD_BLOCKED
19 transmitter is forced off by something outside of
20 the driver's control.
21
22What: /sys/class/rfkill/rfkill[0-9]+/claim
23Date: 09-Jul-2007
24KernelVersion v2.6.22
25Contact: linux-wireless@vger.kernel.org
26Description: This file is deprecated because there no longer is a way to
27 claim just control over a single rfkill instance.
28 This file is scheduled to be removed in 2012.
29Values: 0: Kernel handles events
diff --git a/Documentation/ABI/stable/sysfs-class-rfkill b/Documentation/ABI/stable/sysfs-class-rfkill
new file mode 100644
index 000000000000..097f522c33bb
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-class-rfkill
@@ -0,0 +1,67 @@
1rfkill - radio frequency (RF) connector kill switch support
2
3For details to this subsystem look at Documentation/rfkill.txt.
4
5For the deprecated /sys/class/rfkill/*/state and
6/sys/class/rfkill/*/claim knobs of this interface look in
7Documentation/ABI/obsolete/sysfs-class-rfkill.
8
9What: /sys/class/rfkill
10Date: 09-Jul-2007
11KernelVersion: v2.6.22
12Contact: linux-wireless@vger.kernel.org,
13Description: The rfkill class subsystem folder.
14 Each registered rfkill driver is represented by an rfkillX
15 subfolder (X being an integer > 0).
16
17
18What: /sys/class/rfkill/rfkill[0-9]+/name
19Date: 09-Jul-2007
20KernelVersion v2.6.22
21Contact: linux-wireless@vger.kernel.org
22Description: Name assigned by driver to this key (interface or driver name).
23Values: arbitrary string.
24
25
26What: /sys/class/rfkill/rfkill[0-9]+/type
27Date: 09-Jul-2007
28KernelVersion v2.6.22
29Contact: linux-wireless@vger.kernel.org
30Description: Driver type string ("wlan", "bluetooth", etc).
31Values: See include/linux/rfkill.h.
32
33
34What: /sys/class/rfkill/rfkill[0-9]+/persistent
35Date: 09-Jul-2007
36KernelVersion v2.6.22
37Contact: linux-wireless@vger.kernel.org
38Description: Whether the soft blocked state is initialised from non-volatile
39 storage at startup.
40Values: A numeric value.
41 0: false
42 1: true
43
44
45What: /sys/class/rfkill/rfkill[0-9]+/hard
46Date: 12-March-2010
47KernelVersion v2.6.34
48Contact: linux-wireless@vger.kernel.org
49Description: Current hardblock state. This file is read only.
50Values: A numeric value.
51 0: inactive
52 The transmitter is (potentially) active.
53 1: active
54 The transmitter is forced off by something outside of
55 the driver's control.
56
57
58What: /sys/class/rfkill/rfkill[0-9]+/soft
59Date: 12-March-2010
60KernelVersion v2.6.34
61Contact: linux-wireless@vger.kernel.org
62Description: Current softblock state. This file is read and write.
63Values: A numeric value.
64 0: inactive
65 The transmitter is (potentially) active.
66 1: active
67 The transmitter is turned off by software.
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
new file mode 100644
index 000000000000..49b82cad7003
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -0,0 +1,7 @@
1What: /sys/devices/system/node/nodeX
2Date: October 2002
3Contact: Linux Memory Management list <linux-mm@kvack.org>
4Description:
5 When CONFIG_NUMA is enabled, this is a directory containing
6 information on node X such as what CPUs are local to the
7 node.
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index d2f90334bb93..4873c759d535 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -128,3 +128,17 @@ Description:
128 preferred request size for workloads where sustained 128 preferred request size for workloads where sustained
129 throughput is desired. If no optimal I/O size is 129 throughput is desired. If no optimal I/O size is
130 reported this file contains 0. 130 reported this file contains 0.
131
132What: /sys/block/<disk>/queue/nomerges
133Date: January 2010
134Contact:
135Description:
136 Standard I/O elevator operations include attempts to
137 merge contiguous I/Os. For known random I/O loads these
138 attempts will always fail and result in extra cycles
139 being spent in the kernel. This allows one to turn off
140 this behavior on one of two ways: When set to 1, complex
141 merge checks are disabled, but the simple one-shot merges
142 with the previous I/O request are enabled. When set to 2,
143 all merge tries are disabled. The default value is 0 -
144 which enables all types of merge tries.
diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 25be3250f7d6..428676cfa61e 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -133,6 +133,46 @@ Description:
133 The symbolic link points to the PCI device sysfs entry of the 133 The symbolic link points to the PCI device sysfs entry of the
134 Physical Function this device associates with. 134 Physical Function this device associates with.
135 135
136
137What: /sys/bus/pci/slots/...
138Date: April 2005 (possibly older)
139KernelVersion: 2.6.12 (possibly older)
140Contact: linux-pci@vger.kernel.org
141Description:
142 When the appropriate driver is loaded, it will create a
143 directory per claimed physical PCI slot in
144 /sys/bus/pci/slots/. The names of these directories are
145 specific to the driver, which in turn, are specific to the
146 platform, but in general, should match the label on the
147 machine's physical chassis.
148
149 The drivers that can create slot directories include the
150 PCI hotplug drivers, and as of 2.6.27, the pci_slot driver.
151
152 The slot directories contain, at a minimum, a file named
153 'address' which contains the PCI bus:device:function tuple.
154 Other files may appear as well, but are specific to the
155 driver.
156
157What: /sys/bus/pci/slots/.../function[0-7]
158Date: March 2010
159KernelVersion: 2.6.35
160Contact: linux-pci@vger.kernel.org
161Description:
162 If PCI slot directories (as described above) are created,
163 and the physical slot is actually populated with a device,
164 symbolic links in the slot directory pointing to the
165 device's PCI functions are created as well.
166
167What: /sys/bus/pci/devices/.../slot
168Date: March 2010
169KernelVersion: 2.6.35
170Contact: linux-pci@vger.kernel.org
171Description:
172 If PCI slot directories (as described above) are created,
173 a symbolic link pointing to the slot directory will be
174 created as well.
175
136What: /sys/bus/pci/slots/.../module 176What: /sys/bus/pci/slots/.../module
137Date: June 2009 177Date: June 2009
138Contact: linux-pci@vger.kernel.org 178Contact: linux-pci@vger.kernel.org
diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb
index a07c0f366f91..294aa864a60a 100644
--- a/Documentation/ABI/testing/sysfs-bus-usb
+++ b/Documentation/ABI/testing/sysfs-bus-usb
@@ -14,34 +14,6 @@ Description:
14 The autosuspend delay for newly-created devices is set to 14 The autosuspend delay for newly-created devices is set to
15 the value of the usbcore.autosuspend module parameter. 15 the value of the usbcore.autosuspend module parameter.
16 16
17What: /sys/bus/usb/devices/.../power/level
18Date: March 2007
19KernelVersion: 2.6.21
20Contact: Alan Stern <stern@rowland.harvard.edu>
21Description:
22 Each USB device directory will contain a file named
23 power/level. This file holds a power-level setting for
24 the device, either "on" or "auto".
25
26 "on" means that the device is not allowed to autosuspend,
27 although normal suspends for system sleep will still
28 be honored. "auto" means the device will autosuspend
29 and autoresume in the usual manner, according to the
30 capabilities of its driver.
31
32 During normal use, devices should be left in the "auto"
33 level. The "on" level is meant for administrative uses.
34 If you want to suspend a device immediately but leave it
35 free to wake up in response to I/O requests, you should
36 write "0" to power/autosuspend.
37
38 Device not capable of proper suspend and resume should be
39 left in the "on" level. Although the USB spec requires
40 devices to support suspend/resume, many of them do not.
41 In fact so many don't that by default, the USB core
42 initializes all non-hub devices in the "on" level. Some
43 drivers may change this setting when they are bound.
44
45What: /sys/bus/usb/devices/.../power/persist 17What: /sys/bus/usb/devices/.../power/persist
46Date: May 2007 18Date: May 2007
47KernelVersion: 2.6.23 19KernelVersion: 2.6.23
@@ -159,3 +131,14 @@ Description:
159 device. This is useful to ensure auto probing won't 131 device. This is useful to ensure auto probing won't
160 match the driver to the device. For example: 132 match the driver to the device. For example:
161 # echo "046d c315" > /sys/bus/usb/drivers/foo/remove_id 133 # echo "046d c315" > /sys/bus/usb/drivers/foo/remove_id
134
135What: /sys/bus/usb/device/.../avoid_reset_quirk
136Date: December 2009
137Contact: Oliver Neukum <oliver@neukum.org>
138Description:
139 Writing 1 to this file tells the kernel that this
140 device will morph into another mode when it is reset.
141 Drivers will not use reset for error handling for
142 such devices.
143Users:
144 usb_modeswitch
diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power
new file mode 100644
index 000000000000..78c7baca3587
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-power
@@ -0,0 +1,20 @@
1What: /sys/class/power/ds2760-battery.*/charge_now
2Date: May 2010
3KernelVersion: 2.6.35
4Contact: Daniel Mack <daniel@caiaq.de>
5Description:
6 This file is writeable and can be used to set the current
7 coloumb counter value inside the battery monitor chip. This
8 is needed for unavoidable corrections of aging batteries.
9 A userspace daemon can monitor the battery charging logic
10 and once the counter drops out of considerable bounds, take
11 appropriate action.
12
13What: /sys/class/power/ds2760-battery.*/charge_full
14Date: May 2010
15KernelVersion: 2.6.35
16Contact: Daniel Mack <daniel@caiaq.de>
17Description:
18 This file is writeable and can be used to set the assumed
19 battery 'full level'. As batteries age, this value has to be
20 amended over time.
diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory
index bf1627b02a03..aba7d989208c 100644
--- a/Documentation/ABI/testing/sysfs-devices-memory
+++ b/Documentation/ABI/testing/sysfs-devices-memory
@@ -43,7 +43,7 @@ Date: September 2008
43Contact: Badari Pulavarty <pbadari@us.ibm.com> 43Contact: Badari Pulavarty <pbadari@us.ibm.com>
44Description: 44Description:
45 The file /sys/devices/system/memory/memoryX/state 45 The file /sys/devices/system/memory/memoryX/state
46 is read-write. When read, it's contents show the 46 is read-write. When read, its contents show the
47 online/offline state of the memory section. When written, 47 online/offline state of the memory section. When written,
48 root can toggle the the online/offline state of a removable 48 root can toggle the the online/offline state of a removable
49 memory section (see removable file description above) 49 memory section (see removable file description above)
diff --git a/Documentation/ABI/testing/sysfs-devices-node b/Documentation/ABI/testing/sysfs-devices-node
new file mode 100644
index 000000000000..453a210c3ceb
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-node
@@ -0,0 +1,7 @@
1What: /sys/devices/system/node/nodeX/compact
2Date: February 2010
3Contact: Mel Gorman <mel@csn.ul.ie>
4Description:
5 When this file is written to, all memory within that node
6 will be compacted. When it completes, memory will be freed
7 into blocks which have as many contiguous pages as possible
diff --git a/Documentation/ABI/testing/sysfs-devices-platform-_UDC_-gadget b/Documentation/ABI/testing/sysfs-devices-platform-_UDC_-gadget
new file mode 100644
index 000000000000..34034027b13c
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-platform-_UDC_-gadget
@@ -0,0 +1,9 @@
1What: /sys/devices/platform/_UDC_/gadget/suspended
2Date: April 2010
3Contact: Fabien Chouteau <fabien.chouteau@barco.com>
4Description:
5 Show the suspend state of an USB composite gadget.
6 1 -> suspended
7 0 -> resumed
8
9 (_UDC_ is the name of the USB Device Controller driver)
diff --git a/Documentation/ABI/testing/sysfs-devices-power b/Documentation/ABI/testing/sysfs-devices-power
new file mode 100644
index 000000000000..6123c523bfd7
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-power
@@ -0,0 +1,79 @@
1What: /sys/devices/.../power/
2Date: January 2009
3Contact: Rafael J. Wysocki <rjw@sisk.pl>
4Description:
5 The /sys/devices/.../power directory contains attributes
6 allowing the user space to check and modify some power
7 management related properties of given device.
8
9What: /sys/devices/.../power/wakeup
10Date: January 2009
11Contact: Rafael J. Wysocki <rjw@sisk.pl>
12Description:
13 The /sys/devices/.../power/wakeup attribute allows the user
14 space to check if the device is enabled to wake up the system
15 from sleep states, such as the memory sleep state (suspend to
16 RAM) and hibernation (suspend to disk), and to enable or disable
17 it to do that as desired.
18
19 Some devices support "wakeup" events, which are hardware signals
20 used to activate the system from a sleep state. Such devices
21 have one of the following two values for the sysfs power/wakeup
22 file:
23
24 + "enabled\n" to issue the events;
25 + "disabled\n" not to do so;
26
27 In that cases the user space can change the setting represented
28 by the contents of this file by writing either "enabled", or
29 "disabled" to it.
30
31 For the devices that are not capable of generating system wakeup
32 events this file contains "\n". In that cases the user space
33 cannot modify the contents of this file and the device cannot be
34 enabled to wake up the system.
35
36What: /sys/devices/.../power/control
37Date: January 2009
38Contact: Rafael J. Wysocki <rjw@sisk.pl>
39Description:
40 The /sys/devices/.../power/control attribute allows the user
41 space to control the run-time power management of the device.
42
43 All devices have one of the following two values for the
44 power/control file:
45
46 + "auto\n" to allow the device to be power managed at run time;
47 + "on\n" to prevent the device from being power managed;
48
49 The default for all devices is "auto", which means that they may
50 be subject to automatic power management, depending on their
51 drivers. Changing this attribute to "on" prevents the driver
52 from power managing the device at run time. Doing that while
53 the device is suspended causes it to be woken up.
54
55What: /sys/devices/.../power/async
56Date: January 2009
57Contact: Rafael J. Wysocki <rjw@sisk.pl>
58Description:
59 The /sys/devices/.../async attribute allows the user space to
60 enable or diasble the device's suspend and resume callbacks to
61 be executed asynchronously (ie. in separate threads, in parallel
62 with the main suspend/resume thread) during system-wide power
63 transitions (eg. suspend to RAM, hibernation).
64
65 All devices have one of the following two values for the
66 power/async file:
67
68 + "enabled\n" to permit the asynchronous suspend/resume;
69 + "disabled\n" to forbid it;
70
71 The value of this attribute may be changed by writing either
72 "enabled", or "disabled" to it.
73
74 It generally is unsafe to permit the asynchronous suspend/resume
75 of a device unless it is certain that all of the PM dependencies
76 of the device are known to the PM core. However, for some
77 devices this attribute is set to "enabled" by bus type code or
78 device drivers and in that cases it should be safe to leave the
79 default value.
diff --git a/Documentation/ABI/testing/sysfs-driver-hid-picolcd b/Documentation/ABI/testing/sysfs-driver-hid-picolcd
new file mode 100644
index 000000000000..08579e7e1e89
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-hid-picolcd
@@ -0,0 +1,43 @@
1What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/operation_mode
2Date: March 2010
3Contact: Bruno Prémont <bonbons@linux-vserver.org>
4Description: Make it possible to switch the PicoLCD device between LCD
5 (firmware) and bootloader (flasher) operation modes.
6
7 Reading: returns list of available modes, the active mode being
8 enclosed in brackets ('[' and ']')
9
10 Writing: causes operation mode switch. Permitted values are
11 the non-active mode names listed when read.
12
13 Note: when switching mode the current PicoLCD HID device gets
14 disconnected and reconnects after above delay (see attribute
15 operation_mode_delay for its value).
16
17
18What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/operation_mode_delay
19Date: April 2010
20Contact: Bruno Prémont <bonbons@linux-vserver.org>
21Description: Delay PicoLCD waits before restarting in new mode when
22 operation_mode has changed.
23
24 Reading/Writing: It is expressed in ms and permitted range is
25 0..30000ms.
26
27
28What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/fb_update_rate
29Date: March 2010
30Contact: Bruno Prémont <bonbons@linux-vserver.org>
31Description: Make it possible to adjust defio refresh rate.
32
33 Reading: returns list of available refresh rates (expressed in Hz),
34 the active refresh rate being enclosed in brackets ('[' and ']')
35
36 Writing: accepts new refresh rate expressed in integer Hz
37 within permitted rates.
38
39 Note: As device can barely do 2 complete refreshes a second
40 it only makes sense to adjust this value if only one or two
41 tiles get changed and it's not appropriate to expect the application
42 to flush it's tiny changes explicitely at higher than default rate.
43
diff --git a/Documentation/ABI/testing/sysfs-driver-hid-prodikeys b/Documentation/ABI/testing/sysfs-driver-hid-prodikeys
new file mode 100644
index 000000000000..05d988c29a83
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-hid-prodikeys
@@ -0,0 +1,29 @@
1What: /sys/bus/hid/drivers/prodikeys/.../channel
2Date: April 2010
3KernelVersion: 2.6.34
4Contact: Don Prince <dhprince.devel@yahoo.co.uk>
5Description:
6 Allows control (via software) the midi channel to which
7 that the pc-midi keyboard will output.midi data.
8 Range: 0..15
9 Type: Read/write
10What: /sys/bus/hid/drivers/prodikeys/.../sustain
11Date: April 2010
12KernelVersion: 2.6.34
13Contact: Don Prince <dhprince.devel@yahoo.co.uk>
14Description:
15 Allows control (via software) the sustain duration of a
16 note held by the pc-midi driver.
17 0 means sustain mode is disabled.
18 Range: 0..5000 (milliseconds)
19 Type: Read/write
20What: /sys/bus/hid/drivers/prodikeys/.../octave
21Date: April 2010
22KernelVersion: 2.6.34
23Contact: Don Prince <dhprince.devel@yahoo.co.uk>
24Description:
25 Controls the octave shift modifier in the pc-midi driver.
26 The octave can be shifted via software up/down 2 octaves.
27 0 means the no ocatve shift.
28 Range: -2..2 (minus 2 to plus 2)
29 Type: Read/Write
diff --git a/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone b/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone
new file mode 100644
index 000000000000..88340a23ce91
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone
@@ -0,0 +1,111 @@
1What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/actual_dpi
2Date: March 2010
3Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
4Description: It is possible to switch the dpi setting of the mouse with the
5 press of a button.
6 When read, this file returns the raw number of the actual dpi
7 setting reported by the mouse. This number has to be further
8 processed to receive the real dpi value.
9
10 VALUE DPI
11 1 800
12 2 1200
13 3 1600
14 4 2000
15 5 2400
16 6 3200
17
18 This file is readonly.
19
20What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/actual_profile
21Date: March 2010
22Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
23Description: When read, this file returns the number of the actual profile.
24 This file is readonly.
25
26What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/firmware_version
27Date: March 2010
28Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
29Description: When read, this file returns the raw integer version number of the
30 firmware reported by the mouse. Using the integer value eases
31 further usage in other programs. To receive the real version
32 number the decimal point has to be shifted 2 positions to the
33 left. E.g. a returned value of 138 means 1.38
34 This file is readonly.
35
36What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/kone_driver_version
37Date: March 2010
38Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
39Description: When read, this file returns the driver version.
40 The format of the string is "v<major>.<minor>.<patchlevel>".
41 This attribute is used by the userland tools to find the sysfs-
42 paths of installed kone-mice and determine the capabilites of
43 the driver. Versions of this driver for old kernels replace
44 usbhid instead of generic-usb. The way to scan for this file
45 has been chosen to provide a consistent way for all supported
46 kernel versions.
47 This file is readonly.
48
49What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/profile[1-5]
50Date: March 2010
51Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
52Description: The mouse can store 5 profiles which can be switched by the
53 press of a button. A profile holds informations like button
54 mappings, sensitivity, the colors of the 5 leds and light
55 effects.
56 When read, these files return the respective profile. The
57 returned data is 975 bytes in size.
58 When written, this file lets one write the respective profile
59 data back to the mouse. The data has to be 975 bytes long.
60 The mouse will reject invalid data, whereas the profile number
61 stored in the profile doesn't need to fit the number of the
62 store.
63
64What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/settings
65Date: March 2010
66Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
67Description: When read, this file returns the settings stored in the mouse.
68 The size of the data is 36 bytes and holds information like the
69 startup_profile, tcu state and calibration_data.
70 When written, this file lets write settings back to the mouse.
71 The data has to be 36 bytes long. The mouse will reject invalid
72 data.
73
74What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/startup_profile
75Date: March 2010
76Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
77Description: The integer value of this attribute ranges from 1 to 5.
78 When read, this attribute returns the number of the profile
79 that's active when the mouse is powered on.
80 When written, this file sets the number of the startup profile
81 and the mouse activates this profile immediately.
82
83What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/tcu
84Date: March 2010
85Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
86Description: The mouse has a "Tracking Control Unit" which lets the user
87 calibrate the laser power to fit the mousepad surface.
88 When read, this file returns the current state of the TCU,
89 where 0 means off and 1 means on.
90 Writing 0 in this file will switch the TCU off.
91 Writing 1 in this file will start the calibration which takes
92 around 6 seconds to complete and activates the TCU.
93
94What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/weight
95Date: March 2010
96Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
97Description: The mouse can be equipped with one of four supplied weights
98 ranging from 5 to 20 grams which are recognized by the mouse
99 and its value can be read out. When read, this file returns the
100 raw value returned by the mouse which eases further processing
101 in other software.
102 The values map to the weights as follows:
103
104 VALUE WEIGHT
105 0 none
106 1 5g
107 2 10g
108 3 15g
109 4 20g
110
111 This file is readonly.
diff --git a/Documentation/ABI/testing/sysfs-platform-asus-laptop b/Documentation/ABI/testing/sysfs-platform-asus-laptop
index a1cb660c50cf..1d775390e856 100644
--- a/Documentation/ABI/testing/sysfs-platform-asus-laptop
+++ b/Documentation/ABI/testing/sysfs-platform-asus-laptop
@@ -1,4 +1,4 @@
1What: /sys/devices/platform/asus-laptop/display 1What: /sys/devices/platform/asus_laptop/display
2Date: January 2007 2Date: January 2007
3KernelVersion: 2.6.20 3KernelVersion: 2.6.20
4Contact: "Corentin Chary" <corentincj@iksaif.net> 4Contact: "Corentin Chary" <corentincj@iksaif.net>
@@ -13,7 +13,7 @@ Description:
13 Ex: - 0 (0000b) means no display 13 Ex: - 0 (0000b) means no display
14 - 3 (0011b) CRT+LCD. 14 - 3 (0011b) CRT+LCD.
15 15
16What: /sys/devices/platform/asus-laptop/gps 16What: /sys/devices/platform/asus_laptop/gps
17Date: January 2007 17Date: January 2007
18KernelVersion: 2.6.20 18KernelVersion: 2.6.20
19Contact: "Corentin Chary" <corentincj@iksaif.net> 19Contact: "Corentin Chary" <corentincj@iksaif.net>
@@ -21,7 +21,7 @@ Description:
21 Control the gps device. 1 means on, 0 means off. 21 Control the gps device. 1 means on, 0 means off.
22Users: Lapsus 22Users: Lapsus
23 23
24What: /sys/devices/platform/asus-laptop/ledd 24What: /sys/devices/platform/asus_laptop/ledd
25Date: January 2007 25Date: January 2007
26KernelVersion: 2.6.20 26KernelVersion: 2.6.20
27Contact: "Corentin Chary" <corentincj@iksaif.net> 27Contact: "Corentin Chary" <corentincj@iksaif.net>
@@ -29,11 +29,11 @@ Description:
29 Some models like the W1N have a LED display that can be 29 Some models like the W1N have a LED display that can be
30 used to display several informations. 30 used to display several informations.
31 To control the LED display, use the following : 31 To control the LED display, use the following :
32 echo 0x0T000DDD > /sys/devices/platform/asus-laptop/ 32 echo 0x0T000DDD > /sys/devices/platform/asus_laptop/
33 where T control the 3 letters display, and DDD the 3 digits display. 33 where T control the 3 letters display, and DDD the 3 digits display.
34 The DDD table can be found in Documentation/laptops/asus-laptop.txt 34 The DDD table can be found in Documentation/laptops/asus-laptop.txt
35 35
36What: /sys/devices/platform/asus-laptop/bluetooth 36What: /sys/devices/platform/asus_laptop/bluetooth
37Date: January 2007 37Date: January 2007
38KernelVersion: 2.6.20 38KernelVersion: 2.6.20
39Contact: "Corentin Chary" <corentincj@iksaif.net> 39Contact: "Corentin Chary" <corentincj@iksaif.net>
@@ -42,7 +42,7 @@ Description:
42 This may control the led, the device or both. 42 This may control the led, the device or both.
43Users: Lapsus 43Users: Lapsus
44 44
45What: /sys/devices/platform/asus-laptop/wlan 45What: /sys/devices/platform/asus_laptop/wlan
46Date: January 2007 46Date: January 2007
47KernelVersion: 2.6.20 47KernelVersion: 2.6.20
48Contact: "Corentin Chary" <corentincj@iksaif.net> 48Contact: "Corentin Chary" <corentincj@iksaif.net>
diff --git a/Documentation/ABI/testing/sysfs-platform-eeepc-laptop b/Documentation/ABI/testing/sysfs-platform-eeepc-laptop
index 7445dfb321b5..5b026c69587a 100644
--- a/Documentation/ABI/testing/sysfs-platform-eeepc-laptop
+++ b/Documentation/ABI/testing/sysfs-platform-eeepc-laptop
@@ -1,4 +1,4 @@
1What: /sys/devices/platform/eeepc-laptop/disp 1What: /sys/devices/platform/eeepc/disp
2Date: May 2008 2Date: May 2008
3KernelVersion: 2.6.26 3KernelVersion: 2.6.26
4Contact: "Corentin Chary" <corentincj@iksaif.net> 4Contact: "Corentin Chary" <corentincj@iksaif.net>
@@ -9,21 +9,21 @@ Description:
9 - 3 = LCD+CRT 9 - 3 = LCD+CRT
10 If you run X11, you should use xrandr instead. 10 If you run X11, you should use xrandr instead.
11 11
12What: /sys/devices/platform/eeepc-laptop/camera 12What: /sys/devices/platform/eeepc/camera
13Date: May 2008 13Date: May 2008
14KernelVersion: 2.6.26 14KernelVersion: 2.6.26
15Contact: "Corentin Chary" <corentincj@iksaif.net> 15Contact: "Corentin Chary" <corentincj@iksaif.net>
16Description: 16Description:
17 Control the camera. 1 means on, 0 means off. 17 Control the camera. 1 means on, 0 means off.
18 18
19What: /sys/devices/platform/eeepc-laptop/cardr 19What: /sys/devices/platform/eeepc/cardr
20Date: May 2008 20Date: May 2008
21KernelVersion: 2.6.26 21KernelVersion: 2.6.26
22Contact: "Corentin Chary" <corentincj@iksaif.net> 22Contact: "Corentin Chary" <corentincj@iksaif.net>
23Description: 23Description:
24 Control the card reader. 1 means on, 0 means off. 24 Control the card reader. 1 means on, 0 means off.
25 25
26What: /sys/devices/platform/eeepc-laptop/cpufv 26What: /sys/devices/platform/eeepc/cpufv
27Date: Jun 2009 27Date: Jun 2009
28KernelVersion: 2.6.31 28KernelVersion: 2.6.31
29Contact: "Corentin Chary" <corentincj@iksaif.net> 29Contact: "Corentin Chary" <corentincj@iksaif.net>
@@ -42,7 +42,7 @@ Description:
42 `------------ Availables modes 42 `------------ Availables modes
43 For example, 0x301 means: mode 1 selected, 3 available modes. 43 For example, 0x301 means: mode 1 selected, 3 available modes.
44 44
45What: /sys/devices/platform/eeepc-laptop/available_cpufv 45What: /sys/devices/platform/eeepc/available_cpufv
46Date: Jun 2009 46Date: Jun 2009
47KernelVersion: 2.6.31 47KernelVersion: 2.6.31
48Contact: "Corentin Chary" <corentincj@iksaif.net> 48Contact: "Corentin Chary" <corentincj@iksaif.net>
diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power
index dcff4d0623ad..d6a801f45b48 100644
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@@ -101,3 +101,16 @@ Description:
101 101
102 CAUTION: Using it will cause your machine's real-time (CMOS) 102 CAUTION: Using it will cause your machine's real-time (CMOS)
103 clock to be set to a random invalid time after a resume. 103 clock to be set to a random invalid time after a resume.
104
105What: /sys/power/pm_async
106Date: January 2009
107Contact: Rafael J. Wysocki <rjw@sisk.pl>
108Description:
109 The /sys/power/pm_async file controls the switch allowing the
110 user space to enable or disable asynchronous suspend and resume
111 of devices. If enabled, this feature will cause some device
112 drivers' suspend and resume callbacks to be executed in parallel
113 with each other and with the main suspend thread. It is enabled
114 if this file contains "1", which is the default. It may be
115 disabled by writing "0" to this file, in which case all devices
116 will be suspended and resumed synchronously.
diff --git a/Documentation/ABI/testing/sysfs-wacom b/Documentation/ABI/testing/sysfs-wacom
new file mode 100644
index 000000000000..1517976e25c4
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-wacom
@@ -0,0 +1,10 @@
1What: /sys/class/hidraw/hidraw*/device/speed
2Date: April 2010
3Kernel Version: 2.6.35
4Contact: linux-bluetooth@vger.kernel.org
5Description:
6 The /sys/class/hidraw/hidraw*/device/speed file controls
7 reporting speed of wacom bluetooth tablet. Reading from
8 this file returns 1 if tablet reports in high speed mode
9 or 0 otherwise. Writing to this file one of these values
10 switches reporting speed.
diff --git a/Documentation/Changes b/Documentation/Changes
index f08b313cd235..eca9f6e6fbe6 100644
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -49,7 +49,7 @@ o oprofile 0.9 # oprofiled --version
49o udev 081 # udevinfo -V 49o udev 081 # udevinfo -V
50o grub 0.93 # grub --version 50o grub 0.93 # grub --version
51o mcelog 0.6 51o mcelog 0.6
52o iptables 1.4.1 # iptables -V 52o iptables 1.4.2 # iptables -V
53 53
54 54
55Kernel compilation 55Kernel compilation
diff --git a/Documentation/PCI/PCI-DMA-mapping.txt b/Documentation/DMA-API-HOWTO.txt
index ecad88d9fe59..98ce51796f71 100644
--- a/Documentation/PCI/PCI-DMA-mapping.txt
+++ b/Documentation/DMA-API-HOWTO.txt
@@ -1,12 +1,12 @@
1 Dynamic DMA mapping 1 Dynamic DMA mapping Guide
2 =================== 2 =========================
3 3
4 David S. Miller <davem@redhat.com> 4 David S. Miller <davem@redhat.com>
5 Richard Henderson <rth@cygnus.com> 5 Richard Henderson <rth@cygnus.com>
6 Jakub Jelinek <jakub@redhat.com> 6 Jakub Jelinek <jakub@redhat.com>
7 7
8This document describes the DMA mapping system in terms of the pci_ 8This is a guide to device driver writers on how to use the DMA API
9API. For a similar API that works for generic devices, see 9with example pseudo-code. For a concise description of the API, see
10DMA-API.txt. 10DMA-API.txt.
11 11
12Most of the 64bit platforms have special hardware that translates bus 12Most of the 64bit platforms have special hardware that translates bus
@@ -26,12 +26,15 @@ mapped only for the time they are actually used and unmapped after the DMA
26transfer. 26transfer.
27 27
28The following API will work of course even on platforms where no such 28The following API will work of course even on platforms where no such
29hardware exists, see e.g. arch/x86/include/asm/pci.h for how it is implemented on 29hardware exists.
30top of the virt_to_bus interface. 30
31Note that the DMA API works with any bus independent of the underlying
32microprocessor architecture. You should use the DMA API rather than
33the bus specific DMA API (e.g. pci_dma_*).
31 34
32First of all, you should make sure 35First of all, you should make sure
33 36
34#include <linux/pci.h> 37#include <linux/dma-mapping.h>
35 38
36is in your driver. This file will obtain for you the definition of the 39is in your driver. This file will obtain for you the definition of the
37dma_addr_t (which can hold any valid DMA address for the platform) 40dma_addr_t (which can hold any valid DMA address for the platform)
@@ -78,44 +81,43 @@ for you to DMA from/to.
78 DMA addressing limitations 81 DMA addressing limitations
79 82
80Does your device have any DMA addressing limitations? For example, is 83Does your device have any DMA addressing limitations? For example, is
81your device only capable of driving the low order 24-bits of address 84your device only capable of driving the low order 24-bits of address?
82on the PCI bus for SAC DMA transfers? If so, you need to inform the 85If so, you need to inform the kernel of this fact.
83PCI layer of this fact.
84 86
85By default, the kernel assumes that your device can address the full 87By default, the kernel assumes that your device can address the full
8632-bits in a SAC cycle. For a 64-bit DAC capable device, this needs 8832-bits. For a 64-bit capable device, this needs to be increased.
87to be increased. And for a device with limitations, as discussed in 89And for a device with limitations, as discussed in the previous
88the previous paragraph, it needs to be decreased. 90paragraph, it needs to be decreased.
89 91
90pci_alloc_consistent() by default will return 32-bit DMA addresses. 92Special note about PCI: PCI-X specification requires PCI-X devices to
91PCI-X specification requires PCI-X devices to support 64-bit 93support 64-bit addressing (DAC) for all transactions. And at least
92addressing (DAC) for all transactions. And at least one platform (SGI 94one platform (SGI SN2) requires 64-bit consistent allocations to
93SN2) requires 64-bit consistent allocations to operate correctly when 95operate correctly when the IO bus is in PCI-X mode.
94the IO bus is in PCI-X mode. Therefore, like with pci_set_dma_mask(), 96
95it's good practice to call pci_set_consistent_dma_mask() to set the 97For correct operation, you must interrogate the kernel in your device
96appropriate mask even if your device only supports 32-bit DMA 98probe routine to see if the DMA controller on the machine can properly
97(default) and especially if it's a PCI-X device. 99support the DMA addressing limitation your device has. It is good
98 100style to do this even if your device holds the default setting,
99For correct operation, you must interrogate the PCI layer in your
100device probe routine to see if the PCI controller on the machine can
101properly support the DMA addressing limitation your device has. It is
102good style to do this even if your device holds the default setting,
103because this shows that you did think about these issues wrt. your 101because this shows that you did think about these issues wrt. your
104device. 102device.
105 103
106The query is performed via a call to pci_set_dma_mask(): 104The query is performed via a call to dma_set_mask():
107 105
108 int pci_set_dma_mask(struct pci_dev *pdev, u64 device_mask); 106 int dma_set_mask(struct device *dev, u64 mask);
109 107
110The query for consistent allocations is performed via a call to 108The query for consistent allocations is performed via a call to
111pci_set_consistent_dma_mask(): 109dma_set_coherent_mask():
112 110
113 int pci_set_consistent_dma_mask(struct pci_dev *pdev, u64 device_mask); 111 int dma_set_coherent_mask(struct device *dev, u64 mask);
114 112
115Here, pdev is a pointer to the PCI device struct of your device, and 113Here, dev is a pointer to the device struct of your device, and mask
116device_mask is a bit mask describing which bits of a PCI address your 114is a bit mask describing which bits of an address your device
117device supports. It returns zero if your card can perform DMA 115supports. It returns zero if your card can perform DMA properly on
118properly on the machine given the address mask you provided. 116the machine given the address mask you provided. In general, the
117device struct of your device is embedded in the bus specific device
118struct of your device. For example, a pointer to the device struct of
119your PCI device is pdev->dev (pdev is a pointer to the PCI device
120struct of your device).
119 121
120If it returns non-zero, your device cannot perform DMA properly on 122If it returns non-zero, your device cannot perform DMA properly on
121this platform, and attempting to do so will result in undefined 123this platform, and attempting to do so will result in undefined
@@ -133,31 +135,30 @@ of your driver reports that performance is bad or that the device is not
133even detected, you can ask them for the kernel messages to find out 135even detected, you can ask them for the kernel messages to find out
134exactly why. 136exactly why.
135 137
136The standard 32-bit addressing PCI device would do something like 138The standard 32-bit addressing device would do something like this:
137this:
138 139
139 if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { 140 if (dma_set_mask(dev, DMA_BIT_MASK(32))) {
140 printk(KERN_WARNING 141 printk(KERN_WARNING
141 "mydev: No suitable DMA available.\n"); 142 "mydev: No suitable DMA available.\n");
142 goto ignore_this_device; 143 goto ignore_this_device;
143 } 144 }
144 145
145Another common scenario is a 64-bit capable device. The approach 146Another common scenario is a 64-bit capable device. The approach here
146here is to try for 64-bit DAC addressing, but back down to a 147is to try for 64-bit addressing, but back down to a 32-bit mask that
14732-bit mask should that fail. The PCI platform code may fail the 148should not fail. The kernel may fail the 64-bit mask not because the
14864-bit mask not because the platform is not capable of 64-bit 149platform is not capable of 64-bit addressing. Rather, it may fail in
149addressing. Rather, it may fail in this case simply because 150this case simply because 32-bit addressing is done more efficiently
15032-bit SAC addressing is done more efficiently than DAC addressing. 151than 64-bit addressing. For example, Sparc64 PCI SAC addressing is
151Sparc64 is one platform which behaves in this way. 152more efficient than DAC addressing.
152 153
153Here is how you would handle a 64-bit capable device which can drive 154Here is how you would handle a 64-bit capable device which can drive
154all 64-bits when accessing streaming DMA: 155all 64-bits when accessing streaming DMA:
155 156
156 int using_dac; 157 int using_dac;
157 158
158 if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { 159 if (!dma_set_mask(dev, DMA_BIT_MASK(64))) {
159 using_dac = 1; 160 using_dac = 1;
160 } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { 161 } else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) {
161 using_dac = 0; 162 using_dac = 0;
162 } else { 163 } else {
163 printk(KERN_WARNING 164 printk(KERN_WARNING
@@ -170,36 +171,36 @@ the case would look like this:
170 171
171 int using_dac, consistent_using_dac; 172 int using_dac, consistent_using_dac;
172 173
173 if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { 174 if (!dma_set_mask(dev, DMA_BIT_MASK(64))) {
174 using_dac = 1; 175 using_dac = 1;
175 consistent_using_dac = 1; 176 consistent_using_dac = 1;
176 pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); 177 dma_set_coherent_mask(dev, DMA_BIT_MASK(64));
177 } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { 178 } else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) {
178 using_dac = 0; 179 using_dac = 0;
179 consistent_using_dac = 0; 180 consistent_using_dac = 0;
180 pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); 181 dma_set_coherent_mask(dev, DMA_BIT_MASK(32));
181 } else { 182 } else {
182 printk(KERN_WARNING 183 printk(KERN_WARNING
183 "mydev: No suitable DMA available.\n"); 184 "mydev: No suitable DMA available.\n");
184 goto ignore_this_device; 185 goto ignore_this_device;
185 } 186 }
186 187
187pci_set_consistent_dma_mask() will always be able to set the same or a 188dma_set_coherent_mask() will always be able to set the same or a
188smaller mask as pci_set_dma_mask(). However for the rare case that a 189smaller mask as dma_set_mask(). However for the rare case that a
189device driver only uses consistent allocations, one would have to 190device driver only uses consistent allocations, one would have to
190check the return value from pci_set_consistent_dma_mask(). 191check the return value from dma_set_coherent_mask().
191 192
192Finally, if your device can only drive the low 24-bits of 193Finally, if your device can only drive the low 24-bits of
193address during PCI bus mastering you might do something like: 194address you might do something like:
194 195
195 if (pci_set_dma_mask(pdev, DMA_BIT_MASK(24))) { 196 if (dma_set_mask(dev, DMA_BIT_MASK(24))) {
196 printk(KERN_WARNING 197 printk(KERN_WARNING
197 "mydev: 24-bit DMA addressing not available.\n"); 198 "mydev: 24-bit DMA addressing not available.\n");
198 goto ignore_this_device; 199 goto ignore_this_device;
199 } 200 }
200 201
201When pci_set_dma_mask() is successful, and returns zero, the PCI layer 202When dma_set_mask() is successful, and returns zero, the kernel saves
202saves away this mask you have provided. The PCI layer will use this 203away this mask you have provided. The kernel will use this
203information later when you make DMA mappings. 204information later when you make DMA mappings.
204 205
205There is a case which we are aware of at this time, which is worth 206There is a case which we are aware of at this time, which is worth
@@ -208,7 +209,7 @@ functions (for example a sound card provides playback and record
208functions) and the various different functions have _different_ 209functions) and the various different functions have _different_
209DMA addressing limitations, you may wish to probe each mask and 210DMA addressing limitations, you may wish to probe each mask and
210only provide the functionality which the machine can handle. It 211only provide the functionality which the machine can handle. It
211is important that the last call to pci_set_dma_mask() be for the 212is important that the last call to dma_set_mask() be for the
212most specific mask. 213most specific mask.
213 214
214Here is pseudo-code showing how this might be done: 215Here is pseudo-code showing how this might be done:
@@ -217,17 +218,17 @@ Here is pseudo-code showing how this might be done:
217 #define RECORD_ADDRESS_BITS DMA_BIT_MASK(24) 218 #define RECORD_ADDRESS_BITS DMA_BIT_MASK(24)
218 219
219 struct my_sound_card *card; 220 struct my_sound_card *card;
220 struct pci_dev *pdev; 221 struct device *dev;
221 222
222 ... 223 ...
223 if (!pci_set_dma_mask(pdev, PLAYBACK_ADDRESS_BITS)) { 224 if (!dma_set_mask(dev, PLAYBACK_ADDRESS_BITS)) {
224 card->playback_enabled = 1; 225 card->playback_enabled = 1;
225 } else { 226 } else {
226 card->playback_enabled = 0; 227 card->playback_enabled = 0;
227 printk(KERN_WARNING "%s: Playback disabled due to DMA limitations.\n", 228 printk(KERN_WARNING "%s: Playback disabled due to DMA limitations.\n",
228 card->name); 229 card->name);
229 } 230 }
230 if (!pci_set_dma_mask(pdev, RECORD_ADDRESS_BITS)) { 231 if (!dma_set_mask(dev, RECORD_ADDRESS_BITS)) {
231 card->record_enabled = 1; 232 card->record_enabled = 1;
232 } else { 233 } else {
233 card->record_enabled = 0; 234 card->record_enabled = 0;
@@ -252,8 +253,8 @@ There are two types of DMA mappings:
252 Think of "consistent" as "synchronous" or "coherent". 253 Think of "consistent" as "synchronous" or "coherent".
253 254
254 The current default is to return consistent memory in the low 32 255 The current default is to return consistent memory in the low 32
255 bits of the PCI bus space. However, for future compatibility you 256 bits of the bus space. However, for future compatibility you should
256 should set the consistent mask even if this default is fine for your 257 set the consistent mask even if this default is fine for your
257 driver. 258 driver.
258 259
259 Good examples of what to use consistent mappings for are: 260 Good examples of what to use consistent mappings for are:
@@ -285,9 +286,9 @@ There are two types of DMA mappings:
285 found in PCI bridges (such as by reading a register's value 286 found in PCI bridges (such as by reading a register's value
286 after writing it). 287 after writing it).
287 288
288- Streaming DMA mappings which are usually mapped for one DMA transfer, 289- Streaming DMA mappings which are usually mapped for one DMA
289 unmapped right after it (unless you use pci_dma_sync_* below) and for which 290 transfer, unmapped right after it (unless you use dma_sync_* below)
290 hardware can optimize for sequential accesses. 291 and for which hardware can optimize for sequential accesses.
291 292
292 This of "streaming" as "asynchronous" or "outside the coherency 293 This of "streaming" as "asynchronous" or "outside the coherency
293 domain". 294 domain".
@@ -302,8 +303,8 @@ There are two types of DMA mappings:
302 optimizations the hardware allows. To this end, when using 303 optimizations the hardware allows. To this end, when using
303 such mappings you must be explicit about what you want to happen. 304 such mappings you must be explicit about what you want to happen.
304 305
305Neither type of DMA mapping has alignment restrictions that come 306Neither type of DMA mapping has alignment restrictions that come from
306from PCI, although some devices may have such restrictions. 307the underlying bus, although some devices may have such restrictions.
307Also, systems with caches that aren't DMA-coherent will work better 308Also, systems with caches that aren't DMA-coherent will work better
308when the underlying buffers don't share cache lines with other data. 309when the underlying buffers don't share cache lines with other data.
309 310
@@ -315,33 +316,27 @@ you should do:
315 316
316 dma_addr_t dma_handle; 317 dma_addr_t dma_handle;
317 318
318 cpu_addr = pci_alloc_consistent(pdev, size, &dma_handle); 319 cpu_addr = dma_alloc_coherent(dev, size, &dma_handle, gfp);
319
320where pdev is a struct pci_dev *. This may be called in interrupt context.
321You should use dma_alloc_coherent (see DMA-API.txt) for buses
322where devices don't have struct pci_dev (like ISA, EISA).
323 320
324This argument is needed because the DMA translations may be bus 321where device is a struct device *. This may be called in interrupt
325specific (and often is private to the bus which the device is attached 322context with the GFP_ATOMIC flag.
326to).
327 323
328Size is the length of the region you want to allocate, in bytes. 324Size is the length of the region you want to allocate, in bytes.
329 325
330This routine will allocate RAM for that region, so it acts similarly to 326This routine will allocate RAM for that region, so it acts similarly to
331__get_free_pages (but takes size instead of a page order). If your 327__get_free_pages (but takes size instead of a page order). If your
332driver needs regions sized smaller than a page, you may prefer using 328driver needs regions sized smaller than a page, you may prefer using
333the pci_pool interface, described below. 329the dma_pool interface, described below.
334 330
335The consistent DMA mapping interfaces, for non-NULL pdev, will by 331The consistent DMA mapping interfaces, for non-NULL dev, will by
336default return a DMA address which is SAC (Single Address Cycle) 332default return a DMA address which is 32-bit addressable. Even if the
337addressable. Even if the device indicates (via PCI dma mask) that it 333device indicates (via DMA mask) that it may address the upper 32-bits,
338may address the upper 32-bits and thus perform DAC cycles, consistent 334consistent allocation will only return > 32-bit addresses for DMA if
339allocation will only return > 32-bit PCI addresses for DMA if the 335the consistent DMA mask has been explicitly changed via
340consistent dma mask has been explicitly changed via 336dma_set_coherent_mask(). This is true of the dma_pool interface as
341pci_set_consistent_dma_mask(). This is true of the pci_pool interface 337well.
342as well. 338
343 339dma_alloc_coherent returns two values: the virtual address which you
344pci_alloc_consistent returns two values: the virtual address which you
345can use to access it from the CPU and dma_handle which you pass to the 340can use to access it from the CPU and dma_handle which you pass to the
346card. 341card.
347 342
@@ -354,54 +349,54 @@ buffer you receive will not cross a 64K boundary.
354 349
355To unmap and free such a DMA region, you call: 350To unmap and free such a DMA region, you call:
356 351
357 pci_free_consistent(pdev, size, cpu_addr, dma_handle); 352 dma_free_coherent(dev, size, cpu_addr, dma_handle);
358 353
359where pdev, size are the same as in the above call and cpu_addr and 354where dev, size are the same as in the above call and cpu_addr and
360dma_handle are the values pci_alloc_consistent returned to you. 355dma_handle are the values dma_alloc_coherent returned to you.
361This function may not be called in interrupt context. 356This function may not be called in interrupt context.
362 357
363If your driver needs lots of smaller memory regions, you can write 358If your driver needs lots of smaller memory regions, you can write
364custom code to subdivide pages returned by pci_alloc_consistent, 359custom code to subdivide pages returned by dma_alloc_coherent,
365or you can use the pci_pool API to do that. A pci_pool is like 360or you can use the dma_pool API to do that. A dma_pool is like
366a kmem_cache, but it uses pci_alloc_consistent not __get_free_pages. 361a kmem_cache, but it uses dma_alloc_coherent not __get_free_pages.
367Also, it understands common hardware constraints for alignment, 362Also, it understands common hardware constraints for alignment,
368like queue heads needing to be aligned on N byte boundaries. 363like queue heads needing to be aligned on N byte boundaries.
369 364
370Create a pci_pool like this: 365Create a dma_pool like this:
371 366
372 struct pci_pool *pool; 367 struct dma_pool *pool;
373 368
374 pool = pci_pool_create(name, pdev, size, align, alloc); 369 pool = dma_pool_create(name, dev, size, align, alloc);
375 370
376The "name" is for diagnostics (like a kmem_cache name); pdev and size 371The "name" is for diagnostics (like a kmem_cache name); dev and size
377are as above. The device's hardware alignment requirement for this 372are as above. The device's hardware alignment requirement for this
378type of data is "align" (which is expressed in bytes, and must be a 373type of data is "align" (which is expressed in bytes, and must be a
379power of two). If your device has no boundary crossing restrictions, 374power of two). If your device has no boundary crossing restrictions,
380pass 0 for alloc; passing 4096 says memory allocated from this pool 375pass 0 for alloc; passing 4096 says memory allocated from this pool
381must not cross 4KByte boundaries (but at that time it may be better to 376must not cross 4KByte boundaries (but at that time it may be better to
382go for pci_alloc_consistent directly instead). 377go for dma_alloc_coherent directly instead).
383 378
384Allocate memory from a pci pool like this: 379Allocate memory from a dma pool like this:
385 380
386 cpu_addr = pci_pool_alloc(pool, flags, &dma_handle); 381 cpu_addr = dma_pool_alloc(pool, flags, &dma_handle);
387 382
388flags are SLAB_KERNEL if blocking is permitted (not in_interrupt nor 383flags are SLAB_KERNEL if blocking is permitted (not in_interrupt nor
389holding SMP locks), SLAB_ATOMIC otherwise. Like pci_alloc_consistent, 384holding SMP locks), SLAB_ATOMIC otherwise. Like dma_alloc_coherent,
390this returns two values, cpu_addr and dma_handle. 385this returns two values, cpu_addr and dma_handle.
391 386
392Free memory that was allocated from a pci_pool like this: 387Free memory that was allocated from a dma_pool like this:
393 388
394 pci_pool_free(pool, cpu_addr, dma_handle); 389 dma_pool_free(pool, cpu_addr, dma_handle);
395 390
396where pool is what you passed to pci_pool_alloc, and cpu_addr and 391where pool is what you passed to dma_pool_alloc, and cpu_addr and
397dma_handle are the values pci_pool_alloc returned. This function 392dma_handle are the values dma_pool_alloc returned. This function
398may be called in interrupt context. 393may be called in interrupt context.
399 394
400Destroy a pci_pool by calling: 395Destroy a dma_pool by calling:
401 396
402 pci_pool_destroy(pool); 397 dma_pool_destroy(pool);
403 398
404Make sure you've called pci_pool_free for all memory allocated 399Make sure you've called dma_pool_free for all memory allocated
405from a pool before you destroy the pool. This function may not 400from a pool before you destroy the pool. This function may not
406be called in interrupt context. 401be called in interrupt context.
407 402
@@ -411,15 +406,15 @@ The interfaces described in subsequent portions of this document
411take a DMA direction argument, which is an integer and takes on 406take a DMA direction argument, which is an integer and takes on
412one of the following values: 407one of the following values:
413 408
414 PCI_DMA_BIDIRECTIONAL 409 DMA_BIDIRECTIONAL
415 PCI_DMA_TODEVICE 410 DMA_TO_DEVICE
416 PCI_DMA_FROMDEVICE 411 DMA_FROM_DEVICE
417 PCI_DMA_NONE 412 DMA_NONE
418 413
419One should provide the exact DMA direction if you know it. 414One should provide the exact DMA direction if you know it.
420 415
421PCI_DMA_TODEVICE means "from main memory to the PCI device" 416DMA_TO_DEVICE means "from main memory to the device"
422PCI_DMA_FROMDEVICE means "from the PCI device to main memory" 417DMA_FROM_DEVICE means "from the device to main memory"
423It is the direction in which the data moves during the DMA 418It is the direction in which the data moves during the DMA
424transfer. 419transfer.
425 420
@@ -427,12 +422,12 @@ You are _strongly_ encouraged to specify this as precisely
427as you possibly can. 422as you possibly can.
428 423
429If you absolutely cannot know the direction of the DMA transfer, 424If you absolutely cannot know the direction of the DMA transfer,
430specify PCI_DMA_BIDIRECTIONAL. It means that the DMA can go in 425specify DMA_BIDIRECTIONAL. It means that the DMA can go in
431either direction. The platform guarantees that you may legally 426either direction. The platform guarantees that you may legally
432specify this, and that it will work, but this may be at the 427specify this, and that it will work, but this may be at the
433cost of performance for example. 428cost of performance for example.
434 429
435The value PCI_DMA_NONE is to be used for debugging. One can 430The value DMA_NONE is to be used for debugging. One can
436hold this in a data structure before you come to know the 431hold this in a data structure before you come to know the
437precise direction, and this will help catch cases where your 432precise direction, and this will help catch cases where your
438direction tracking logic has failed to set things up properly. 433direction tracking logic has failed to set things up properly.
@@ -442,21 +437,21 @@ potential platform-specific optimizations of such) is for debugging.
442Some platforms actually have a write permission boolean which DMA 437Some platforms actually have a write permission boolean which DMA
443mappings can be marked with, much like page protections in the user 438mappings can be marked with, much like page protections in the user
444program address space. Such platforms can and do report errors in the 439program address space. Such platforms can and do report errors in the
445kernel logs when the PCI controller hardware detects violation of the 440kernel logs when the DMA controller hardware detects violation of the
446permission setting. 441permission setting.
447 442
448Only streaming mappings specify a direction, consistent mappings 443Only streaming mappings specify a direction, consistent mappings
449implicitly have a direction attribute setting of 444implicitly have a direction attribute setting of
450PCI_DMA_BIDIRECTIONAL. 445DMA_BIDIRECTIONAL.
451 446
452The SCSI subsystem tells you the direction to use in the 447The SCSI subsystem tells you the direction to use in the
453'sc_data_direction' member of the SCSI command your driver is 448'sc_data_direction' member of the SCSI command your driver is
454working on. 449working on.
455 450
456For Networking drivers, it's a rather simple affair. For transmit 451For Networking drivers, it's a rather simple affair. For transmit
457packets, map/unmap them with the PCI_DMA_TODEVICE direction 452packets, map/unmap them with the DMA_TO_DEVICE direction
458specifier. For receive packets, just the opposite, map/unmap them 453specifier. For receive packets, just the opposite, map/unmap them
459with the PCI_DMA_FROMDEVICE direction specifier. 454with the DMA_FROM_DEVICE direction specifier.
460 455
461 Using Streaming DMA mappings 456 Using Streaming DMA mappings
462 457
@@ -467,43 +462,43 @@ scatterlist.
467 462
468To map a single region, you do: 463To map a single region, you do:
469 464
470 struct pci_dev *pdev = mydev->pdev; 465 struct device *dev = &my_dev->dev;
471 dma_addr_t dma_handle; 466 dma_addr_t dma_handle;
472 void *addr = buffer->ptr; 467 void *addr = buffer->ptr;
473 size_t size = buffer->len; 468 size_t size = buffer->len;
474 469
475 dma_handle = pci_map_single(pdev, addr, size, direction); 470 dma_handle = dma_map_single(dev, addr, size, direction);
476 471
477and to unmap it: 472and to unmap it:
478 473
479 pci_unmap_single(pdev, dma_handle, size, direction); 474 dma_unmap_single(dev, dma_handle, size, direction);
480 475
481You should call pci_unmap_single when the DMA activity is finished, e.g. 476You should call dma_unmap_single when the DMA activity is finished, e.g.
482from the interrupt which told you that the DMA transfer is done. 477from the interrupt which told you that the DMA transfer is done.
483 478
484Using cpu pointers like this for single mappings has a disadvantage, 479Using cpu pointers like this for single mappings has a disadvantage,
485you cannot reference HIGHMEM memory in this way. Thus, there is a 480you cannot reference HIGHMEM memory in this way. Thus, there is a
486map/unmap interface pair akin to pci_{map,unmap}_single. These 481map/unmap interface pair akin to dma_{map,unmap}_single. These
487interfaces deal with page/offset pairs instead of cpu pointers. 482interfaces deal with page/offset pairs instead of cpu pointers.
488Specifically: 483Specifically:
489 484
490 struct pci_dev *pdev = mydev->pdev; 485 struct device *dev = &my_dev->dev;
491 dma_addr_t dma_handle; 486 dma_addr_t dma_handle;
492 struct page *page = buffer->page; 487 struct page *page = buffer->page;
493 unsigned long offset = buffer->offset; 488 unsigned long offset = buffer->offset;
494 size_t size = buffer->len; 489 size_t size = buffer->len;
495 490
496 dma_handle = pci_map_page(pdev, page, offset, size, direction); 491 dma_handle = dma_map_page(dev, page, offset, size, direction);
497 492
498 ... 493 ...
499 494
500 pci_unmap_page(pdev, dma_handle, size, direction); 495 dma_unmap_page(dev, dma_handle, size, direction);
501 496
502Here, "offset" means byte offset within the given page. 497Here, "offset" means byte offset within the given page.
503 498
504With scatterlists, you map a region gathered from several regions by: 499With scatterlists, you map a region gathered from several regions by:
505 500
506 int i, count = pci_map_sg(pdev, sglist, nents, direction); 501 int i, count = dma_map_sg(dev, sglist, nents, direction);
507 struct scatterlist *sg; 502 struct scatterlist *sg;
508 503
509 for_each_sg(sglist, sg, count, i) { 504 for_each_sg(sglist, sg, count, i) {
@@ -527,16 +522,16 @@ accessed sg->address and sg->length as shown above.
527 522
528To unmap a scatterlist, just call: 523To unmap a scatterlist, just call:
529 524
530 pci_unmap_sg(pdev, sglist, nents, direction); 525 dma_unmap_sg(dev, sglist, nents, direction);
531 526
532Again, make sure DMA activity has already finished. 527Again, make sure DMA activity has already finished.
533 528
534PLEASE NOTE: The 'nents' argument to the pci_unmap_sg call must be 529PLEASE NOTE: The 'nents' argument to the dma_unmap_sg call must be
535 the _same_ one you passed into the pci_map_sg call, 530 the _same_ one you passed into the dma_map_sg call,
536 it should _NOT_ be the 'count' value _returned_ from the 531 it should _NOT_ be the 'count' value _returned_ from the
537 pci_map_sg call. 532 dma_map_sg call.
538 533
539Every pci_map_{single,sg} call should have its pci_unmap_{single,sg} 534Every dma_map_{single,sg} call should have its dma_unmap_{single,sg}
540counterpart, because the bus address space is a shared resource (although 535counterpart, because the bus address space is a shared resource (although
541in some ports the mapping is per each BUS so less devices contend for the 536in some ports the mapping is per each BUS so less devices contend for the
542same bus address space) and you could render the machine unusable by eating 537same bus address space) and you could render the machine unusable by eating
@@ -547,14 +542,14 @@ the data in between the DMA transfers, the buffer needs to be synced
547properly in order for the cpu and device to see the most uptodate and 542properly in order for the cpu and device to see the most uptodate and
548correct copy of the DMA buffer. 543correct copy of the DMA buffer.
549 544
550So, firstly, just map it with pci_map_{single,sg}, and after each DMA 545So, firstly, just map it with dma_map_{single,sg}, and after each DMA
551transfer call either: 546transfer call either:
552 547
553 pci_dma_sync_single_for_cpu(pdev, dma_handle, size, direction); 548 dma_sync_single_for_cpu(dev, dma_handle, size, direction);
554 549
555or: 550or:
556 551
557 pci_dma_sync_sg_for_cpu(pdev, sglist, nents, direction); 552 dma_sync_sg_for_cpu(dev, sglist, nents, direction);
558 553
559as appropriate. 554as appropriate.
560 555
@@ -562,27 +557,27 @@ Then, if you wish to let the device get at the DMA area again,
562finish accessing the data with the cpu, and then before actually 557finish accessing the data with the cpu, and then before actually
563giving the buffer to the hardware call either: 558giving the buffer to the hardware call either:
564 559
565 pci_dma_sync_single_for_device(pdev, dma_handle, size, direction); 560 dma_sync_single_for_device(dev, dma_handle, size, direction);
566 561
567or: 562or:
568 563
569 pci_dma_sync_sg_for_device(dev, sglist, nents, direction); 564 dma_sync_sg_for_device(dev, sglist, nents, direction);
570 565
571as appropriate. 566as appropriate.
572 567
573After the last DMA transfer call one of the DMA unmap routines 568After the last DMA transfer call one of the DMA unmap routines
574pci_unmap_{single,sg}. If you don't touch the data from the first pci_map_* 569dma_unmap_{single,sg}. If you don't touch the data from the first dma_map_*
575call till pci_unmap_*, then you don't have to call the pci_dma_sync_* 570call till dma_unmap_*, then you don't have to call the dma_sync_*
576routines at all. 571routines at all.
577 572
578Here is pseudo code which shows a situation in which you would need 573Here is pseudo code which shows a situation in which you would need
579to use the pci_dma_sync_*() interfaces. 574to use the dma_sync_*() interfaces.
580 575
581 my_card_setup_receive_buffer(struct my_card *cp, char *buffer, int len) 576 my_card_setup_receive_buffer(struct my_card *cp, char *buffer, int len)
582 { 577 {
583 dma_addr_t mapping; 578 dma_addr_t mapping;
584 579
585 mapping = pci_map_single(cp->pdev, buffer, len, PCI_DMA_FROMDEVICE); 580 mapping = dma_map_single(cp->dev, buffer, len, DMA_FROM_DEVICE);
586 581
587 cp->rx_buf = buffer; 582 cp->rx_buf = buffer;
588 cp->rx_len = len; 583 cp->rx_len = len;
@@ -606,25 +601,25 @@ to use the pci_dma_sync_*() interfaces.
606 * the DMA transfer with the CPU first 601 * the DMA transfer with the CPU first
607 * so that we see updated contents. 602 * so that we see updated contents.
608 */ 603 */
609 pci_dma_sync_single_for_cpu(cp->pdev, cp->rx_dma, 604 dma_sync_single_for_cpu(&cp->dev, cp->rx_dma,
610 cp->rx_len, 605 cp->rx_len,
611 PCI_DMA_FROMDEVICE); 606 DMA_FROM_DEVICE);
612 607
613 /* Now it is safe to examine the buffer. */ 608 /* Now it is safe to examine the buffer. */
614 hp = (struct my_card_header *) cp->rx_buf; 609 hp = (struct my_card_header *) cp->rx_buf;
615 if (header_is_ok(hp)) { 610 if (header_is_ok(hp)) {
616 pci_unmap_single(cp->pdev, cp->rx_dma, cp->rx_len, 611 dma_unmap_single(&cp->dev, cp->rx_dma, cp->rx_len,
617 PCI_DMA_FROMDEVICE); 612 DMA_FROM_DEVICE);
618 pass_to_upper_layers(cp->rx_buf); 613 pass_to_upper_layers(cp->rx_buf);
619 make_and_setup_new_rx_buf(cp); 614 make_and_setup_new_rx_buf(cp);
620 } else { 615 } else {
621 /* Just sync the buffer and give it back 616 /* Just sync the buffer and give it back
622 * to the card. 617 * to the card.
623 */ 618 */
624 pci_dma_sync_single_for_device(cp->pdev, 619 dma_sync_single_for_device(&cp->dev,
625 cp->rx_dma, 620 cp->rx_dma,
626 cp->rx_len, 621 cp->rx_len,
627 PCI_DMA_FROMDEVICE); 622 DMA_FROM_DEVICE);
628 give_rx_buf_to_card(cp); 623 give_rx_buf_to_card(cp);
629 } 624 }
630 } 625 }
@@ -634,19 +629,49 @@ Drivers converted fully to this interface should not use virt_to_bus any
634longer, nor should they use bus_to_virt. Some drivers have to be changed a 629longer, nor should they use bus_to_virt. Some drivers have to be changed a
635little bit, because there is no longer an equivalent to bus_to_virt in the 630little bit, because there is no longer an equivalent to bus_to_virt in the
636dynamic DMA mapping scheme - you have to always store the DMA addresses 631dynamic DMA mapping scheme - you have to always store the DMA addresses
637returned by the pci_alloc_consistent, pci_pool_alloc, and pci_map_single 632returned by the dma_alloc_coherent, dma_pool_alloc, and dma_map_single
638calls (pci_map_sg stores them in the scatterlist itself if the platform 633calls (dma_map_sg stores them in the scatterlist itself if the platform
639supports dynamic DMA mapping in hardware) in your driver structures and/or 634supports dynamic DMA mapping in hardware) in your driver structures and/or
640in the card registers. 635in the card registers.
641 636
642All PCI drivers should be using these interfaces with no exceptions. 637All drivers should be using these interfaces with no exceptions. It
643It is planned to completely remove virt_to_bus() and bus_to_virt() as 638is planned to completely remove virt_to_bus() and bus_to_virt() as
644they are entirely deprecated. Some ports already do not provide these 639they are entirely deprecated. Some ports already do not provide these
645as it is impossible to correctly support them. 640as it is impossible to correctly support them.
646 641
642 Handling Errors
643
644DMA address space is limited on some architectures and an allocation
645failure can be determined by:
646
647- checking if dma_alloc_coherent returns NULL or dma_map_sg returns 0
648
649- checking the returned dma_addr_t of dma_map_single and dma_map_page
650 by using dma_mapping_error():
651
652 dma_addr_t dma_handle;
653
654 dma_handle = dma_map_single(dev, addr, size, direction);
655 if (dma_mapping_error(dev, dma_handle)) {
656 /*
657 * reduce current DMA mapping usage,
658 * delay and try again later or
659 * reset driver.
660 */
661 }
662
663Networking drivers must call dev_kfree_skb to free the socket buffer
664and return NETDEV_TX_OK if the DMA mapping fails on the transmit hook
665(ndo_start_xmit). This means that the socket buffer is just dropped in
666the failure case.
667
668SCSI drivers must return SCSI_MLQUEUE_HOST_BUSY if the DMA mapping
669fails in the queuecommand hook. This means that the SCSI subsystem
670passes the command to the driver again later.
671
647 Optimizing Unmap State Space Consumption 672 Optimizing Unmap State Space Consumption
648 673
649On many platforms, pci_unmap_{single,page}() is simply a nop. 674On many platforms, dma_unmap_{single,page}() is simply a nop.
650Therefore, keeping track of the mapping address and length is a waste 675Therefore, keeping track of the mapping address and length is a waste
651of space. Instead of filling your drivers up with ifdefs and the like 676of space. Instead of filling your drivers up with ifdefs and the like
652to "work around" this (which would defeat the whole purpose of a 677to "work around" this (which would defeat the whole purpose of a
@@ -655,7 +680,7 @@ portable API) the following facilities are provided.
655Actually, instead of describing the macros one by one, we'll 680Actually, instead of describing the macros one by one, we'll
656transform some example code. 681transform some example code.
657 682
6581) Use DECLARE_PCI_UNMAP_{ADDR,LEN} in state saving structures. 6831) Use DEFINE_DMA_UNMAP_{ADDR,LEN} in state saving structures.
659 Example, before: 684 Example, before:
660 685
661 struct ring_state { 686 struct ring_state {
@@ -668,14 +693,11 @@ transform some example code.
668 693
669 struct ring_state { 694 struct ring_state {
670 struct sk_buff *skb; 695 struct sk_buff *skb;
671 DECLARE_PCI_UNMAP_ADDR(mapping) 696 DEFINE_DMA_UNMAP_ADDR(mapping);
672 DECLARE_PCI_UNMAP_LEN(len) 697 DEFINE_DMA_UNMAP_LEN(len);
673 }; 698 };
674 699
675 NOTE: DO NOT put a semicolon at the end of the DECLARE_*() 7002) Use dma_unmap_{addr,len}_set to set these values.
676 macro.
677
6782) Use pci_unmap_{addr,len}_set to set these values.
679 Example, before: 701 Example, before:
680 702
681 ringp->mapping = FOO; 703 ringp->mapping = FOO;
@@ -683,21 +705,21 @@ transform some example code.
683 705
684 after: 706 after:
685 707
686 pci_unmap_addr_set(ringp, mapping, FOO); 708 dma_unmap_addr_set(ringp, mapping, FOO);
687 pci_unmap_len_set(ringp, len, BAR); 709 dma_unmap_len_set(ringp, len, BAR);
688 710
6893) Use pci_unmap_{addr,len} to access these values. 7113) Use dma_unmap_{addr,len} to access these values.
690 Example, before: 712 Example, before:
691 713
692 pci_unmap_single(pdev, ringp->mapping, ringp->len, 714 dma_unmap_single(dev, ringp->mapping, ringp->len,
693 PCI_DMA_FROMDEVICE); 715 DMA_FROM_DEVICE);
694 716
695 after: 717 after:
696 718
697 pci_unmap_single(pdev, 719 dma_unmap_single(dev,
698 pci_unmap_addr(ringp, mapping), 720 dma_unmap_addr(ringp, mapping),
699 pci_unmap_len(ringp, len), 721 dma_unmap_len(ringp, len),
700 PCI_DMA_FROMDEVICE); 722 DMA_FROM_DEVICE);
701 723
702It really should be self-explanatory. We treat the ADDR and LEN 724It really should be self-explanatory. We treat the ADDR and LEN
703separately, because it is possible for an implementation to only 725separately, because it is possible for an implementation to only
@@ -711,46 +733,29 @@ to "Closing".
711 733
7121) Struct scatterlist requirements. 7341) Struct scatterlist requirements.
713 735
714 Struct scatterlist must contain, at a minimum, the following 736 Don't invent the architecture specific struct scatterlist; just use
715 members: 737 <asm-generic/scatterlist.h>. You need to enable
716 738 CONFIG_NEED_SG_DMA_LENGTH if the architecture supports IOMMUs
717 struct page *page; 739 (including software IOMMU).
718 unsigned int offset;
719 unsigned int length;
720
721 The base address is specified by a "page+offset" pair.
722
723 Previous versions of struct scatterlist contained a "void *address"
724 field that was sometimes used instead of page+offset. As of Linux
725 2.5., page+offset is always used, and the "address" field has been
726 deleted.
727
7282) More to come...
729
730 Handling Errors
731
732DMA address space is limited on some architectures and an allocation
733failure can be determined by:
734 740
735- checking if pci_alloc_consistent returns NULL or pci_map_sg returns 0 7412) ARCH_KMALLOC_MINALIGN
736 742
737- checking the returned dma_addr_t of pci_map_single and pci_map_page 743 Architectures must ensure that kmalloc'ed buffer is
738 by using pci_dma_mapping_error(): 744 DMA-safe. Drivers and subsystems depend on it. If an architecture
745 isn't fully DMA-coherent (i.e. hardware doesn't ensure that data in
746 the CPU cache is identical to data in main memory),
747 ARCH_KMALLOC_MINALIGN must be set so that the memory allocator
748 makes sure that kmalloc'ed buffer doesn't share a cache line with
749 the others. See arch/arm/include/asm/cache.h as an example.
739 750
740 dma_addr_t dma_handle; 751 Note that ARCH_KMALLOC_MINALIGN is about DMA memory alignment
741 752 constraints. You don't need to worry about the architecture data
742 dma_handle = pci_map_single(pdev, addr, size, direction); 753 alignment constraints (e.g. the alignment constraints about 64-bit
743 if (pci_dma_mapping_error(pdev, dma_handle)) { 754 objects).
744 /*
745 * reduce current DMA mapping usage,
746 * delay and try again later or
747 * reset driver.
748 */
749 }
750 755
751 Closing 756 Closing
752 757
753This document, and the API itself, would not be in it's current 758This document, and the API itself, would not be in its current
754form without the feedback and suggestions from numerous individuals. 759form without the feedback and suggestions from numerous individuals.
755We would like to specifically mention, in no particular order, the 760We would like to specifically mention, in no particular order, the
756following people: 761following people:
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 5aceb88b3f8b..05e2ae236865 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -4,20 +4,18 @@
4 James E.J. Bottomley <James.Bottomley@HansenPartnership.com> 4 James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
5 5
6This document describes the DMA API. For a more gentle introduction 6This document describes the DMA API. For a more gentle introduction
7phrased in terms of the pci_ equivalents (and actual examples) see 7of the API (and actual examples) see
8Documentation/PCI/PCI-DMA-mapping.txt. 8Documentation/DMA-API-HOWTO.txt.
9 9
10This API is split into two pieces. Part I describes the API and the 10This API is split into two pieces. Part I describes the API. Part II
11corresponding pci_ API. Part II describes the extensions to the API 11describes the extensions to the API for supporting non-consistent
12for supporting non-consistent memory machines. Unless you know that 12memory machines. Unless you know that your driver absolutely has to
13your driver absolutely has to support non-consistent platforms (this 13support non-consistent platforms (this is usually only legacy
14is usually only legacy platforms) you should only use the API 14platforms) you should only use the API described in part I.
15described in part I.
16 15
17Part I - pci_ and dma_ Equivalent API 16Part I - dma_ API
18------------------------------------- 17-------------------------------------
19 18
20To get the pci_ API, you must #include <linux/pci.h>
21To get the dma_ API, you must #include <linux/dma-mapping.h> 19To get the dma_ API, you must #include <linux/dma-mapping.h>
22 20
23 21
@@ -27,9 +25,6 @@ Part Ia - Using large dma-coherent buffers
27void * 25void *
28dma_alloc_coherent(struct device *dev, size_t size, 26dma_alloc_coherent(struct device *dev, size_t size,
29 dma_addr_t *dma_handle, gfp_t flag) 27 dma_addr_t *dma_handle, gfp_t flag)
30void *
31pci_alloc_consistent(struct pci_dev *dev, size_t size,
32 dma_addr_t *dma_handle)
33 28
34Consistent memory is memory for which a write by either the device or 29Consistent memory is memory for which a write by either the device or
35the processor can immediately be read by the processor or device 30the processor can immediately be read by the processor or device
@@ -53,15 +48,11 @@ The simplest way to do that is to use the dma_pool calls (see below).
53The flag parameter (dma_alloc_coherent only) allows the caller to 48The flag parameter (dma_alloc_coherent only) allows the caller to
54specify the GFP_ flags (see kmalloc) for the allocation (the 49specify the GFP_ flags (see kmalloc) for the allocation (the
55implementation may choose to ignore flags that affect the location of 50implementation may choose to ignore flags that affect the location of
56the returned memory, like GFP_DMA). For pci_alloc_consistent, you 51the returned memory, like GFP_DMA).
57must assume GFP_ATOMIC behaviour.
58 52
59void 53void
60dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, 54dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
61 dma_addr_t dma_handle) 55 dma_addr_t dma_handle)
62void
63pci_free_consistent(struct pci_dev *dev, size_t size, void *cpu_addr,
64 dma_addr_t dma_handle)
65 56
66Free the region of consistent memory you previously allocated. dev, 57Free the region of consistent memory you previously allocated. dev,
67size and dma_handle must all be the same as those passed into the 58size and dma_handle must all be the same as those passed into the
@@ -89,10 +80,6 @@ for alignment, like queue heads needing to be aligned on N-byte boundaries.
89 dma_pool_create(const char *name, struct device *dev, 80 dma_pool_create(const char *name, struct device *dev,
90 size_t size, size_t align, size_t alloc); 81 size_t size, size_t align, size_t alloc);
91 82
92 struct pci_pool *
93 pci_pool_create(const char *name, struct pci_device *dev,
94 size_t size, size_t align, size_t alloc);
95
96The pool create() routines initialize a pool of dma-coherent buffers 83The pool create() routines initialize a pool of dma-coherent buffers
97for use with a given device. It must be called in a context which 84for use with a given device. It must be called in a context which
98can sleep. 85can sleep.
@@ -108,9 +95,6 @@ from this pool must not cross 4KByte boundaries.
108 void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags, 95 void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
109 dma_addr_t *dma_handle); 96 dma_addr_t *dma_handle);
110 97
111 void *pci_pool_alloc(struct pci_pool *pool, gfp_t gfp_flags,
112 dma_addr_t *dma_handle);
113
114This allocates memory from the pool; the returned memory will meet the size 98This allocates memory from the pool; the returned memory will meet the size
115and alignment requirements specified at creation time. Pass GFP_ATOMIC to 99and alignment requirements specified at creation time. Pass GFP_ATOMIC to
116prevent blocking, or if it's permitted (not in_interrupt, not holding SMP locks), 100prevent blocking, or if it's permitted (not in_interrupt, not holding SMP locks),
@@ -122,9 +106,6 @@ pool's device.
122 void dma_pool_free(struct dma_pool *pool, void *vaddr, 106 void dma_pool_free(struct dma_pool *pool, void *vaddr,
123 dma_addr_t addr); 107 dma_addr_t addr);
124 108
125 void pci_pool_free(struct pci_pool *pool, void *vaddr,
126 dma_addr_t addr);
127
128This puts memory back into the pool. The pool is what was passed to 109This puts memory back into the pool. The pool is what was passed to
129the pool allocation routine; the cpu (vaddr) and dma addresses are what 110the pool allocation routine; the cpu (vaddr) and dma addresses are what
130were returned when that routine allocated the memory being freed. 111were returned when that routine allocated the memory being freed.
@@ -132,8 +113,6 @@ were returned when that routine allocated the memory being freed.
132 113
133 void dma_pool_destroy(struct dma_pool *pool); 114 void dma_pool_destroy(struct dma_pool *pool);
134 115
135 void pci_pool_destroy(struct pci_pool *pool);
136
137The pool destroy() routines free the resources of the pool. They must be 116The pool destroy() routines free the resources of the pool. They must be
138called in a context which can sleep. Make sure you've freed all allocated 117called in a context which can sleep. Make sure you've freed all allocated
139memory back to the pool before you destroy it. 118memory back to the pool before you destroy it.
@@ -144,8 +123,6 @@ Part Ic - DMA addressing limitations
144 123
145int 124int
146dma_supported(struct device *dev, u64 mask) 125dma_supported(struct device *dev, u64 mask)
147int
148pci_dma_supported(struct pci_dev *hwdev, u64 mask)
149 126
150Checks to see if the device can support DMA to the memory described by 127Checks to see if the device can support DMA to the memory described by
151mask. 128mask.
@@ -159,8 +136,14 @@ driver writers.
159 136
160int 137int
161dma_set_mask(struct device *dev, u64 mask) 138dma_set_mask(struct device *dev, u64 mask)
139
140Checks to see if the mask is possible and updates the device
141parameters if it is.
142
143Returns: 0 if successful and a negative error if not.
144
162int 145int
163pci_set_dma_mask(struct pci_device *dev, u64 mask) 146dma_set_coherent_mask(struct device *dev, u64 mask)
164 147
165Checks to see if the mask is possible and updates the device 148Checks to see if the mask is possible and updates the device
166parameters if it is. 149parameters if it is.
@@ -187,9 +170,6 @@ Part Id - Streaming DMA mappings
187dma_addr_t 170dma_addr_t
188dma_map_single(struct device *dev, void *cpu_addr, size_t size, 171dma_map_single(struct device *dev, void *cpu_addr, size_t size,
189 enum dma_data_direction direction) 172 enum dma_data_direction direction)
190dma_addr_t
191pci_map_single(struct pci_dev *hwdev, void *cpu_addr, size_t size,
192 int direction)
193 173
194Maps a piece of processor virtual memory so it can be accessed by the 174Maps a piece of processor virtual memory so it can be accessed by the
195device and returns the physical handle of the memory. 175device and returns the physical handle of the memory.
@@ -198,14 +178,10 @@ The direction for both api's may be converted freely by casting.
198However the dma_ API uses a strongly typed enumerator for its 178However the dma_ API uses a strongly typed enumerator for its
199direction: 179direction:
200 180
201DMA_NONE = PCI_DMA_NONE no direction (used for 181DMA_NONE no direction (used for debugging)
202 debugging) 182DMA_TO_DEVICE data is going from the memory to the device
203DMA_TO_DEVICE = PCI_DMA_TODEVICE data is going from the 183DMA_FROM_DEVICE data is coming from the device to the memory
204 memory to the device 184DMA_BIDIRECTIONAL direction isn't known
205DMA_FROM_DEVICE = PCI_DMA_FROMDEVICE data is coming from
206 the device to the
207 memory
208DMA_BIDIRECTIONAL = PCI_DMA_BIDIRECTIONAL direction isn't known
209 185
210Notes: Not all memory regions in a machine can be mapped by this 186Notes: Not all memory regions in a machine can be mapped by this
211API. Further, regions that appear to be physically contiguous in 187API. Further, regions that appear to be physically contiguous in
@@ -268,9 +244,6 @@ cache lines are updated with data that the device may have changed).
268void 244void
269dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, 245dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
270 enum dma_data_direction direction) 246 enum dma_data_direction direction)
271void
272pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr,
273 size_t size, int direction)
274 247
275Unmaps the region previously mapped. All the parameters passed in 248Unmaps the region previously mapped. All the parameters passed in
276must be identical to those passed in (and returned) by the mapping 249must be identical to those passed in (and returned) by the mapping
@@ -280,15 +253,9 @@ dma_addr_t
280dma_map_page(struct device *dev, struct page *page, 253dma_map_page(struct device *dev, struct page *page,
281 unsigned long offset, size_t size, 254 unsigned long offset, size_t size,
282 enum dma_data_direction direction) 255 enum dma_data_direction direction)
283dma_addr_t
284pci_map_page(struct pci_dev *hwdev, struct page *page,
285 unsigned long offset, size_t size, int direction)
286void 256void
287dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, 257dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
288 enum dma_data_direction direction) 258 enum dma_data_direction direction)
289void
290pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address,
291 size_t size, int direction)
292 259
293API for mapping and unmapping for pages. All the notes and warnings 260API for mapping and unmapping for pages. All the notes and warnings
294for the other mapping APIs apply here. Also, although the <offset> 261for the other mapping APIs apply here. Also, although the <offset>
@@ -299,9 +266,6 @@ cache width is.
299int 266int
300dma_mapping_error(struct device *dev, dma_addr_t dma_addr) 267dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
301 268
302int
303pci_dma_mapping_error(struct pci_dev *hwdev, dma_addr_t dma_addr)
304
305In some circumstances dma_map_single and dma_map_page will fail to create 269In some circumstances dma_map_single and dma_map_page will fail to create
306a mapping. A driver can check for these errors by testing the returned 270a mapping. A driver can check for these errors by testing the returned
307dma address with dma_mapping_error(). A non-zero return value means the mapping 271dma address with dma_mapping_error(). A non-zero return value means the mapping
@@ -311,9 +275,6 @@ reduce current DMA mapping usage or delay and try again later).
311 int 275 int
312 dma_map_sg(struct device *dev, struct scatterlist *sg, 276 dma_map_sg(struct device *dev, struct scatterlist *sg,
313 int nents, enum dma_data_direction direction) 277 int nents, enum dma_data_direction direction)
314 int
315 pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg,
316 int nents, int direction)
317 278
318Returns: the number of physical segments mapped (this may be shorter 279Returns: the number of physical segments mapped (this may be shorter
319than <nents> passed in if some elements of the scatter/gather list are 280than <nents> passed in if some elements of the scatter/gather list are
@@ -353,9 +314,6 @@ accessed sg->address and sg->length as shown above.
353 void 314 void
354 dma_unmap_sg(struct device *dev, struct scatterlist *sg, 315 dma_unmap_sg(struct device *dev, struct scatterlist *sg,
355 int nhwentries, enum dma_data_direction direction) 316 int nhwentries, enum dma_data_direction direction)
356 void
357 pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg,
358 int nents, int direction)
359 317
360Unmap the previously mapped scatter/gather list. All the parameters 318Unmap the previously mapped scatter/gather list. All the parameters
361must be the same as those and passed in to the scatter/gather mapping 319must be the same as those and passed in to the scatter/gather mapping
@@ -365,21 +323,23 @@ Note: <nents> must be the number you passed in, *not* the number of
365physical entries returned. 323physical entries returned.
366 324
367void 325void
368dma_sync_single(struct device *dev, dma_addr_t dma_handle, size_t size, 326dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
369 enum dma_data_direction direction) 327 enum dma_data_direction direction)
370void 328void
371pci_dma_sync_single(struct pci_dev *hwdev, dma_addr_t dma_handle, 329dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
372 size_t size, int direction) 330 enum dma_data_direction direction)
373void 331void
374dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems, 332dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
375 enum dma_data_direction direction) 333 enum dma_data_direction direction)
376void 334void
377pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg, 335dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
378 int nelems, int direction) 336 enum dma_data_direction direction)
379 337
380Synchronise a single contiguous or scatter/gather mapping. All the 338Synchronise a single contiguous or scatter/gather mapping for the cpu
381parameters must be the same as those passed into the single mapping 339and device. With the sync_sg API, all the parameters must be the same
382API. 340as those passed into the single mapping API. With the sync_single API,
341you can use dma_handle and size parameters that aren't identical to
342those passed into the single mapping API to do a partial sync.
383 343
384Notes: You must do this: 344Notes: You must do this:
385 345
@@ -461,9 +421,9 @@ void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr,
461Part II - Advanced dma_ usage 421Part II - Advanced dma_ usage
462----------------------------- 422-----------------------------
463 423
464Warning: These pieces of the DMA API have no PCI equivalent. They 424Warning: These pieces of the DMA API should not be used in the
465should also not be used in the majority of cases, since they cater for 425majority of cases, since they cater for unlikely corner cases that
466unlikely corner cases that don't belong in usual drivers. 426don't belong in usual drivers.
467 427
468If you don't understand how cache line coherency works between a 428If you don't understand how cache line coherency works between a
469processor and an I/O device, you should not be using this part of the 429processor and an I/O device, you should not be using this part of the
@@ -514,16 +474,6 @@ into the width returned by this call. It will also always be a power
514of two for easy alignment. 474of two for easy alignment.
515 475
516void 476void
517dma_sync_single_range(struct device *dev, dma_addr_t dma_handle,
518 unsigned long offset, size_t size,
519 enum dma_data_direction direction)
520
521Does a partial sync, starting at offset and continuing for size. You
522must be careful to observe the cache alignment and width when doing
523anything like this. You must also be extra careful about accessing
524memory you intend to sync partially.
525
526void
527dma_cache_sync(struct device *dev, void *vaddr, size_t size, 477dma_cache_sync(struct device *dev, void *vaddr, size_t size,
528 enum dma_data_direction direction) 478 enum dma_data_direction direction)
529 479
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index 325cfd1d6d99..c7e5dc7e8cb3 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -14,7 +14,7 @@ DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \
14 genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ 14 genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \
15 mac80211.xml debugobjects.xml sh.xml regulator.xml \ 15 mac80211.xml debugobjects.xml sh.xml regulator.xml \
16 alsa-driver-api.xml writing-an-alsa-driver.xml \ 16 alsa-driver-api.xml writing-an-alsa-driver.xml \
17 tracepoint.xml media.xml 17 tracepoint.xml media.xml drm.xml
18 18
19### 19###
20# The build process is as follows (targets): 20# The build process is as follows (targets):
diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl
index f9a6e2c75f12..1b2dd4fc3db2 100644
--- a/Documentation/DocBook/device-drivers.tmpl
+++ b/Documentation/DocBook/device-drivers.tmpl
@@ -45,7 +45,7 @@
45 </sect1> 45 </sect1>
46 46
47 <sect1><title>Atomic and pointer manipulation</title> 47 <sect1><title>Atomic and pointer manipulation</title>
48!Iarch/x86/include/asm/atomic_32.h 48!Iarch/x86/include/asm/atomic.h
49!Iarch/x86/include/asm/unaligned.h 49!Iarch/x86/include/asm/unaligned.h
50 </sect1> 50 </sect1>
51 51
diff --git a/Documentation/DocBook/deviceiobook.tmpl b/Documentation/DocBook/deviceiobook.tmpl
index 3ed88126ab8f..c1ed6a49e598 100644
--- a/Documentation/DocBook/deviceiobook.tmpl
+++ b/Documentation/DocBook/deviceiobook.tmpl
@@ -316,7 +316,7 @@ CPU B: spin_unlock_irqrestore(&amp;dev_lock, flags)
316 316
317 <chapter id="pubfunctions"> 317 <chapter id="pubfunctions">
318 <title>Public Functions Provided</title> 318 <title>Public Functions Provided</title>
319!Iarch/x86/include/asm/io_32.h 319!Iarch/x86/include/asm/io.h
320!Elib/iomap.c 320!Elib/iomap.c
321 </chapter> 321 </chapter>
322 322
diff --git a/Documentation/DocBook/drm.tmpl b/Documentation/DocBook/drm.tmpl
new file mode 100644
index 000000000000..7583dc7cf64d
--- /dev/null
+++ b/Documentation/DocBook/drm.tmpl
@@ -0,0 +1,839 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="drmDevelopersGuide">
6 <bookinfo>
7 <title>Linux DRM Developer's Guide</title>
8
9 <copyright>
10 <year>2008-2009</year>
11 <holder>
12 Intel Corporation (Jesse Barnes &lt;jesse.barnes@intel.com&gt;)
13 </holder>
14 </copyright>
15
16 <legalnotice>
17 <para>
18 The contents of this file may be used under the terms of the GNU
19 General Public License version 2 (the "GPL") as distributed in
20 the kernel source COPYING file.
21 </para>
22 </legalnotice>
23 </bookinfo>
24
25<toc></toc>
26
27 <!-- Introduction -->
28
29 <chapter id="drmIntroduction">
30 <title>Introduction</title>
31 <para>
32 The Linux DRM layer contains code intended to support the needs
33 of complex graphics devices, usually containing programmable
34 pipelines well suited to 3D graphics acceleration. Graphics
35 drivers in the kernel can make use of DRM functions to make
36 tasks like memory management, interrupt handling and DMA easier,
37 and provide a uniform interface to applications.
38 </para>
39 <para>
40 A note on versions: this guide covers features found in the DRM
41 tree, including the TTM memory manager, output configuration and
42 mode setting, and the new vblank internals, in addition to all
43 the regular features found in current kernels.
44 </para>
45 <para>
46 [Insert diagram of typical DRM stack here]
47 </para>
48 </chapter>
49
50 <!-- Internals -->
51
52 <chapter id="drmInternals">
53 <title>DRM Internals</title>
54 <para>
55 This chapter documents DRM internals relevant to driver authors
56 and developers working to add support for the latest features to
57 existing drivers.
58 </para>
59 <para>
60 First, we'll go over some typical driver initialization
61 requirements, like setting up command buffers, creating an
62 initial output configuration, and initializing core services.
63 Subsequent sections will cover core internals in more detail,
64 providing implementation notes and examples.
65 </para>
66 <para>
67 The DRM layer provides several services to graphics drivers,
68 many of them driven by the application interfaces it provides
69 through libdrm, the library that wraps most of the DRM ioctls.
70 These include vblank event handling, memory
71 management, output management, framebuffer management, command
72 submission &amp; fencing, suspend/resume support, and DMA
73 services.
74 </para>
75 <para>
76 The core of every DRM driver is struct drm_device. Drivers
77 will typically statically initialize a drm_device structure,
78 then pass it to drm_init() at load time.
79 </para>
80
81 <!-- Internals: driver init -->
82
83 <sect1>
84 <title>Driver initialization</title>
85 <para>
86 Before calling the DRM initialization routines, the driver must
87 first create and fill out a struct drm_device structure.
88 </para>
89 <programlisting>
90 static struct drm_driver driver = {
91 /* don't use mtrr's here, the Xserver or user space app should
92 * deal with them for intel hardware.
93 */
94 .driver_features =
95 DRIVER_USE_AGP | DRIVER_REQUIRE_AGP |
96 DRIVER_HAVE_IRQ | DRIVER_IRQ_SHARED | DRIVER_MODESET,
97 .load = i915_driver_load,
98 .unload = i915_driver_unload,
99 .firstopen = i915_driver_firstopen,
100 .lastclose = i915_driver_lastclose,
101 .preclose = i915_driver_preclose,
102 .save = i915_save,
103 .restore = i915_restore,
104 .device_is_agp = i915_driver_device_is_agp,
105 .get_vblank_counter = i915_get_vblank_counter,
106 .enable_vblank = i915_enable_vblank,
107 .disable_vblank = i915_disable_vblank,
108 .irq_preinstall = i915_driver_irq_preinstall,
109 .irq_postinstall = i915_driver_irq_postinstall,
110 .irq_uninstall = i915_driver_irq_uninstall,
111 .irq_handler = i915_driver_irq_handler,
112 .reclaim_buffers = drm_core_reclaim_buffers,
113 .get_map_ofs = drm_core_get_map_ofs,
114 .get_reg_ofs = drm_core_get_reg_ofs,
115 .fb_probe = intelfb_probe,
116 .fb_remove = intelfb_remove,
117 .fb_resize = intelfb_resize,
118 .master_create = i915_master_create,
119 .master_destroy = i915_master_destroy,
120#if defined(CONFIG_DEBUG_FS)
121 .debugfs_init = i915_debugfs_init,
122 .debugfs_cleanup = i915_debugfs_cleanup,
123#endif
124 .gem_init_object = i915_gem_init_object,
125 .gem_free_object = i915_gem_free_object,
126 .gem_vm_ops = &amp;i915_gem_vm_ops,
127 .ioctls = i915_ioctls,
128 .fops = {
129 .owner = THIS_MODULE,
130 .open = drm_open,
131 .release = drm_release,
132 .ioctl = drm_ioctl,
133 .mmap = drm_mmap,
134 .poll = drm_poll,
135 .fasync = drm_fasync,
136#ifdef CONFIG_COMPAT
137 .compat_ioctl = i915_compat_ioctl,
138#endif
139 },
140 .pci_driver = {
141 .name = DRIVER_NAME,
142 .id_table = pciidlist,
143 .probe = probe,
144 .remove = __devexit_p(drm_cleanup_pci),
145 },
146 .name = DRIVER_NAME,
147 .desc = DRIVER_DESC,
148 .date = DRIVER_DATE,
149 .major = DRIVER_MAJOR,
150 .minor = DRIVER_MINOR,
151 .patchlevel = DRIVER_PATCHLEVEL,
152 };
153 </programlisting>
154 <para>
155 In the example above, taken from the i915 DRM driver, the driver
156 sets several flags indicating what core features it supports.
157 We'll go over the individual callbacks in later sections. Since
158 flags indicate which features your driver supports to the DRM
159 core, you need to set most of them prior to calling drm_init(). Some,
160 like DRIVER_MODESET can be set later based on user supplied parameters,
161 but that's the exception rather than the rule.
162 </para>
163 <variablelist>
164 <title>Driver flags</title>
165 <varlistentry>
166 <term>DRIVER_USE_AGP</term>
167 <listitem><para>
168 Driver uses AGP interface
169 </para></listitem>
170 </varlistentry>
171 <varlistentry>
172 <term>DRIVER_REQUIRE_AGP</term>
173 <listitem><para>
174 Driver needs AGP interface to function.
175 </para></listitem>
176 </varlistentry>
177 <varlistentry>
178 <term>DRIVER_USE_MTRR</term>
179 <listitem>
180 <para>
181 Driver uses MTRR interface for mapping memory. Deprecated.
182 </para>
183 </listitem>
184 </varlistentry>
185 <varlistentry>
186 <term>DRIVER_PCI_DMA</term>
187 <listitem><para>
188 Driver is capable of PCI DMA. Deprecated.
189 </para></listitem>
190 </varlistentry>
191 <varlistentry>
192 <term>DRIVER_SG</term>
193 <listitem><para>
194 Driver can perform scatter/gather DMA. Deprecated.
195 </para></listitem>
196 </varlistentry>
197 <varlistentry>
198 <term>DRIVER_HAVE_DMA</term>
199 <listitem><para>Driver supports DMA. Deprecated.</para></listitem>
200 </varlistentry>
201 <varlistentry>
202 <term>DRIVER_HAVE_IRQ</term><term>DRIVER_IRQ_SHARED</term>
203 <listitem>
204 <para>
205 DRIVER_HAVE_IRQ indicates whether the driver has a IRQ
206 handler, DRIVER_IRQ_SHARED indicates whether the device &amp;
207 handler support shared IRQs (note that this is required of
208 PCI drivers).
209 </para>
210 </listitem>
211 </varlistentry>
212 <varlistentry>
213 <term>DRIVER_DMA_QUEUE</term>
214 <listitem>
215 <para>
216 If the driver queues DMA requests and completes them
217 asynchronously, this flag should be set. Deprecated.
218 </para>
219 </listitem>
220 </varlistentry>
221 <varlistentry>
222 <term>DRIVER_FB_DMA</term>
223 <listitem>
224 <para>
225 Driver supports DMA to/from the framebuffer. Deprecated.
226 </para>
227 </listitem>
228 </varlistentry>
229 <varlistentry>
230 <term>DRIVER_MODESET</term>
231 <listitem>
232 <para>
233 Driver supports mode setting interfaces.
234 </para>
235 </listitem>
236 </varlistentry>
237 </variablelist>
238 <para>
239 In this specific case, the driver requires AGP and supports
240 IRQs. DMA, as we'll see, is handled by device specific ioctls
241 in this case. It also supports the kernel mode setting APIs, though
242 unlike in the actual i915 driver source, this example unconditionally
243 exports KMS capability.
244 </para>
245 </sect1>
246
247 <!-- Internals: driver load -->
248
249 <sect1>
250 <title>Driver load</title>
251 <para>
252 In the previous section, we saw what a typical drm_driver
253 structure might look like. One of the more important fields in
254 the structure is the hook for the load function.
255 </para>
256 <programlisting>
257 static struct drm_driver driver = {
258 ...
259 .load = i915_driver_load,
260 ...
261 };
262 </programlisting>
263 <para>
264 The load function has many responsibilities: allocating a driver
265 private structure, specifying supported performance counters,
266 configuring the device (e.g. mapping registers &amp; command
267 buffers), initializing the memory manager, and setting up the
268 initial output configuration.
269 </para>
270 <para>
271 Note that the tasks performed at driver load time must not
272 conflict with DRM client requirements. For instance, if user
273 level mode setting drivers are in use, it would be problematic
274 to perform output discovery &amp; configuration at load time.
275 Likewise, if pre-memory management aware user level drivers are
276 in use, memory management and command buffer setup may need to
277 be omitted. These requirements are driver specific, and care
278 needs to be taken to keep both old and new applications and
279 libraries working. The i915 driver supports the "modeset"
280 module parameter to control whether advanced features are
281 enabled at load time or in legacy fashion. If compatibility is
282 a concern (e.g. with drivers converted over to the new interfaces
283 from the old ones), care must be taken to prevent incompatible
284 device initialization and control with the currently active
285 userspace drivers.
286 </para>
287
288 <sect2>
289 <title>Driver private &amp; performance counters</title>
290 <para>
291 The driver private hangs off the main drm_device structure and
292 can be used for tracking various device specific bits of
293 information, like register offsets, command buffer status,
294 register state for suspend/resume, etc. At load time, a
295 driver can simply allocate one and set drm_device.dev_priv
296 appropriately; at unload the driver can free it and set
297 drm_device.dev_priv to NULL.
298 </para>
299 <para>
300 The DRM supports several counters which can be used for rough
301 performance characterization. Note that the DRM stat counter
302 system is not often used by applications, and supporting
303 additional counters is completely optional.
304 </para>
305 <para>
306 These interfaces are deprecated and should not be used. If performance
307 monitoring is desired, the developer should investigate and
308 potentially enhance the kernel perf and tracing infrastructure to export
309 GPU related performance information to performance monitoring
310 tools and applications.
311 </para>
312 </sect2>
313
314 <sect2>
315 <title>Configuring the device</title>
316 <para>
317 Obviously, device configuration will be device specific.
318 However, there are several common operations: finding a
319 device's PCI resources, mapping them, and potentially setting
320 up an IRQ handler.
321 </para>
322 <para>
323 Finding &amp; mapping resources is fairly straightforward. The
324 DRM wrapper functions, drm_get_resource_start() and
325 drm_get_resource_len() can be used to find BARs on the given
326 drm_device struct. Once those values have been retrieved, the
327 driver load function can call drm_addmap() to create a new
328 mapping for the BAR in question. Note you'll probably want a
329 drm_local_map_t in your driver private structure to track any
330 mappings you create.
331<!-- !Fdrivers/gpu/drm/drm_bufs.c drm_get_resource_* -->
332<!-- !Finclude/drm/drmP.h drm_local_map_t -->
333 </para>
334 <para>
335 if compatibility with other operating systems isn't a concern
336 (DRM drivers can run under various BSD variants and OpenSolaris),
337 native Linux calls can be used for the above, e.g. pci_resource_*
338 and iomap*/iounmap. See the Linux device driver book for more
339 info.
340 </para>
341 <para>
342 Once you have a register map, you can use the DRM_READn() and
343 DRM_WRITEn() macros to access the registers on your device, or
344 use driver specific versions to offset into your MMIO space
345 relative to a driver specific base pointer (see I915_READ for
346 example).
347 </para>
348 <para>
349 If your device supports interrupt generation, you may want to
350 setup an interrupt handler at driver load time as well. This
351 is done using the drm_irq_install() function. If your device
352 supports vertical blank interrupts, it should call
353 drm_vblank_init() to initialize the core vblank handling code before
354 enabling interrupts on your device. This ensures the vblank related
355 structures are allocated and allows the core to handle vblank events.
356 </para>
357<!--!Fdrivers/char/drm/drm_irq.c drm_irq_install-->
358 <para>
359 Once your interrupt handler is registered (it'll use your
360 drm_driver.irq_handler as the actual interrupt handling
361 function), you can safely enable interrupts on your device,
362 assuming any other state your interrupt handler uses is also
363 initialized.
364 </para>
365 <para>
366 Another task that may be necessary during configuration is
367 mapping the video BIOS. On many devices, the VBIOS describes
368 device configuration, LCD panel timings (if any), and contains
369 flags indicating device state. Mapping the BIOS can be done
370 using the pci_map_rom() call, a convenience function that
371 takes care of mapping the actual ROM, whether it has been
372 shadowed into memory (typically at address 0xc0000) or exists
373 on the PCI device in the ROM BAR. Note that once you've
374 mapped the ROM and extracted any necessary information, be
375 sure to unmap it; on many devices the ROM address decoder is
376 shared with other BARs, so leaving it mapped can cause
377 undesired behavior like hangs or memory corruption.
378<!--!Fdrivers/pci/rom.c pci_map_rom-->
379 </para>
380 </sect2>
381
382 <sect2>
383 <title>Memory manager initialization</title>
384 <para>
385 In order to allocate command buffers, cursor memory, scanout
386 buffers, etc., as well as support the latest features provided
387 by packages like Mesa and the X.Org X server, your driver
388 should support a memory manager.
389 </para>
390 <para>
391 If your driver supports memory management (it should!), you'll
392 need to set that up at load time as well. How you intialize
393 it depends on which memory manager you're using, TTM or GEM.
394 </para>
395 <sect3>
396 <title>TTM initialization</title>
397 <para>
398 TTM (for Translation Table Manager) manages video memory and
399 aperture space for graphics devices. TTM supports both UMA devices
400 and devices with dedicated video RAM (VRAM), i.e. most discrete
401 graphics devices. If your device has dedicated RAM, supporting
402 TTM is desireable. TTM also integrates tightly with your
403 driver specific buffer execution function. See the radeon
404 driver for examples.
405 </para>
406 <para>
407 The core TTM structure is the ttm_bo_driver struct. It contains
408 several fields with function pointers for initializing the TTM,
409 allocating and freeing memory, waiting for command completion
410 and fence synchronization, and memory migration. See the
411 radeon_ttm.c file for an example of usage.
412 </para>
413 <para>
414 The ttm_global_reference structure is made up of several fields:
415 </para>
416 <programlisting>
417 struct ttm_global_reference {
418 enum ttm_global_types global_type;
419 size_t size;
420 void *object;
421 int (*init) (struct ttm_global_reference *);
422 void (*release) (struct ttm_global_reference *);
423 };
424 </programlisting>
425 <para>
426 There should be one global reference structure for your memory
427 manager as a whole, and there will be others for each object
428 created by the memory manager at runtime. Your global TTM should
429 have a type of TTM_GLOBAL_TTM_MEM. The size field for the global
430 object should be sizeof(struct ttm_mem_global), and the init and
431 release hooks should point at your driver specific init and
432 release routines, which will probably eventually call
433 ttm_mem_global_init and ttm_mem_global_release respectively.
434 </para>
435 <para>
436 Once your global TTM accounting structure is set up and initialized
437 (done by calling ttm_global_item_ref on the global object you
438 just created), you'll need to create a buffer object TTM to
439 provide a pool for buffer object allocation by clients and the
440 kernel itself. The type of this object should be TTM_GLOBAL_TTM_BO,
441 and its size should be sizeof(struct ttm_bo_global). Again,
442 driver specific init and release functions can be provided,
443 likely eventually calling ttm_bo_global_init and
444 ttm_bo_global_release, respectively. Also like the previous
445 object, ttm_global_item_ref is used to create an initial reference
446 count for the TTM, which will call your initalization function.
447 </para>
448 </sect3>
449 <sect3>
450 <title>GEM initialization</title>
451 <para>
452 GEM is an alternative to TTM, designed specifically for UMA
453 devices. It has simpler initialization and execution requirements
454 than TTM, but has no VRAM management capability. Core GEM
455 initialization is comprised of a basic drm_mm_init call to create
456 a GTT DRM MM object, which provides an address space pool for
457 object allocation. In a KMS configuration, the driver will
458 need to allocate and initialize a command ring buffer following
459 basic GEM initialization. Most UMA devices have a so-called
460 "stolen" memory region, which provides space for the initial
461 framebuffer and large, contiguous memory regions required by the
462 device. This space is not typically managed by GEM, and must
463 be initialized separately into its own DRM MM object.
464 </para>
465 <para>
466 Initialization will be driver specific, and will depend on
467 the architecture of the device. In the case of Intel
468 integrated graphics chips like 965GM, GEM initialization can
469 be done by calling the internal GEM init function,
470 i915_gem_do_init(). Since the 965GM is a UMA device
471 (i.e. it doesn't have dedicated VRAM), GEM will manage
472 making regular RAM available for GPU operations. Memory set
473 aside by the BIOS (called "stolen" memory by the i915
474 driver) will be managed by the DRM memrange allocator; the
475 rest of the aperture will be managed by GEM.
476 <programlisting>
477 /* Basic memrange allocator for stolen space (aka vram) */
478 drm_memrange_init(&amp;dev_priv->vram, 0, prealloc_size);
479 /* Let GEM Manage from end of prealloc space to end of aperture */
480 i915_gem_do_init(dev, prealloc_size, agp_size);
481 </programlisting>
482<!--!Edrivers/char/drm/drm_memrange.c-->
483 </para>
484 <para>
485 Once the memory manager has been set up, we can allocate the
486 command buffer. In the i915 case, this is also done with a
487 GEM function, i915_gem_init_ringbuffer().
488 </para>
489 </sect3>
490 </sect2>
491
492 <sect2>
493 <title>Output configuration</title>
494 <para>
495 The final initialization task is output configuration. This involves
496 finding and initializing the CRTCs, encoders and connectors
497 for your device, creating an initial configuration and
498 registering a framebuffer console driver.
499 </para>
500 <sect3>
501 <title>Output discovery and initialization</title>
502 <para>
503 Several core functions exist to create CRTCs, encoders and
504 connectors, namely drm_crtc_init(), drm_connector_init() and
505 drm_encoder_init(), along with several "helper" functions to
506 perform common tasks.
507 </para>
508 <para>
509 Connectors should be registered with sysfs once they've been
510 detected and initialized, using the
511 drm_sysfs_connector_add() function. Likewise, when they're
512 removed from the system, they should be destroyed with
513 drm_sysfs_connector_remove().
514 </para>
515 <programlisting>
516<![CDATA[
517void intel_crt_init(struct drm_device *dev)
518{
519 struct drm_connector *connector;
520 struct intel_output *intel_output;
521
522 intel_output = kzalloc(sizeof(struct intel_output), GFP_KERNEL);
523 if (!intel_output)
524 return;
525
526 connector = &intel_output->base;
527 drm_connector_init(dev, &intel_output->base,
528 &intel_crt_connector_funcs, DRM_MODE_CONNECTOR_VGA);
529
530 drm_encoder_init(dev, &intel_output->enc, &intel_crt_enc_funcs,
531 DRM_MODE_ENCODER_DAC);
532
533 drm_mode_connector_attach_encoder(&intel_output->base,
534 &intel_output->enc);
535
536 /* Set up the DDC bus. */
537 intel_output->ddc_bus = intel_i2c_create(dev, GPIOA, "CRTDDC_A");
538 if (!intel_output->ddc_bus) {
539 dev_printk(KERN_ERR, &dev->pdev->dev, "DDC bus registration "
540 "failed.\n");
541 return;
542 }
543
544 intel_output->type = INTEL_OUTPUT_ANALOG;
545 connector->interlace_allowed = 0;
546 connector->doublescan_allowed = 0;
547
548 drm_encoder_helper_add(&intel_output->enc, &intel_crt_helper_funcs);
549 drm_connector_helper_add(connector, &intel_crt_connector_helper_funcs);
550
551 drm_sysfs_connector_add(connector);
552}
553]]>
554 </programlisting>
555 <para>
556 In the example above (again, taken from the i915 driver), a
557 CRT connector and encoder combination is created. A device
558 specific i2c bus is also created, for fetching EDID data and
559 performing monitor detection. Once the process is complete,
560 the new connector is regsitered with sysfs, to make its
561 properties available to applications.
562 </para>
563 <sect4>
564 <title>Helper functions and core functions</title>
565 <para>
566 Since many PC-class graphics devices have similar display output
567 designs, the DRM provides a set of helper functions to make
568 output management easier. The core helper routines handle
569 encoder re-routing and disabling of unused functions following
570 mode set. Using the helpers is optional, but recommended for
571 devices with PC-style architectures (i.e. a set of display planes
572 for feeding pixels to encoders which are in turn routed to
573 connectors). Devices with more complex requirements needing
574 finer grained management can opt to use the core callbacks
575 directly.
576 </para>
577 <para>
578 [Insert typical diagram here.] [Insert OMAP style config here.]
579 </para>
580 </sect4>
581 <para>
582 For each encoder, CRTC and connector, several functions must
583 be provided, depending on the object type. Encoder objects
584 need should provide a DPMS (basically on/off) function, mode fixup
585 (for converting requested modes into native hardware timings),
586 and prepare, set and commit functions for use by the core DRM
587 helper functions. Connector helpers need to provide mode fetch and
588 validity functions as well as an encoder matching function for
589 returing an ideal encoder for a given connector. The core
590 connector functions include a DPMS callback, (deprecated)
591 save/restore routines, detection, mode probing, property handling,
592 and cleanup functions.
593 </para>
594<!--!Edrivers/char/drm/drm_crtc.h-->
595<!--!Edrivers/char/drm/drm_crtc.c-->
596<!--!Edrivers/char/drm/drm_crtc_helper.c-->
597 </sect3>
598 </sect2>
599 </sect1>
600
601 <!-- Internals: vblank handling -->
602
603 <sect1>
604 <title>VBlank event handling</title>
605 <para>
606 The DRM core exposes two vertical blank related ioctls:
607 DRM_IOCTL_WAIT_VBLANK and DRM_IOCTL_MODESET_CTL.
608<!--!Edrivers/char/drm/drm_irq.c-->
609 </para>
610 <para>
611 DRM_IOCTL_WAIT_VBLANK takes a struct drm_wait_vblank structure
612 as its argument, and is used to block or request a signal when a
613 specified vblank event occurs.
614 </para>
615 <para>
616 DRM_IOCTL_MODESET_CTL should be called by application level
617 drivers before and after mode setting, since on many devices the
618 vertical blank counter will be reset at that time. Internally,
619 the DRM snapshots the last vblank count when the ioctl is called
620 with the _DRM_PRE_MODESET command so that the counter won't go
621 backwards (which is dealt with when _DRM_POST_MODESET is used).
622 </para>
623 <para>
624 To support the functions above, the DRM core provides several
625 helper functions for tracking vertical blank counters, and
626 requires drivers to provide several callbacks:
627 get_vblank_counter(), enable_vblank() and disable_vblank(). The
628 core uses get_vblank_counter() to keep the counter accurate
629 across interrupt disable periods. It should return the current
630 vertical blank event count, which is often tracked in a device
631 register. The enable and disable vblank callbacks should enable
632 and disable vertical blank interrupts, respectively. In the
633 absence of DRM clients waiting on vblank events, the core DRM
634 code will use the disable_vblank() function to disable
635 interrupts, which saves power. They'll be re-enabled again when
636 a client calls the vblank wait ioctl above.
637 </para>
638 <para>
639 Devices that don't provide a count register can simply use an
640 internal atomic counter incremented on every vertical blank
641 interrupt, and can make their enable and disable vblank
642 functions into no-ops.
643 </para>
644 </sect1>
645
646 <sect1>
647 <title>Memory management</title>
648 <para>
649 The memory manager lies at the heart of many DRM operations, and
650 is also required to support advanced client features like OpenGL
651 pbuffers. The DRM currently contains two memory managers, TTM
652 and GEM.
653 </para>
654
655 <sect2>
656 <title>The Translation Table Manager (TTM)</title>
657 <para>
658 TTM was developed by Tungsten Graphics, primarily by Thomas
659 Hellström, and is intended to be a flexible, high performance
660 graphics memory manager.
661 </para>
662 <para>
663 Drivers wishing to support TTM must fill out a drm_bo_driver
664 structure.
665 </para>
666 <para>
667 TTM design background and information belongs here.
668 </para>
669 </sect2>
670
671 <sect2>
672 <title>The Graphics Execution Manager (GEM)</title>
673 <para>
674 GEM is an Intel project, authored by Eric Anholt and Keith
675 Packard. It provides simpler interfaces than TTM, and is well
676 suited for UMA devices.
677 </para>
678 <para>
679 GEM-enabled drivers must provide gem_init_object() and
680 gem_free_object() callbacks to support the core memory
681 allocation routines. They should also provide several driver
682 specific ioctls to support command execution, pinning, buffer
683 read &amp; write, mapping, and domain ownership transfers.
684 </para>
685 <para>
686 On a fundamental level, GEM involves several operations: memory
687 allocation and freeing, command execution, and aperture management
688 at command execution time. Buffer object allocation is relatively
689 straightforward and largely provided by Linux's shmem layer, which
690 provides memory to back each object. When mapped into the GTT
691 or used in a command buffer, the backing pages for an object are
692 flushed to memory and marked write combined so as to be coherent
693 with the GPU. Likewise, when the GPU finishes rendering to an object,
694 if the CPU accesses it, it must be made coherent with the CPU's view
695 of memory, usually involving GPU cache flushing of various kinds.
696 This core CPU&lt;-&gt;GPU coherency management is provided by the GEM
697 set domain function, which evaluates an object's current domain and
698 performs any necessary flushing or synchronization to put the object
699 into the desired coherency domain (note that the object may be busy,
700 i.e. an active render target; in that case the set domain function
701 will block the client and wait for rendering to complete before
702 performing any necessary flushing operations).
703 </para>
704 <para>
705 Perhaps the most important GEM function is providing a command
706 execution interface to clients. Client programs construct command
707 buffers containing references to previously allocated memory objects
708 and submit them to GEM. At that point, GEM will take care to bind
709 all the objects into the GTT, execute the buffer, and provide
710 necessary synchronization between clients accessing the same buffers.
711 This often involves evicting some objects from the GTT and re-binding
712 others (a fairly expensive operation), and providing relocation
713 support which hides fixed GTT offsets from clients. Clients must
714 take care not to submit command buffers that reference more objects
715 than can fit in the GTT or GEM will reject them and no rendering
716 will occur. Similarly, if several objects in the buffer require
717 fence registers to be allocated for correct rendering (e.g. 2D blits
718 on pre-965 chips), care must be taken not to require more fence
719 registers than are available to the client. Such resource management
720 should be abstracted from the client in libdrm.
721 </para>
722 </sect2>
723
724 </sect1>
725
726 <!-- Output management -->
727 <sect1>
728 <title>Output management</title>
729 <para>
730 At the core of the DRM output management code is a set of
731 structures representing CRTCs, encoders and connectors.
732 </para>
733 <para>
734 A CRTC is an abstraction representing a part of the chip that
735 contains a pointer to a scanout buffer. Therefore, the number
736 of CRTCs available determines how many independent scanout
737 buffers can be active at any given time. The CRTC structure
738 contains several fields to support this: a pointer to some video
739 memory, a display mode, and an (x, y) offset into the video
740 memory to support panning or configurations where one piece of
741 video memory spans multiple CRTCs.
742 </para>
743 <para>
744 An encoder takes pixel data from a CRTC and converts it to a
745 format suitable for any attached connectors. On some devices,
746 it may be possible to have a CRTC send data to more than one
747 encoder. In that case, both encoders would receive data from
748 the same scanout buffer, resulting in a "cloned" display
749 configuration across the connectors attached to each encoder.
750 </para>
751 <para>
752 A connector is the final destination for pixel data on a device,
753 and usually connects directly to an external display device like
754 a monitor or laptop panel. A connector can only be attached to
755 one encoder at a time. The connector is also the structure
756 where information about the attached display is kept, so it
757 contains fields for display data, EDID data, DPMS &amp;
758 connection status, and information about modes supported on the
759 attached displays.
760 </para>
761<!--!Edrivers/char/drm/drm_crtc.c-->
762 </sect1>
763
764 <sect1>
765 <title>Framebuffer management</title>
766 <para>
767 In order to set a mode on a given CRTC, encoder and connector
768 configuration, clients need to provide a framebuffer object which
769 will provide a source of pixels for the CRTC to deliver to the encoder(s)
770 and ultimately the connector(s) in the configuration. A framebuffer
771 is fundamentally a driver specific memory object, made into an opaque
772 handle by the DRM addfb function. Once an fb has been created this
773 way it can be passed to the KMS mode setting routines for use in
774 a configuration.
775 </para>
776 </sect1>
777
778 <sect1>
779 <title>Command submission &amp; fencing</title>
780 <para>
781 This should cover a few device specific command submission
782 implementations.
783 </para>
784 </sect1>
785
786 <sect1>
787 <title>Suspend/resume</title>
788 <para>
789 The DRM core provides some suspend/resume code, but drivers
790 wanting full suspend/resume support should provide save() and
791 restore() functions. These will be called at suspend,
792 hibernate, or resume time, and should perform any state save or
793 restore required by your device across suspend or hibernate
794 states.
795 </para>
796 </sect1>
797
798 <sect1>
799 <title>DMA services</title>
800 <para>
801 This should cover how DMA mapping etc. is supported by the core.
802 These functions are deprecated and should not be used.
803 </para>
804 </sect1>
805 </chapter>
806
807 <!-- External interfaces -->
808
809 <chapter id="drmExternals">
810 <title>Userland interfaces</title>
811 <para>
812 The DRM core exports several interfaces to applications,
813 generally intended to be used through corresponding libdrm
814 wrapper functions. In addition, drivers export device specific
815 interfaces for use by userspace drivers &amp; device aware
816 applications through ioctls and sysfs files.
817 </para>
818 <para>
819 External interfaces include: memory mapping, context management,
820 DMA operations, AGP management, vblank control, fence
821 management, memory management, and output management.
822 </para>
823 <para>
824 Cover generic ioctls and sysfs layout here. Only need high
825 level info, since man pages will cover the rest.
826 </para>
827 </chapter>
828
829 <!-- API reference -->
830
831 <appendix id="drmDriverApi">
832 <title>DRM Driver API</title>
833 <para>
834 Include auto-generated API reference here (need to reference it
835 from paragraphs above too).
836 </para>
837 </appendix>
838
839</book>
diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl
index 5cff41a5fa7c..55f12ac37acd 100644
--- a/Documentation/DocBook/kgdb.tmpl
+++ b/Documentation/DocBook/kgdb.tmpl
@@ -4,7 +4,7 @@
4 4
5<book id="kgdbOnLinux"> 5<book id="kgdbOnLinux">
6 <bookinfo> 6 <bookinfo>
7 <title>Using kgdb and the kgdb Internals</title> 7 <title>Using kgdb, kdb and the kernel debugger internals</title>
8 8
9 <authorgroup> 9 <authorgroup>
10 <author> 10 <author>
@@ -17,33 +17,8 @@
17 </affiliation> 17 </affiliation>
18 </author> 18 </author>
19 </authorgroup> 19 </authorgroup>
20
21 <authorgroup>
22 <author>
23 <firstname>Tom</firstname>
24 <surname>Rini</surname>
25 <affiliation>
26 <address>
27 <email>trini@kernel.crashing.org</email>
28 </address>
29 </affiliation>
30 </author>
31 </authorgroup>
32
33 <authorgroup>
34 <author>
35 <firstname>Amit S.</firstname>
36 <surname>Kale</surname>
37 <affiliation>
38 <address>
39 <email>amitkale@linsyssoft.com</email>
40 </address>
41 </affiliation>
42 </author>
43 </authorgroup>
44
45 <copyright> 20 <copyright>
46 <year>2008</year> 21 <year>2008,2010</year>
47 <holder>Wind River Systems, Inc.</holder> 22 <holder>Wind River Systems, Inc.</holder>
48 </copyright> 23 </copyright>
49 <copyright> 24 <copyright>
@@ -69,41 +44,76 @@
69 <chapter id="Introduction"> 44 <chapter id="Introduction">
70 <title>Introduction</title> 45 <title>Introduction</title>
71 <para> 46 <para>
72 kgdb is a source level debugger for linux kernel. It is used along 47 The kernel has two different debugger front ends (kdb and kgdb)
73 with gdb to debug a linux kernel. The expectation is that gdb can 48 which interface to the debug core. It is possible to use either
74 be used to "break in" to the kernel to inspect memory, variables 49 of the debugger front ends and dynamically transition between them
75 and look through call stack information similar to what an 50 if you configure the kernel properly at compile and runtime.
76 application developer would use gdb for. It is possible to place 51 </para>
77 breakpoints in kernel code and perform some limited execution 52 <para>
78 stepping. 53 Kdb is simplistic shell-style interface which you can use on a
54 system console with a keyboard or serial console. You can use it
55 to inspect memory, registers, process lists, dmesg, and even set
56 breakpoints to stop in a certain location. Kdb is not a source
57 level debugger, although you can set breakpoints and execute some
58 basic kernel run control. Kdb is mainly aimed at doing some
59 analysis to aid in development or diagnosing kernel problems. You
60 can access some symbols by name in kernel built-ins or in kernel
61 modules if the code was built
62 with <symbol>CONFIG_KALLSYMS</symbol>.
63 </para>
64 <para>
65 Kgdb is intended to be used as a source level debugger for the
66 Linux kernel. It is used along with gdb to debug a Linux kernel.
67 The expectation is that gdb can be used to "break in" to the
68 kernel to inspect memory, variables and look through call stack
69 information similar to the way an application developer would use
70 gdb to debug an application. It is possible to place breakpoints
71 in kernel code and perform some limited execution stepping.
79 </para> 72 </para>
80 <para> 73 <para>
81 Two machines are required for using kgdb. One of these machines is a 74 Two machines are required for using kgdb. One of these machines is
82 development machine and the other is a test machine. The kernel 75 a development machine and the other is the target machine. The
83 to be debugged runs on the test machine. The development machine 76 kernel to be debugged runs on the target machine. The development
84 runs an instance of gdb against the vmlinux file which contains 77 machine runs an instance of gdb against the vmlinux file which
85 the symbols (not boot image such as bzImage, zImage, uImage...). 78 contains the symbols (not boot image such as bzImage, zImage,
86 In gdb the developer specifies the connection parameters and 79 uImage...). In gdb the developer specifies the connection
87 connects to kgdb. The type of connection a developer makes with 80 parameters and connects to kgdb. The type of connection a
88 gdb depends on the availability of kgdb I/O modules compiled as 81 developer makes with gdb depends on the availability of kgdb I/O
89 builtin's or kernel modules in the test machine's kernel. 82 modules compiled as built-ins or loadable kernel modules in the test
83 machine's kernel.
90 </para> 84 </para>
91 </chapter> 85 </chapter>
92 <chapter id="CompilingAKernel"> 86 <chapter id="CompilingAKernel">
93 <title>Compiling a kernel</title> 87 <title>Compiling a kernel</title>
88 <para>
89 <itemizedlist>
90 <listitem><para>In order to enable compilation of kdb, you must first enable kgdb.</para></listitem>
91 <listitem><para>The kgdb test compile options are described in the kgdb test suite chapter.</para></listitem>
92 </itemizedlist>
93 </para>
94 <sect1 id="CompileKGDB">
95 <title>Kernel config options for kgdb</title>
94 <para> 96 <para>
95 To enable <symbol>CONFIG_KGDB</symbol> you should first turn on 97 To enable <symbol>CONFIG_KGDB</symbol> you should first turn on
96 "Prompt for development and/or incomplete code/drivers" 98 "Prompt for development and/or incomplete code/drivers"
97 (CONFIG_EXPERIMENTAL) in "General setup", then under the 99 (CONFIG_EXPERIMENTAL) in "General setup", then under the
98 "Kernel debugging" select "KGDB: kernel debugging with remote gdb". 100 "Kernel debugging" select "KGDB: kernel debugger".
101 </para>
102 <para>
103 While it is not a hard requirement that you have symbols in your
104 vmlinux file, gdb tends not to be very useful without the symbolic
105 data, so you will want to turn
106 on <symbol>CONFIG_DEBUG_INFO</symbol> which is called "Compile the
107 kernel with debug info" in the config menu.
99 </para> 108 </para>
100 <para> 109 <para>
101 It is advised, but not required that you turn on the 110 It is advised, but not required that you turn on the
102 CONFIG_FRAME_POINTER kernel option. This option inserts code to 111 <symbol>CONFIG_FRAME_POINTER</symbol> kernel option which is called "Compile the
103 into the compiled executable which saves the frame information in 112 kernel with frame pointers" in the config menu. This option
104 registers or on the stack at different points which will allow a 113 inserts code to into the compiled executable which saves the frame
105 debugger such as gdb to more accurately construct stack back traces 114 information in registers or on the stack at different points which
106 while debugging the kernel. 115 allows a debugger such as gdb to more accurately construct
116 stack back traces while debugging the kernel.
107 </para> 117 </para>
108 <para> 118 <para>
109 If the architecture that you are using supports the kernel option 119 If the architecture that you are using supports the kernel option
@@ -116,38 +126,160 @@
116 this option. 126 this option.
117 </para> 127 </para>
118 <para> 128 <para>
119 Next you should choose one of more I/O drivers to interconnect debugging 129 Next you should choose one of more I/O drivers to interconnect
120 host and debugged target. Early boot debugging requires a KGDB 130 debugging host and debugged target. Early boot debugging requires
121 I/O driver that supports early debugging and the driver must be 131 a KGDB I/O driver that supports early debugging and the driver
122 built into the kernel directly. Kgdb I/O driver configuration 132 must be built into the kernel directly. Kgdb I/O driver
123 takes place via kernel or module parameters, see following 133 configuration takes place via kernel or module parameters which
124 chapter. 134 you can learn more about in the in the section that describes the
135 parameter "kgdboc".
125 </para> 136 </para>
126 <para> 137 <para>Here is an example set of .config symbols to enable or
127 The kgdb test compile options are described in the kgdb test suite chapter. 138 disable for kgdb:
139 <itemizedlist>
140 <listitem><para># CONFIG_DEBUG_RODATA is not set</para></listitem>
141 <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem>
142 <listitem><para>CONFIG_KGDB=y</para></listitem>
143 <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem>
144 </itemizedlist>
128 </para> 145 </para>
129 146 </sect1>
147 <sect1 id="CompileKDB">
148 <title>Kernel config options for kdb</title>
149 <para>Kdb is quite a bit more complex than the simple gdbstub
150 sitting on top of the kernel's debug core. Kdb must implement a
151 shell, and also adds some helper functions in other parts of the
152 kernel, responsible for printing out interesting data such as what
153 you would see if you ran "lsmod", or "ps". In order to build kdb
154 into the kernel you follow the same steps as you would for kgdb.
155 </para>
156 <para>The main config option for kdb
157 is <symbol>CONFIG_KGDB_KDB</symbol> which is called "KGDB_KDB:
158 include kdb frontend for kgdb" in the config menu. In theory you
159 would have already also selected an I/O driver such as the
160 CONFIG_KGDB_SERIAL_CONSOLE interface if you plan on using kdb on a
161 serial port, when you were configuring kgdb.
162 </para>
163 <para>If you want to use a PS/2-style keyboard with kdb, you would
164 select CONFIG_KDB_KEYBOARD which is called "KGDB_KDB: keyboard as
165 input device" in the config menu. The CONFIG_KDB_KEYBOARD option
166 is not used for anything in the gdb interface to kgdb. The
167 CONFIG_KDB_KEYBOARD option only works with kdb.
168 </para>
169 <para>Here is an example set of .config symbols to enable/disable kdb:
170 <itemizedlist>
171 <listitem><para># CONFIG_DEBUG_RODATA is not set</para></listitem>
172 <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem>
173 <listitem><para>CONFIG_KGDB=y</para></listitem>
174 <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem>
175 <listitem><para>CONFIG_KGDB_KDB=y</para></listitem>
176 <listitem><para>CONFIG_KDB_KEYBOARD=y</para></listitem>
177 </itemizedlist>
178 </para>
179 </sect1>
130 </chapter> 180 </chapter>
131 <chapter id="EnableKGDB"> 181 <chapter id="kgdbKernelArgs">
132 <title>Enable kgdb for debugging</title> 182 <title>Kernel Debugger Boot Arguments</title>
133 <para> 183 <para>This section describes the various runtime kernel
134 In order to use kgdb you must activate it by passing configuration 184 parameters that affect the configuration of the kernel debugger.
135 information to one of the kgdb I/O drivers. If you do not pass any 185 The following chapter covers using kdb and kgdb as well as
136 configuration information kgdb will not do anything at all. Kgdb 186 provides some examples of the configuration parameters.</para>
137 will only actively hook up to the kernel trap hooks if a kgdb I/O 187 <sect1 id="kgdboc">
138 driver is loaded and configured. If you unconfigure a kgdb I/O 188 <title>Kernel parameter: kgdboc</title>
139 driver, kgdb will unregister all the kernel hook points. 189 <para>The kgdboc driver was originally an abbreviation meant to
190 stand for "kgdb over console". Today it is the primary mechanism
191 to configure how to communicate from gdb to kgdb as well as the
192 devices you want to use to interact with the kdb shell.
193 </para>
194 <para>For kgdb/gdb, kgdboc is designed to work with a single serial
195 port. It is intended to cover the circumstance where you want to
196 use a serial console as your primary console as well as using it to
197 perform kernel debugging. It is also possible to use kgdb on a
198 serial port which is not designated as a system console. Kgdboc
199 may be configured as a kernel built-in or a kernel loadable module.
200 You can only make use of <constant>kgdbwait</constant> and early
201 debugging if you build kgdboc into the kernel as a built-in.
140 </para> 202 </para>
203 <sect2 id="kgdbocArgs">
204 <title>kgdboc arguments</title>
205 <para>Usage: <constant>kgdboc=[kbd][[,]serial_device][,baud]</constant></para>
206 <sect3 id="kgdbocArgs1">
207 <title>Using loadable module or built-in</title>
141 <para> 208 <para>
142 All drivers can be reconfigured at run time, if 209 <orderedlist>
143 <symbol>CONFIG_SYSFS</symbol> and <symbol>CONFIG_MODULES</symbol> 210 <listitem><para>As a kernel built-in:</para>
144 are enabled, by echo'ing a new config string to 211 <para>Use the kernel boot argument: <constant>kgdboc=&lt;tty-device&gt;,[baud]</constant></para></listitem>
145 <constant>/sys/module/&lt;driver&gt;/parameter/&lt;option&gt;</constant>. 212 <listitem>
146 The driver can be unconfigured by passing an empty string. You cannot 213 <para>As a kernel loadable module:</para>
147 change the configuration while the debugger is attached. Make sure 214 <para>Use the command: <constant>modprobe kgdboc kgdboc=&lt;tty-device&gt;,[baud]</constant></para>
148 to detach the debugger with the <constant>detach</constant> command 215 <para>Here are two examples of how you might formate the kgdboc
149 prior to trying unconfigure a kgdb I/O driver. 216 string. The first is for an x86 target using the first serial port.
217 The second example is for the ARM Versatile AB using the second
218 serial port.
219 <orderedlist>
220 <listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem>
221 <listitem><para><constant>kgdboc=ttyAMA1,115200</constant></para></listitem>
222 </orderedlist>
150 </para> 223 </para>
224 </listitem>
225 </orderedlist></para>
226 </sect3>
227 <sect3 id="kgdbocArgs2">
228 <title>Configure kgdboc at runtime with sysfs</title>
229 <para>At run time you can enable or disable kgdboc by echoing a
230 parameters into the sysfs. Here are two examples:</para>
231 <orderedlist>
232 <listitem><para>Enable kgdboc on ttyS0</para>
233 <para><constant>echo ttyS0 &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
234 <listitem><para>Disable kgdboc</para>
235 <para><constant>echo "" &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
236 </orderedlist>
237 <para>NOTE: You do not need to specify the baud if you are
238 configuring the console on tty which is already configured or
239 open.</para>
240 </sect3>
241 <sect3 id="kgdbocArgs3">
242 <title>More examples</title>
243 <para>You can configure kgdboc to use the keyboard, and or a serial device
244 depending on if you are using kdb and or kgdb, in one of the
245 following scenarios.
246 <orderedlist>
247 <listitem><para>kdb and kgdb over only a serial port</para>
248 <para><constant>kgdboc=&lt;serial_device&gt;[,baud]</constant></para>
249 <para>Example: <constant>kgdboc=ttyS0,115200</constant></para>
250 </listitem>
251 <listitem><para>kdb and kgdb with keyboard and a serial port</para>
252 <para><constant>kgdboc=kbd,&lt;serial_device&gt;[,baud]</constant></para>
253 <para>Example: <constant>kgdboc=kbd,ttyS0,115200</constant></para>
254 </listitem>
255 <listitem><para>kdb with a keyboard</para>
256 <para><constant>kgdboc=kbd</constant></para>
257 </listitem>
258 </orderedlist>
259 </para>
260 </sect3>
261 <para>NOTE: Kgdboc does not support interrupting the target via the
262 gdb remote protocol. You must manually send a sysrq-g unless you
263 have a proxy that splits console output to a terminal program.
264 A console proxy has a separate TCP port for the debugger and a separate
265 TCP port for the "human" console. The proxy can take care of sending
266 the sysrq-g for you.
267 </para>
268 <para>When using kgdboc with no debugger proxy, you can end up
269 connecting the debugger at one of two entry points. If an
270 exception occurs after you have loaded kgdboc, a message should
271 print on the console stating it is waiting for the debugger. In
272 this case you disconnect your terminal program and then connect the
273 debugger in its place. If you want to interrupt the target system
274 and forcibly enter a debug session you have to issue a Sysrq
275 sequence and then type the letter <constant>g</constant>. Then
276 you disconnect the terminal session and connect gdb. Your options
277 if you don't like this are to hack gdb to send the sysrq-g for you
278 as well as on the initial connect, or to use a debugger proxy that
279 allows an unmodified gdb to do the debugging.
280 </para>
281 </sect2>
282 </sect1>
151 <sect1 id="kgdbwait"> 283 <sect1 id="kgdbwait">
152 <title>Kernel parameter: kgdbwait</title> 284 <title>Kernel parameter: kgdbwait</title>
153 <para> 285 <para>
@@ -162,103 +294,204 @@
162 </para> 294 </para>
163 <para> 295 <para>
164 The kernel will stop and wait as early as the I/O driver and 296 The kernel will stop and wait as early as the I/O driver and
165 architecture will allow when you use this option. If you build the 297 architecture allows when you use this option. If you build the
166 kgdb I/O driver as a kernel module kgdbwait will not do anything. 298 kgdb I/O driver as a loadable kernel module kgdbwait will not do
299 anything.
167 </para> 300 </para>
168 </sect1> 301 </sect1>
169 <sect1 id="kgdboc"> 302 <sect1 id="kgdbcon">
170 <title>Kernel parameter: kgdboc</title> 303 <title>Kernel parameter: kgdbcon</title>
171 <para> 304 <para> The kgdbcon feature allows you to see printk() messages
172 The kgdboc driver was originally an abbreviation meant to stand for 305 inside gdb while gdb is connected to the kernel. Kdb does not make
173 "kgdb over console". Kgdboc is designed to work with a single 306 use of the kgdbcon feature.
174 serial port. It was meant to cover the circumstance 307 </para>
175 where you wanted to use a serial console as your primary console as 308 <para>Kgdb supports using the gdb serial protocol to send console
176 well as using it to perform kernel debugging. Of course you can 309 messages to the debugger when the debugger is connected and running.
177 also use kgdboc without assigning a console to the same port. 310 There are two ways to activate this feature.
311 <orderedlist>
312 <listitem><para>Activate with the kernel command line option:</para>
313 <para><constant>kgdbcon</constant></para>
314 </listitem>
315 <listitem><para>Use sysfs before configuring an I/O driver</para>
316 <para>
317 <constant>echo 1 &gt; /sys/module/kgdb/parameters/kgdb_use_con</constant>
318 </para>
319 <para>
320 NOTE: If you do this after you configure the kgdb I/O driver, the
321 setting will not take effect until the next point the I/O is
322 reconfigured.
323 </para>
324 </listitem>
325 </orderedlist>
326 <para>IMPORTANT NOTE: You cannot use kgdboc + kgdbcon on a tty that is an
327 active system console. An example incorrect usage is <constant>console=ttyS0,115200 kgdboc=ttyS0 kgdbcon</constant>
328 </para>
329 <para>It is possible to use this option with kgdboc on a tty that is not a system console.
330 </para>
178 </para> 331 </para>
179 <sect2 id="UsingKgdboc"> 332 </sect1>
180 <title>Using kgdboc</title> 333 </chapter>
181 <para> 334 <chapter id="usingKDB">
182 You can configure kgdboc via sysfs or a module or kernel boot line 335 <title>Using kdb</title>
183 parameter depending on if you build with CONFIG_KGDBOC as a module
184 or built-in.
185 <orderedlist>
186 <listitem><para>From the module load or build-in</para>
187 <para><constant>kgdboc=&lt;tty-device&gt;,[baud]</constant></para>
188 <para> 336 <para>
189 The example here would be if your console port was typically ttyS0, you would use something like <constant>kgdboc=ttyS0,115200</constant> or on the ARM Versatile AB you would likely use <constant>kgdboc=ttyAMA0,115200</constant> 337 </para>
338 <sect1 id="quickKDBserial">
339 <title>Quick start for kdb on a serial port</title>
340 <para>This is a quick example of how to use kdb.</para>
341 <para><orderedlist>
342 <listitem><para>Boot kernel with arguments:
343 <itemizedlist>
344 <listitem><para><constant>console=ttyS0,115200 kgdboc=ttyS0,115200</constant></para></listitem>
345 </itemizedlist></para>
346 <para>OR</para>
347 <para>Configure kgdboc after the kernel booted; assuming you are using a serial port console:
348 <itemizedlist>
349 <listitem><para><constant>echo ttyS0 &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
350 </itemizedlist>
190 </para> 351 </para>
191 </listitem> 352 </listitem>
192 <listitem><para>From sysfs</para> 353 <listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para>
193 <para><constant>echo ttyS0 &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para> 354 <itemizedlist>
355 <listitem><para>When logged in as root or with a super user session you can run:</para>
356 <para><constant>echo g &gt; /proc/sysrq-trigger</constant></para></listitem>
357 <listitem><para>Example using minicom 2.2</para>
358 <para>Press: <constant>Control-a</constant></para>
359 <para>Press: <constant>f</constant></para>
360 <para>Press: <constant>g</constant></para>
194 </listitem> 361 </listitem>
195 </orderedlist> 362 <listitem><para>When you have telneted to a terminal server that supports sending a remote break</para>
196 </para> 363 <para>Press: <constant>Control-]</constant></para>
197 <para> 364 <para>Type in:<constant>send break</constant></para>
198 NOTE: Kgdboc does not support interrupting the target via the 365 <para>Press: <constant>Enter</constant></para>
199 gdb remote protocol. You must manually send a sysrq-g unless you 366 <para>Press: <constant>g</constant></para>
200 have a proxy that splits console output to a terminal problem and 367 </listitem>
201 has a separate port for the debugger to connect to that sends the 368 </itemizedlist>
202 sysrq-g for you. 369 </listitem>
370 <listitem><para>From the kdb prompt you can run the "help" command to see a complete list of the commands that are available.</para>
371 <para>Some useful commands in kdb include:
372 <itemizedlist>
373 <listitem><para>lsmod -- Shows where kernel modules are loaded</para></listitem>
374 <listitem><para>ps -- Displays only the active processes</para></listitem>
375 <listitem><para>ps A -- Shows all the processes</para></listitem>
376 <listitem><para>summary -- Shows kernel version info and memory usage</para></listitem>
377 <listitem><para>bt -- Get a backtrace of the current process using dump_stack()</para></listitem>
378 <listitem><para>dmesg -- View the kernel syslog buffer</para></listitem>
379 <listitem><para>go -- Continue the system</para></listitem>
380 </itemizedlist>
203 </para> 381 </para>
204 <para>When using kgdboc with no debugger proxy, you can end up 382 </listitem>
205 connecting the debugger for one of two entry points. If an 383 <listitem>
206 exception occurs after you have loaded kgdboc a message should print 384 <para>When you are done using kdb you need to consider rebooting the
207 on the console stating it is waiting for the debugger. In case you 385 system or using the "go" command to resuming normal kernel
208 disconnect your terminal program and then connect the debugger in 386 execution. If you have paused the kernel for a lengthy period of
209 its place. If you want to interrupt the target system and forcibly 387 time, applications that rely on timely networking or anything to do
210 enter a debug session you have to issue a Sysrq sequence and then 388 with real wall clock time could be adversely affected, so you
211 type the letter <constant>g</constant>. Then you disconnect the 389 should take this into consideration when using the kernel
212 terminal session and connect gdb. Your options if you don't like 390 debugger.</para>
213 this are to hack gdb to send the sysrq-g for you as well as on the 391 </listitem>
214 initial connect, or to use a debugger proxy that allows an 392 </orderedlist></para>
215 unmodified gdb to do the debugging. 393 </sect1>
394 <sect1 id="quickKDBkeyboard">
395 <title>Quick start for kdb using a keyboard connected console</title>
396 <para>This is a quick example of how to use kdb with a keyboard.</para>
397 <para><orderedlist>
398 <listitem><para>Boot kernel with arguments:
399 <itemizedlist>
400 <listitem><para><constant>kgdboc=kbd</constant></para></listitem>
401 </itemizedlist></para>
402 <para>OR</para>
403 <para>Configure kgdboc after the kernel booted:
404 <itemizedlist>
405 <listitem><para><constant>echo kbd &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
406 </itemizedlist>
216 </para> 407 </para>
217 </sect2> 408 </listitem>
409 <listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para>
410 <itemizedlist>
411 <listitem><para>When logged in as root or with a super user session you can run:</para>
412 <para><constant>echo g &gt; /proc/sysrq-trigger</constant></para></listitem>
413 <listitem><para>Example using a laptop keyboard</para>
414 <para>Press and hold down: <constant>Alt</constant></para>
415 <para>Press and hold down: <constant>Fn</constant></para>
416 <para>Press and release the key with the label: <constant>SysRq</constant></para>
417 <para>Release: <constant>Fn</constant></para>
418 <para>Press and release: <constant>g</constant></para>
419 <para>Release: <constant>Alt</constant></para>
420 </listitem>
421 <listitem><para>Example using a PS/2 101-key keyboard</para>
422 <para>Press and hold down: <constant>Alt</constant></para>
423 <para>Press and release the key with the label: <constant>SysRq</constant></para>
424 <para>Press and release: <constant>g</constant></para>
425 <para>Release: <constant>Alt</constant></para>
426 </listitem>
427 </itemizedlist>
428 </listitem>
429 <listitem>
430 <para>Now type in a kdb command such as "help", "dmesg", "bt" or "go" to continue kernel execution.</para>
431 </listitem>
432 </orderedlist></para>
218 </sect1> 433 </sect1>
219 <sect1 id="kgdbcon"> 434 </chapter>
220 <title>Kernel parameter: kgdbcon</title> 435 <chapter id="EnableKGDB">
221 <para> 436 <title>Using kgdb / gdb</title>
222 Kgdb supports using the gdb serial protocol to send console messages 437 <para>In order to use kgdb you must activate it by passing
223 to the debugger when the debugger is connected and running. There 438 configuration information to one of the kgdb I/O drivers. If you
224 are two ways to activate this feature. 439 do not pass any configuration information kgdb will not do anything
440 at all. Kgdb will only actively hook up to the kernel trap hooks
441 if a kgdb I/O driver is loaded and configured. If you unconfigure
442 a kgdb I/O driver, kgdb will unregister all the kernel hook points.
443 </para>
444 <para> All kgdb I/O drivers can be reconfigured at run time, if
445 <symbol>CONFIG_SYSFS</symbol> and <symbol>CONFIG_MODULES</symbol>
446 are enabled, by echo'ing a new config string to
447 <constant>/sys/module/&lt;driver&gt;/parameter/&lt;option&gt;</constant>.
448 The driver can be unconfigured by passing an empty string. You cannot
449 change the configuration while the debugger is attached. Make sure
450 to detach the debugger with the <constant>detach</constant> command
451 prior to trying to unconfigure a kgdb I/O driver.
452 </para>
453 <sect1 id="ConnectingGDB">
454 <title>Connecting with gdb to a serial port</title>
225 <orderedlist> 455 <orderedlist>
226 <listitem><para>Activate with the kernel command line option:</para> 456 <listitem><para>Configure kgdboc</para>
227 <para><constant>kgdbcon</constant></para> 457 <para>Boot kernel with arguments:
458 <itemizedlist>
459 <listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem>
460 </itemizedlist></para>
461 <para>OR</para>
462 <para>Configure kgdboc after the kernel booted:
463 <itemizedlist>
464 <listitem><para><constant>echo ttyS0 &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
465 </itemizedlist></para>
228 </listitem> 466 </listitem>
229 <listitem><para>Use sysfs before configuring an io driver</para> 467 <listitem>
230 <para> 468 <para>Stop kernel execution (break into the debugger)</para>
231 <constant>echo 1 &gt; /sys/module/kgdb/parameters/kgdb_use_con</constant> 469 <para>In order to connect to gdb via kgdboc, the kernel must
232 </para> 470 first be stopped. There are several ways to stop the kernel which
233 <para> 471 include using kgdbwait as a boot argument, via a sysrq-g, or running
234 NOTE: If you do this after you configure the kgdb I/O driver, the 472 the kernel until it takes an exception where it waits for the
235 setting will not take effect until the next point the I/O is 473 debugger to attach.
236 reconfigured. 474 <itemizedlist>
237 </para> 475 <listitem><para>When logged in as root or with a super user session you can run:</para>
476 <para><constant>echo g &gt; /proc/sysrq-trigger</constant></para></listitem>
477 <listitem><para>Example using minicom 2.2</para>
478 <para>Press: <constant>Control-a</constant></para>
479 <para>Press: <constant>f</constant></para>
480 <para>Press: <constant>g</constant></para>
238 </listitem> 481 </listitem>
239 </orderedlist> 482 <listitem><para>When you have telneted to a terminal server that supports sending a remote break</para>
240 </para> 483 <para>Press: <constant>Control-]</constant></para>
241 <para> 484 <para>Type in:<constant>send break</constant></para>
242 IMPORTANT NOTE: Using this option with kgdb over the console 485 <para>Press: <constant>Enter</constant></para>
243 (kgdboc) is not supported. 486 <para>Press: <constant>g</constant></para>
487 </listitem>
488 </itemizedlist>
244 </para> 489 </para>
245 </sect1> 490 </listitem>
246 </chapter> 491 <listitem>
247 <chapter id="ConnectingGDB"> 492 <para>Connect from from gdb</para>
248 <title>Connecting gdb</title>
249 <para>
250 If you are using kgdboc, you need to have used kgdbwait as a boot
251 argument, issued a sysrq-g, or the system you are going to debug
252 has already taken an exception and is waiting for the debugger to
253 attach before you can connect gdb.
254 </para>
255 <para>
256 If you are not using different kgdb I/O driver other than kgdboc,
257 you should be able to connect and the target will automatically
258 respond.
259 </para>
260 <para> 493 <para>
261 Example (using a serial port): 494 Example (using a directly connected port):
262 </para> 495 </para>
263 <programlisting> 496 <programlisting>
264 % gdb ./vmlinux 497 % gdb ./vmlinux
@@ -266,7 +499,7 @@
266 (gdb) target remote /dev/ttyS0 499 (gdb) target remote /dev/ttyS0
267 </programlisting> 500 </programlisting>
268 <para> 501 <para>
269 Example (kgdb to a terminal server on tcp port 2012): 502 Example (kgdb to a terminal server on TCP port 2012):
270 </para> 503 </para>
271 <programlisting> 504 <programlisting>
272 % gdb ./vmlinux 505 % gdb ./vmlinux
@@ -283,6 +516,83 @@
283 communications. You do this prior to issuing the <constant>target 516 communications. You do this prior to issuing the <constant>target
284 remote</constant> command by typing in: <constant>set debug remote 1</constant> 517 remote</constant> command by typing in: <constant>set debug remote 1</constant>
285 </para> 518 </para>
519 </listitem>
520 </orderedlist>
521 <para>Remember if you continue in gdb, and need to "break in" again,
522 you need to issue an other sysrq-g. It is easy to create a simple
523 entry point by putting a breakpoint at <constant>sys_sync</constant>
524 and then you can run "sync" from a shell or script to break into the
525 debugger.</para>
526 </sect1>
527 </chapter>
528 <chapter id="switchKdbKgdb">
529 <title>kgdb and kdb interoperability</title>
530 <para>It is possible to transition between kdb and kgdb dynamically.
531 The debug core will remember which you used the last time and
532 automatically start in the same mode.</para>
533 <sect1>
534 <title>Switching between kdb and kgdb</title>
535 <sect2>
536 <title>Switching from kgdb to kdb</title>
537 <para>
538 There are two ways to switch from kgdb to kdb: you can use gdb to
539 issue a maintenance packet, or you can blindly type the command $3#33.
540 Whenever kernel debugger stops in kgdb mode it will print the
541 message <constant>KGDB or $3#33 for KDB</constant>. It is important
542 to note that you have to type the sequence correctly in one pass.
543 You cannot type a backspace or delete because kgdb will interpret
544 that as part of the debug stream.
545 <orderedlist>
546 <listitem><para>Change from kgdb to kdb by blindly typing:</para>
547 <para><constant>$3#33</constant></para></listitem>
548 <listitem><para>Change from kgdb to kdb with gdb</para>
549 <para><constant>maintenance packet 3</constant></para>
550 <para>NOTE: Now you must kill gdb. Typically you press control-z and
551 issue the command: kill -9 %</para></listitem>
552 </orderedlist>
553 </para>
554 </sect2>
555 <sect2>
556 <title>Change from kdb to kgdb</title>
557 <para>There are two ways you can change from kdb to kgdb. You can
558 manually enter kgdb mode by issuing the kgdb command from the kdb
559 shell prompt, or you can connect gdb while the kdb shell prompt is
560 active. The kdb shell looks for the typical first commands that gdb
561 would issue with the gdb remote protocol and if it sees one of those
562 commands it automatically changes into kgdb mode.</para>
563 <orderedlist>
564 <listitem><para>From kdb issue the command:</para>
565 <para><constant>kgdb</constant></para>
566 <para>Now disconnect your terminal program and connect gdb in its place</para></listitem>
567 <listitem><para>At the kdb prompt, disconnect the terminal program and connect gdb in its place.</para></listitem>
568 </orderedlist>
569 </sect2>
570 </sect1>
571 <sect1>
572 <title>Running kdb commands from gdb</title>
573 <para>It is possible to run a limited set of kdb commands from gdb,
574 using the gdb monitor command. You don't want to execute any of the
575 run control or breakpoint operations, because it can disrupt the
576 state of the kernel debugger. You should be using gdb for
577 breakpoints and run control operations if you have gdb connected.
578 The more useful commands to run are things like lsmod, dmesg, ps or
579 possibly some of the memory information commands. To see all the kdb
580 commands you can run <constant>monitor help</constant>.</para>
581 <para>Example:
582 <informalexample><programlisting>
583(gdb) monitor ps
5841 idle process (state I) and
58527 sleeping system daemon (state M) processes suppressed,
586use 'ps A' to see all.
587Task Addr Pid Parent [*] cpu State Thread Command
588
5890xc78291d0 1 0 0 0 S 0xc7829404 init
5900xc7954150 942 1 0 0 S 0xc7954384 dropbear
5910xc78789c0 944 1 0 0 S 0xc7878bf4 sh
592(gdb)
593 </programlisting></informalexample>
594 </para>
595 </sect1>
286 </chapter> 596 </chapter>
287 <chapter id="KGDBTestSuite"> 597 <chapter id="KGDBTestSuite">
288 <title>kgdb Test Suite</title> 598 <title>kgdb Test Suite</title>
@@ -309,34 +619,36 @@
309 </para> 619 </para>
310 </chapter> 620 </chapter>
311 <chapter id="CommonBackEndReq"> 621 <chapter id="CommonBackEndReq">
312 <title>KGDB Internals</title> 622 <title>Kernel Debugger Internals</title>
313 <sect1 id="kgdbArchitecture"> 623 <sect1 id="kgdbArchitecture">
314 <title>Architecture Specifics</title> 624 <title>Architecture Specifics</title>
315 <para> 625 <para>
316 Kgdb is organized into three basic components: 626 The kernel debugger is organized into a number of components:
317 <orderedlist> 627 <orderedlist>
318 <listitem><para>kgdb core</para> 628 <listitem><para>The debug core</para>
319 <para> 629 <para>
320 The kgdb core is found in kernel/kgdb.c. It contains: 630 The debug core is found in kernel/debugger/debug_core.c. It contains:
321 <itemizedlist> 631 <itemizedlist>
322 <listitem><para>All the logic to implement the gdb serial protocol</para></listitem> 632 <listitem><para>A generic OS exception handler which includes
323 <listitem><para>A generic OS exception handler which includes sync'ing the processors into a stopped state on an multi cpu system.</para></listitem> 633 sync'ing the processors into a stopped state on an multi-CPU
634 system.</para></listitem>
324 <listitem><para>The API to talk to the kgdb I/O drivers</para></listitem> 635 <listitem><para>The API to talk to the kgdb I/O drivers</para></listitem>
325 <listitem><para>The API to make calls to the arch specific kgdb implementation</para></listitem> 636 <listitem><para>The API to make calls to the arch-specific kgdb implementation</para></listitem>
326 <listitem><para>The logic to perform safe memory reads and writes to memory while using the debugger</para></listitem> 637 <listitem><para>The logic to perform safe memory reads and writes to memory while using the debugger</para></listitem>
327 <listitem><para>A full implementation for software breakpoints unless overridden by the arch</para></listitem> 638 <listitem><para>A full implementation for software breakpoints unless overridden by the arch</para></listitem>
639 <listitem><para>The API to invoke either the kdb or kgdb frontend to the debug core.</para></listitem>
328 </itemizedlist> 640 </itemizedlist>
329 </para> 641 </para>
330 </listitem> 642 </listitem>
331 <listitem><para>kgdb arch specific implementation</para> 643 <listitem><para>kgdb arch-specific implementation</para>
332 <para> 644 <para>
333 This implementation is generally found in arch/*/kernel/kgdb.c. 645 This implementation is generally found in arch/*/kernel/kgdb.c.
334 As an example, arch/x86/kernel/kgdb.c contains the specifics to 646 As an example, arch/x86/kernel/kgdb.c contains the specifics to
335 implement HW breakpoint as well as the initialization to 647 implement HW breakpoint as well as the initialization to
336 dynamically register and unregister for the trap handlers on 648 dynamically register and unregister for the trap handlers on
337 this architecture. The arch specific portion implements: 649 this architecture. The arch-specific portion implements:
338 <itemizedlist> 650 <itemizedlist>
339 <listitem><para>contains an arch specific trap catcher which 651 <listitem><para>contains an arch-specific trap catcher which
340 invokes kgdb_handle_exception() to start kgdb about doing its 652 invokes kgdb_handle_exception() to start kgdb about doing its
341 work</para></listitem> 653 work</para></listitem>
342 <listitem><para>translation to and from gdb specific packet format to pt_regs</para></listitem> 654 <listitem><para>translation to and from gdb specific packet format to pt_regs</para></listitem>
@@ -347,11 +659,35 @@
347 </itemizedlist> 659 </itemizedlist>
348 </para> 660 </para>
349 </listitem> 661 </listitem>
662 <listitem><para>gdbstub frontend (aka kgdb)</para>
663 <para>The gdbstub is located in kernel/debug/gdbstub.c. It contains:</para>
664 <itemizedlist>
665 <listitem><para>All the logic to implement the gdb serial protocol</para></listitem>
666 </itemizedlist>
667 </listitem>
668 <listitem><para>kdb frontend</para>
669 <para>The kdb debugger shell is broken down into a number of
670 components. The kdb core is located in kernel/debug/kdb. There
671 are a number of helper functions in some of the other kernel
672 components to make it possible for kdb to examine and report
673 information about the kernel without taking locks that could
674 cause a kernel deadlock. The kdb core contains implements the following functionality.</para>
675 <itemizedlist>
676 <listitem><para>A simple shell</para></listitem>
677 <listitem><para>The kdb core command set</para></listitem>
678 <listitem><para>A registration API to register additional kdb shell commands.</para>
679 <para>A good example of a self-contained kdb module is the "ftdump" command for dumping the ftrace buffer. See: kernel/trace/trace_kdb.c</para></listitem>
680 <listitem><para>The implementation for kdb_printf() which
681 emits messages directly to I/O drivers, bypassing the kernel
682 log.</para></listitem>
683 <listitem><para>SW / HW breakpoint management for the kdb shell</para></listitem>
684 </itemizedlist>
685 </listitem>
350 <listitem><para>kgdb I/O driver</para> 686 <listitem><para>kgdb I/O driver</para>
351 <para> 687 <para>
352 Each kgdb I/O driver has to provide an implemenation for the following: 688 Each kgdb I/O driver has to provide an implementation for the following:
353 <itemizedlist> 689 <itemizedlist>
354 <listitem><para>configuration via builtin or module</para></listitem> 690 <listitem><para>configuration via built-in or module</para></listitem>
355 <listitem><para>dynamic configuration and kgdb hook registration calls</para></listitem> 691 <listitem><para>dynamic configuration and kgdb hook registration calls</para></listitem>
356 <listitem><para>read and write character interface</para></listitem> 692 <listitem><para>read and write character interface</para></listitem>
357 <listitem><para>A cleanup handler for unconfiguring from the kgdb core</para></listitem> 693 <listitem><para>A cleanup handler for unconfiguring from the kgdb core</para></listitem>
@@ -416,15 +752,15 @@
416 underlying low level to the hardware driver having "polling hooks" 752 underlying low level to the hardware driver having "polling hooks"
417 which the to which the tty driver is attached. In the initial 753 which the to which the tty driver is attached. In the initial
418 implementation of kgdboc it the serial_core was changed to expose a 754 implementation of kgdboc it the serial_core was changed to expose a
419 low level uart hook for doing polled mode reading and writing of a 755 low level UART hook for doing polled mode reading and writing of a
420 single character while in an atomic context. When kgdb makes an I/O 756 single character while in an atomic context. When kgdb makes an I/O
421 request to the debugger, kgdboc invokes a call back in the serial 757 request to the debugger, kgdboc invokes a call back in the serial
422 core which in turn uses the call back in the uart driver. It is 758 core which in turn uses the call back in the UART driver. It is
423 certainly possible to extend kgdboc to work with non-uart based 759 certainly possible to extend kgdboc to work with non-UART based
424 consoles in the future. 760 consoles in the future.
425 </para> 761 </para>
426 <para> 762 <para>
427 When using kgdboc with a uart, the uart driver must implement two callbacks in the <constant>struct uart_ops</constant>. Example from drivers/8250.c:<programlisting> 763 When using kgdboc with a UART, the UART driver must implement two callbacks in the <constant>struct uart_ops</constant>. Example from drivers/8250.c:<programlisting>
428#ifdef CONFIG_CONSOLE_POLL 764#ifdef CONFIG_CONSOLE_POLL
429 .poll_get_char = serial8250_get_poll_char, 765 .poll_get_char = serial8250_get_poll_char,
430 .poll_put_char = serial8250_put_poll_char, 766 .poll_put_char = serial8250_put_poll_char,
@@ -434,7 +770,7 @@
434 <constant>#ifdef CONFIG_CONSOLE_POLL</constant>, as shown above. 770 <constant>#ifdef CONFIG_CONSOLE_POLL</constant>, as shown above.
435 Keep in mind that polling hooks have to be implemented in such a way 771 Keep in mind that polling hooks have to be implemented in such a way
436 that they can be called from an atomic context and have to restore 772 that they can be called from an atomic context and have to restore
437 the state of the uart chip on return such that the system can return 773 the state of the UART chip on return such that the system can return
438 to normal when the debugger detaches. You need to be very careful 774 to normal when the debugger detaches. You need to be very careful
439 with any kind of lock you consider, because failing here is most 775 with any kind of lock you consider, because failing here is most
440 going to mean pressing the reset button. 776 going to mean pressing the reset button.
@@ -453,6 +789,10 @@
453 <itemizedlist> 789 <itemizedlist>
454 <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem> 790 <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem>
455 </itemizedlist> 791 </itemizedlist>
792 In Jan 2010 this document was updated to include kdb.
793 <itemizedlist>
794 <listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem>
795 </itemizedlist>
456 </para> 796 </para>
457 </chapter> 797 </chapter>
458</book> 798</book>
diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl
index ba9975771503..8c5411cfeaf0 100644
--- a/Documentation/DocBook/libata.tmpl
+++ b/Documentation/DocBook/libata.tmpl
@@ -81,16 +81,14 @@ void (*port_disable) (struct ata_port *);
81 </programlisting> 81 </programlisting>
82 82
83 <para> 83 <para>
84 Called from ata_bus_probe() and ata_bus_reset() error paths, 84 Called from ata_bus_probe() error path, as well as when
85 as well as when unregistering from the SCSI module (rmmod, hot 85 unregistering from the SCSI module (rmmod, hot unplug).
86 unplug).
87 This function should do whatever needs to be done to take the 86 This function should do whatever needs to be done to take the
88 port out of use. In most cases, ata_port_disable() can be used 87 port out of use. In most cases, ata_port_disable() can be used
89 as this hook. 88 as this hook.
90 </para> 89 </para>
91 <para> 90 <para>
92 Called from ata_bus_probe() on a failed probe. 91 Called from ata_bus_probe() on a failed probe.
93 Called from ata_bus_reset() on a failed bus reset.
94 Called from ata_scsi_release(). 92 Called from ata_scsi_release().
95 </para> 93 </para>
96 94
@@ -107,10 +105,6 @@ void (*dev_config) (struct ata_port *, struct ata_device *);
107 issue of SET FEATURES - XFER MODE, and prior to operation. 105 issue of SET FEATURES - XFER MODE, and prior to operation.
108 </para> 106 </para>
109 <para> 107 <para>
110 Called by ata_device_add() after ata_dev_identify() determines
111 a device is present.
112 </para>
113 <para>
114 This entry may be specified as NULL in ata_port_operations. 108 This entry may be specified as NULL in ata_port_operations.
115 </para> 109 </para>
116 110
@@ -154,8 +148,8 @@ unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned in
154 148
155 <sect2><title>Taskfile read/write</title> 149 <sect2><title>Taskfile read/write</title>
156 <programlisting> 150 <programlisting>
157void (*tf_load) (struct ata_port *ap, struct ata_taskfile *tf); 151void (*sff_tf_load) (struct ata_port *ap, struct ata_taskfile *tf);
158void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf); 152void (*sff_tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
159 </programlisting> 153 </programlisting>
160 154
161 <para> 155 <para>
@@ -164,36 +158,35 @@ void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
164 hardware registers / DMA buffers, to obtain the current set of 158 hardware registers / DMA buffers, to obtain the current set of
165 taskfile register values. 159 taskfile register values.
166 Most drivers for taskfile-based hardware (PIO or MMIO) use 160 Most drivers for taskfile-based hardware (PIO or MMIO) use
167 ata_tf_load() and ata_tf_read() for these hooks. 161 ata_sff_tf_load() and ata_sff_tf_read() for these hooks.
168 </para> 162 </para>
169 163
170 </sect2> 164 </sect2>
171 165
172 <sect2><title>PIO data read/write</title> 166 <sect2><title>PIO data read/write</title>
173 <programlisting> 167 <programlisting>
174void (*data_xfer) (struct ata_device *, unsigned char *, unsigned int, int); 168void (*sff_data_xfer) (struct ata_device *, unsigned char *, unsigned int, int);
175 </programlisting> 169 </programlisting>
176 170
177 <para> 171 <para>
178All bmdma-style drivers must implement this hook. This is the low-level 172All bmdma-style drivers must implement this hook. This is the low-level
179operation that actually copies the data bytes during a PIO data 173operation that actually copies the data bytes during a PIO data
180transfer. 174transfer.
181Typically the driver 175Typically the driver will choose one of ata_sff_data_xfer_noirq(),
182will choose one of ata_pio_data_xfer_noirq(), ata_pio_data_xfer(), or 176ata_sff_data_xfer(), or ata_sff_data_xfer32().
183ata_mmio_data_xfer().
184 </para> 177 </para>
185 178
186 </sect2> 179 </sect2>
187 180
188 <sect2><title>ATA command execute</title> 181 <sect2><title>ATA command execute</title>
189 <programlisting> 182 <programlisting>
190void (*exec_command)(struct ata_port *ap, struct ata_taskfile *tf); 183void (*sff_exec_command)(struct ata_port *ap, struct ata_taskfile *tf);
191 </programlisting> 184 </programlisting>
192 185
193 <para> 186 <para>
194 causes an ATA command, previously loaded with 187 causes an ATA command, previously loaded with
195 ->tf_load(), to be initiated in hardware. 188 ->tf_load(), to be initiated in hardware.
196 Most drivers for taskfile-based hardware use ata_exec_command() 189 Most drivers for taskfile-based hardware use ata_sff_exec_command()
197 for this hook. 190 for this hook.
198 </para> 191 </para>
199 192
@@ -218,8 +211,8 @@ command.
218 211
219 <sect2><title>Read specific ATA shadow registers</title> 212 <sect2><title>Read specific ATA shadow registers</title>
220 <programlisting> 213 <programlisting>
221u8 (*check_status)(struct ata_port *ap); 214u8 (*sff_check_status)(struct ata_port *ap);
222u8 (*check_altstatus)(struct ata_port *ap); 215u8 (*sff_check_altstatus)(struct ata_port *ap);
223 </programlisting> 216 </programlisting>
224 217
225 <para> 218 <para>
@@ -227,20 +220,26 @@ u8 (*check_altstatus)(struct ata_port *ap);
227 hardware. On some hardware, reading the Status register has 220 hardware. On some hardware, reading the Status register has
228 the side effect of clearing the interrupt condition. 221 the side effect of clearing the interrupt condition.
229 Most drivers for taskfile-based hardware use 222 Most drivers for taskfile-based hardware use
230 ata_check_status() for this hook. 223 ata_sff_check_status() for this hook.
231 </para> 224 </para>
225
226 </sect2>
227
228 <sect2><title>Write specific ATA shadow register</title>
229 <programlisting>
230void (*sff_set_devctl)(struct ata_port *ap, u8 ctl);
231 </programlisting>
232
232 <para> 233 <para>
233 Note that because this is called from ata_device_add(), at 234 Write the device control ATA shadow register to the hardware.
234 least a dummy function that clears device interrupts must be 235 Most drivers don't need to define this.
235 provided for all drivers, even if the controller doesn't
236 actually have a taskfile status register.
237 </para> 236 </para>
238 237
239 </sect2> 238 </sect2>
240 239
241 <sect2><title>Select ATA device on bus</title> 240 <sect2><title>Select ATA device on bus</title>
242 <programlisting> 241 <programlisting>
243void (*dev_select)(struct ata_port *ap, unsigned int device); 242void (*sff_dev_select)(struct ata_port *ap, unsigned int device);
244 </programlisting> 243 </programlisting>
245 244
246 <para> 245 <para>
@@ -251,9 +250,7 @@ void (*dev_select)(struct ata_port *ap, unsigned int device);
251 </para> 250 </para>
252 <para> 251 <para>
253 Most drivers for taskfile-based hardware use 252 Most drivers for taskfile-based hardware use
254 ata_std_dev_select() for this hook. Controllers which do not 253 ata_sff_dev_select() for this hook.
255 support second drives on a port (such as SATA contollers) will
256 use ata_noop_dev_select().
257 </para> 254 </para>
258 255
259 </sect2> 256 </sect2>
@@ -441,13 +438,13 @@ void (*irq_clear) (struct ata_port *);
441 to struct ata_host_set. 438 to struct ata_host_set.
442 </para> 439 </para>
443 <para> 440 <para>
444 Most legacy IDE drivers use ata_interrupt() for the 441 Most legacy IDE drivers use ata_sff_interrupt() for the
445 irq_handler hook, which scans all ports in the host_set, 442 irq_handler hook, which scans all ports in the host_set,
446 determines which queued command was active (if any), and calls 443 determines which queued command was active (if any), and calls
447 ata_host_intr(ap,qc). 444 ata_sff_host_intr(ap,qc).
448 </para> 445 </para>
449 <para> 446 <para>
450 Most legacy IDE drivers use ata_bmdma_irq_clear() for the 447 Most legacy IDE drivers use ata_sff_irq_clear() for the
451 irq_clear() hook, which simply clears the interrupt and error 448 irq_clear() hook, which simply clears the interrupt and error
452 flags in the DMA status register. 449 flags in the DMA status register.
453 </para> 450 </para>
@@ -490,16 +487,12 @@ void (*host_stop) (struct ata_host_set *host_set);
490 allocates space for a legacy IDE PRD table and returns. 487 allocates space for a legacy IDE PRD table and returns.
491 </para> 488 </para>
492 <para> 489 <para>
493 ->port_stop() is called after ->host_stop(). It's sole function 490 ->port_stop() is called after ->host_stop(). Its sole function
494 is to release DMA/memory resources, now that they are no longer 491 is to release DMA/memory resources, now that they are no longer
495 actively being used. Many drivers also free driver-private 492 actively being used. Many drivers also free driver-private
496 data from port at this time. 493 data from port at this time.
497 </para> 494 </para>
498 <para> 495 <para>
499 Many drivers use ata_port_stop() as this hook, which frees the
500 PRD table.
501 </para>
502 <para>
503 ->host_stop() is called after all ->port_stop() calls 496 ->host_stop() is called after all ->port_stop() calls
504have completed. The hook must finalize hardware shutdown, release DMA 497have completed. The hook must finalize hardware shutdown, release DMA
505and other resources, etc. 498and other resources, etc.
diff --git a/Documentation/DocBook/mac80211.tmpl b/Documentation/DocBook/mac80211.tmpl
index f3f37f141dbd..affb15a344a1 100644
--- a/Documentation/DocBook/mac80211.tmpl
+++ b/Documentation/DocBook/mac80211.tmpl
@@ -144,7 +144,7 @@ usage should require reading the full document.
144 this though and the recommendation to allow only a single 144 this though and the recommendation to allow only a single
145 interface in STA mode at first! 145 interface in STA mode at first!
146 </para> 146 </para>
147!Finclude/net/mac80211.h ieee80211_if_init_conf 147!Finclude/net/mac80211.h ieee80211_vif
148 </chapter> 148 </chapter>
149 149
150 <chapter id="rx-tx"> 150 <chapter id="rx-tx">
@@ -234,7 +234,6 @@ usage should require reading the full document.
234 <title>Multiple queues and QoS support</title> 234 <title>Multiple queues and QoS support</title>
235 <para>TBD</para> 235 <para>TBD</para>
236!Finclude/net/mac80211.h ieee80211_tx_queue_params 236!Finclude/net/mac80211.h ieee80211_tx_queue_params
237!Finclude/net/mac80211.h ieee80211_tx_queue_stats
238 </chapter> 237 </chapter>
239 238
240 <chapter id="AP"> 239 <chapter id="AP">
diff --git a/Documentation/DocBook/media-entities.tmpl b/Documentation/DocBook/media-entities.tmpl
index c725cb852c54..5d4d40f429a5 100644
--- a/Documentation/DocBook/media-entities.tmpl
+++ b/Documentation/DocBook/media-entities.tmpl
@@ -17,6 +17,7 @@
17<!ENTITY VIDIOC-DBG-G-REGISTER "<link linkend='vidioc-dbg-g-register'><constant>VIDIOC_DBG_G_REGISTER</constant></link>"> 17<!ENTITY VIDIOC-DBG-G-REGISTER "<link linkend='vidioc-dbg-g-register'><constant>VIDIOC_DBG_G_REGISTER</constant></link>">
18<!ENTITY VIDIOC-DBG-S-REGISTER "<link linkend='vidioc-dbg-g-register'><constant>VIDIOC_DBG_S_REGISTER</constant></link>"> 18<!ENTITY VIDIOC-DBG-S-REGISTER "<link linkend='vidioc-dbg-g-register'><constant>VIDIOC_DBG_S_REGISTER</constant></link>">
19<!ENTITY VIDIOC-DQBUF "<link linkend='vidioc-qbuf'><constant>VIDIOC_DQBUF</constant></link>"> 19<!ENTITY VIDIOC-DQBUF "<link linkend='vidioc-qbuf'><constant>VIDIOC_DQBUF</constant></link>">
20<!ENTITY VIDIOC-DQEVENT "<link linkend='vidioc-dqevent'><constant>VIDIOC_DQEVENT</constant></link>">
20<!ENTITY VIDIOC-ENCODER-CMD "<link linkend='vidioc-encoder-cmd'><constant>VIDIOC_ENCODER_CMD</constant></link>"> 21<!ENTITY VIDIOC-ENCODER-CMD "<link linkend='vidioc-encoder-cmd'><constant>VIDIOC_ENCODER_CMD</constant></link>">
21<!ENTITY VIDIOC-ENUMAUDIO "<link linkend='vidioc-enumaudio'><constant>VIDIOC_ENUMAUDIO</constant></link>"> 22<!ENTITY VIDIOC-ENUMAUDIO "<link linkend='vidioc-enumaudio'><constant>VIDIOC_ENUMAUDIO</constant></link>">
22<!ENTITY VIDIOC-ENUMAUDOUT "<link linkend='vidioc-enumaudioout'><constant>VIDIOC_ENUMAUDOUT</constant></link>"> 23<!ENTITY VIDIOC-ENUMAUDOUT "<link linkend='vidioc-enumaudioout'><constant>VIDIOC_ENUMAUDOUT</constant></link>">
@@ -60,6 +61,7 @@
60<!ENTITY VIDIOC-REQBUFS "<link linkend='vidioc-reqbufs'><constant>VIDIOC_REQBUFS</constant></link>"> 61<!ENTITY VIDIOC-REQBUFS "<link linkend='vidioc-reqbufs'><constant>VIDIOC_REQBUFS</constant></link>">
61<!ENTITY VIDIOC-STREAMOFF "<link linkend='vidioc-streamon'><constant>VIDIOC_STREAMOFF</constant></link>"> 62<!ENTITY VIDIOC-STREAMOFF "<link linkend='vidioc-streamon'><constant>VIDIOC_STREAMOFF</constant></link>">
62<!ENTITY VIDIOC-STREAMON "<link linkend='vidioc-streamon'><constant>VIDIOC_STREAMON</constant></link>"> 63<!ENTITY VIDIOC-STREAMON "<link linkend='vidioc-streamon'><constant>VIDIOC_STREAMON</constant></link>">
64<!ENTITY VIDIOC-SUBSCRIBE-EVENT "<link linkend='vidioc-subscribe-event'><constant>VIDIOC_SUBSCRIBE_EVENT</constant></link>">
63<!ENTITY VIDIOC-S-AUDIO "<link linkend='vidioc-g-audio'><constant>VIDIOC_S_AUDIO</constant></link>"> 65<!ENTITY VIDIOC-S-AUDIO "<link linkend='vidioc-g-audio'><constant>VIDIOC_S_AUDIO</constant></link>">
64<!ENTITY VIDIOC-S-AUDOUT "<link linkend='vidioc-g-audioout'><constant>VIDIOC_S_AUDOUT</constant></link>"> 66<!ENTITY VIDIOC-S-AUDOUT "<link linkend='vidioc-g-audioout'><constant>VIDIOC_S_AUDOUT</constant></link>">
65<!ENTITY VIDIOC-S-CROP "<link linkend='vidioc-g-crop'><constant>VIDIOC_S_CROP</constant></link>"> 67<!ENTITY VIDIOC-S-CROP "<link linkend='vidioc-g-crop'><constant>VIDIOC_S_CROP</constant></link>">
@@ -83,6 +85,7 @@
83<!ENTITY VIDIOC-TRY-ENCODER-CMD "<link linkend='vidioc-encoder-cmd'><constant>VIDIOC_TRY_ENCODER_CMD</constant></link>"> 85<!ENTITY VIDIOC-TRY-ENCODER-CMD "<link linkend='vidioc-encoder-cmd'><constant>VIDIOC_TRY_ENCODER_CMD</constant></link>">
84<!ENTITY VIDIOC-TRY-EXT-CTRLS "<link linkend='vidioc-g-ext-ctrls'><constant>VIDIOC_TRY_EXT_CTRLS</constant></link>"> 86<!ENTITY VIDIOC-TRY-EXT-CTRLS "<link linkend='vidioc-g-ext-ctrls'><constant>VIDIOC_TRY_EXT_CTRLS</constant></link>">
85<!ENTITY VIDIOC-TRY-FMT "<link linkend='vidioc-g-fmt'><constant>VIDIOC_TRY_FMT</constant></link>"> 87<!ENTITY VIDIOC-TRY-FMT "<link linkend='vidioc-g-fmt'><constant>VIDIOC_TRY_FMT</constant></link>">
88<!ENTITY VIDIOC-UNSUBSCRIBE-EVENT "<link linkend='vidioc-subscribe-event'><constant>VIDIOC_UNSUBSCRIBE_EVENT</constant></link>">
86 89
87<!-- Types --> 90<!-- Types -->
88<!ENTITY v4l2-std-id "<link linkend='v4l2-std-id'>v4l2_std_id</link>"> 91<!ENTITY v4l2-std-id "<link linkend='v4l2-std-id'>v4l2_std_id</link>">
@@ -141,6 +144,9 @@
141<!ENTITY v4l2-enc-idx "struct&nbsp;<link linkend='v4l2-enc-idx'>v4l2_enc_idx</link>"> 144<!ENTITY v4l2-enc-idx "struct&nbsp;<link linkend='v4l2-enc-idx'>v4l2_enc_idx</link>">
142<!ENTITY v4l2-enc-idx-entry "struct&nbsp;<link linkend='v4l2-enc-idx-entry'>v4l2_enc_idx_entry</link>"> 145<!ENTITY v4l2-enc-idx-entry "struct&nbsp;<link linkend='v4l2-enc-idx-entry'>v4l2_enc_idx_entry</link>">
143<!ENTITY v4l2-encoder-cmd "struct&nbsp;<link linkend='v4l2-encoder-cmd'>v4l2_encoder_cmd</link>"> 146<!ENTITY v4l2-encoder-cmd "struct&nbsp;<link linkend='v4l2-encoder-cmd'>v4l2_encoder_cmd</link>">
147<!ENTITY v4l2-event "struct&nbsp;<link linkend='v4l2-event'>v4l2_event</link>">
148<!ENTITY v4l2-event-subscription "struct&nbsp;<link linkend='v4l2-event-subscription'>v4l2_event_subscription</link>">
149<!ENTITY v4l2-event-vsync "struct&nbsp;<link linkend='v4l2-event-vsync'>v4l2_event_vsync</link>">
144<!ENTITY v4l2-ext-control "struct&nbsp;<link linkend='v4l2-ext-control'>v4l2_ext_control</link>"> 150<!ENTITY v4l2-ext-control "struct&nbsp;<link linkend='v4l2-ext-control'>v4l2_ext_control</link>">
145<!ENTITY v4l2-ext-controls "struct&nbsp;<link linkend='v4l2-ext-controls'>v4l2_ext_controls</link>"> 151<!ENTITY v4l2-ext-controls "struct&nbsp;<link linkend='v4l2-ext-controls'>v4l2_ext_controls</link>">
146<!ENTITY v4l2-fmtdesc "struct&nbsp;<link linkend='v4l2-fmtdesc'>v4l2_fmtdesc</link>"> 152<!ENTITY v4l2-fmtdesc "struct&nbsp;<link linkend='v4l2-fmtdesc'>v4l2_fmtdesc</link>">
@@ -200,6 +206,7 @@
200<!ENTITY sub-controls SYSTEM "v4l/controls.xml"> 206<!ENTITY sub-controls SYSTEM "v4l/controls.xml">
201<!ENTITY sub-dev-capture SYSTEM "v4l/dev-capture.xml"> 207<!ENTITY sub-dev-capture SYSTEM "v4l/dev-capture.xml">
202<!ENTITY sub-dev-codec SYSTEM "v4l/dev-codec.xml"> 208<!ENTITY sub-dev-codec SYSTEM "v4l/dev-codec.xml">
209<!ENTITY sub-dev-event SYSTEM "v4l/dev-event.xml">
203<!ENTITY sub-dev-effect SYSTEM "v4l/dev-effect.xml"> 210<!ENTITY sub-dev-effect SYSTEM "v4l/dev-effect.xml">
204<!ENTITY sub-dev-osd SYSTEM "v4l/dev-osd.xml"> 211<!ENTITY sub-dev-osd SYSTEM "v4l/dev-osd.xml">
205<!ENTITY sub-dev-output SYSTEM "v4l/dev-output.xml"> 212<!ENTITY sub-dev-output SYSTEM "v4l/dev-output.xml">
@@ -292,6 +299,8 @@
292<!ENTITY sub-v4l2grab-c SYSTEM "v4l/v4l2grab.c.xml"> 299<!ENTITY sub-v4l2grab-c SYSTEM "v4l/v4l2grab.c.xml">
293<!ENTITY sub-videodev2-h SYSTEM "v4l/videodev2.h.xml"> 300<!ENTITY sub-videodev2-h SYSTEM "v4l/videodev2.h.xml">
294<!ENTITY sub-v4l2 SYSTEM "v4l/v4l2.xml"> 301<!ENTITY sub-v4l2 SYSTEM "v4l/v4l2.xml">
302<!ENTITY sub-dqevent SYSTEM "v4l/vidioc-dqevent.xml">
303<!ENTITY sub-subscribe-event SYSTEM "v4l/vidioc-subscribe-event.xml">
295<!ENTITY sub-intro SYSTEM "dvb/intro.xml"> 304<!ENTITY sub-intro SYSTEM "dvb/intro.xml">
296<!ENTITY sub-frontend SYSTEM "dvb/frontend.xml"> 305<!ENTITY sub-frontend SYSTEM "dvb/frontend.xml">
297<!ENTITY sub-dvbproperty SYSTEM "dvb/dvbproperty.xml"> 306<!ENTITY sub-dvbproperty SYSTEM "dvb/dvbproperty.xml">
@@ -381,3 +390,5 @@
381<!ENTITY reqbufs SYSTEM "v4l/vidioc-reqbufs.xml"> 390<!ENTITY reqbufs SYSTEM "v4l/vidioc-reqbufs.xml">
382<!ENTITY s-hw-freq-seek SYSTEM "v4l/vidioc-s-hw-freq-seek.xml"> 391<!ENTITY s-hw-freq-seek SYSTEM "v4l/vidioc-s-hw-freq-seek.xml">
383<!ENTITY streamon SYSTEM "v4l/vidioc-streamon.xml"> 392<!ENTITY streamon SYSTEM "v4l/vidioc-streamon.xml">
393<!ENTITY dqevent SYSTEM "v4l/vidioc-dqevent.xml">
394<!ENTITY subscribe_event SYSTEM "v4l/vidioc-subscribe-event.xml">
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
index 5e7d84b48505..020ac80d4682 100644
--- a/Documentation/DocBook/mtdnand.tmpl
+++ b/Documentation/DocBook/mtdnand.tmpl
@@ -269,7 +269,7 @@ static void board_hwcontrol(struct mtd_info *mtd, int cmd)
269 information about the device. 269 information about the device.
270 </para> 270 </para>
271 <programlisting> 271 <programlisting>
272int __init board_init (void) 272static int __init board_init (void)
273{ 273{
274 struct nand_chip *this; 274 struct nand_chip *this;
275 int err = 0; 275 int err = 0;
@@ -488,7 +488,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
488 The ECC bytes must be placed immidiately after the data 488 The ECC bytes must be placed immidiately after the data
489 bytes in order to make the syndrome generator work. This 489 bytes in order to make the syndrome generator work. This
490 is contrary to the usual layout used by software ECC. The 490 is contrary to the usual layout used by software ECC. The
491 seperation of data and out of band area is not longer 491 separation of data and out of band area is not longer
492 possible. The nand driver code handles this layout and 492 possible. The nand driver code handles this layout and
493 the remaining free bytes in the oob area are managed by 493 the remaining free bytes in the oob area are managed by
494 the autoplacement code. Provide a matching oob-layout 494 the autoplacement code. Provide a matching oob-layout
@@ -560,7 +560,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
560 bad blocks. They have factory marked good blocks. The marker pattern 560 bad blocks. They have factory marked good blocks. The marker pattern
561 is erased when the block is erased to be reused. So in case of 561 is erased when the block is erased to be reused. So in case of
562 powerloss before writing the pattern back to the chip this block 562 powerloss before writing the pattern back to the chip this block
563 would be lost and added to the bad blocks. Therefor we scan the 563 would be lost and added to the bad blocks. Therefore we scan the
564 chip(s) when we detect them the first time for good blocks and 564 chip(s) when we detect them the first time for good blocks and
565 store this information in a bad block table before erasing any 565 store this information in a bad block table before erasing any
566 of the blocks. 566 of the blocks.
@@ -1094,7 +1094,7 @@ in this page</entry>
1094 manufacturers specifications. This applies similar to the spare area. 1094 manufacturers specifications. This applies similar to the spare area.
1095 </para> 1095 </para>
1096 <para> 1096 <para>
1097 Therefor NAND aware filesystems must either write in page size chunks 1097 Therefore NAND aware filesystems must either write in page size chunks
1098 or hold a writebuffer to collect smaller writes until they sum up to 1098 or hold a writebuffer to collect smaller writes until they sum up to
1099 pagesize. Available NAND aware filesystems: JFFS2, YAFFS. 1099 pagesize. Available NAND aware filesystems: JFFS2, YAFFS.
1100 </para> 1100 </para>
diff --git a/Documentation/DocBook/sh.tmpl b/Documentation/DocBook/sh.tmpl
index 0c3dc4c69dd1..d858d92cf6d9 100644
--- a/Documentation/DocBook/sh.tmpl
+++ b/Documentation/DocBook/sh.tmpl
@@ -19,13 +19,17 @@
19 </authorgroup> 19 </authorgroup>
20 20
21 <copyright> 21 <copyright>
22 <year>2008</year> 22 <year>2008-2010</year>
23 <holder>Paul Mundt</holder> 23 <holder>Paul Mundt</holder>
24 </copyright> 24 </copyright>
25 <copyright> 25 <copyright>
26 <year>2008</year> 26 <year>2008-2010</year>
27 <holder>Renesas Technology Corp.</holder> 27 <holder>Renesas Technology Corp.</holder>
28 </copyright> 28 </copyright>
29 <copyright>
30 <year>2010</year>
31 <holder>Renesas Electronics Corp.</holder>
32 </copyright>
29 33
30 <legalnotice> 34 <legalnotice>
31 <para> 35 <para>
@@ -77,7 +81,7 @@
77 </chapter> 81 </chapter>
78 <chapter id="clk"> 82 <chapter id="clk">
79 <title>Clock Framework Extensions</title> 83 <title>Clock Framework Extensions</title>
80!Iarch/sh/include/asm/clock.h 84!Iinclude/linux/sh_clk.h
81 </chapter> 85 </chapter>
82 <chapter id="mach"> 86 <chapter id="mach">
83 <title>Machine Specific Interfaces</title> 87 <title>Machine Specific Interfaces</title>
diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl
index 8bca1d5cec09..e8473eae2a20 100644
--- a/Documentation/DocBook/tracepoint.tmpl
+++ b/Documentation/DocBook/tracepoint.tmpl
@@ -16,6 +16,15 @@
16 </address> 16 </address>
17 </affiliation> 17 </affiliation>
18 </author> 18 </author>
19 <author>
20 <firstname>William</firstname>
21 <surname>Cohen</surname>
22 <affiliation>
23 <address>
24 <email>wcohen@redhat.com</email>
25 </address>
26 </affiliation>
27 </author>
19 </authorgroup> 28 </authorgroup>
20 29
21 <legalnotice> 30 <legalnotice>
@@ -91,4 +100,8 @@
91!Iinclude/trace/events/signal.h 100!Iinclude/trace/events/signal.h
92 </chapter> 101 </chapter>
93 102
103 <chapter id="block">
104 <title>Block IO</title>
105!Iinclude/trace/events/block.h
106 </chapter>
94</book> 107</book>
diff --git a/Documentation/DocBook/v4l/common.xml b/Documentation/DocBook/v4l/common.xml
index c65f0ac9b6ee..cea23e1c4fc6 100644
--- a/Documentation/DocBook/v4l/common.xml
+++ b/Documentation/DocBook/v4l/common.xml
@@ -1170,7 +1170,7 @@ frames per second. If less than this number of frames is to be
1170captured or output, applications can request frame skipping or 1170captured or output, applications can request frame skipping or
1171duplicating on the driver side. This is especially useful when using 1171duplicating on the driver side. This is especially useful when using
1172the &func-read; or &func-write;, which are not augmented by timestamps 1172the &func-read; or &func-write;, which are not augmented by timestamps
1173or sequence counters, and to avoid unneccessary data copying.</para> 1173or sequence counters, and to avoid unnecessary data copying.</para>
1174 1174
1175 <para>Finally these ioctls can be used to determine the number of 1175 <para>Finally these ioctls can be used to determine the number of
1176buffers used internally by a driver in read/write mode. For 1176buffers used internally by a driver in read/write mode. For
diff --git a/Documentation/DocBook/v4l/compat.xml b/Documentation/DocBook/v4l/compat.xml
index b9dbdf9e6d29..b42b935913cd 100644
--- a/Documentation/DocBook/v4l/compat.xml
+++ b/Documentation/DocBook/v4l/compat.xml
@@ -2332,15 +2332,26 @@ more information.</para>
2332 </listitem> 2332 </listitem>
2333 </orderedlist> 2333 </orderedlist>
2334 </section> 2334 </section>
2335 </section> 2335 <section>
2336 <title>V4L2 in Linux 2.6.34</title>
2337 <orderedlist>
2338 <listitem>
2339 <para>Added
2340<constant>V4L2_CID_IRIS_ABSOLUTE</constant> and
2341<constant>V4L2_CID_IRIS_RELATIVE</constant> controls to the
2342 <link linkend="camera-controls">Camera controls class</link>.
2343 </para>
2344 </listitem>
2345 </orderedlist>
2346 </section>
2336 2347
2337 <section id="other"> 2348 <section id="other">
2338 <title>Relation of V4L2 to other Linux multimedia APIs</title> 2349 <title>Relation of V4L2 to other Linux multimedia APIs</title>
2339 2350
2340 <section id="xvideo"> 2351 <section id="xvideo">
2341 <title>X Video Extension</title> 2352 <title>X Video Extension</title>
2342 2353
2343 <para>The X Video Extension (abbreviated XVideo or just Xv) is 2354 <para>The X Video Extension (abbreviated XVideo or just Xv) is
2344an extension of the X Window system, implemented for example by the 2355an extension of the X Window system, implemented for example by the
2345XFree86 project. Its scope is similar to V4L2, an API to video capture 2356XFree86 project. Its scope is similar to V4L2, an API to video capture
2346and output devices for X clients. Xv allows applications to display 2357and output devices for X clients. Xv allows applications to display
@@ -2351,7 +2362,7 @@ capture or output still images in XPixmaps<footnote>
2351extension available across many operating systems and 2362extension available across many operating systems and
2352architectures.</para> 2363architectures.</para>
2353 2364
2354 <para>Because the driver is embedded into the X server Xv has a 2365 <para>Because the driver is embedded into the X server Xv has a
2355number of advantages over the V4L2 <link linkend="overlay">video 2366number of advantages over the V4L2 <link linkend="overlay">video
2356overlay interface</link>. The driver can easily determine the overlay 2367overlay interface</link>. The driver can easily determine the overlay
2357target, &ie; visible graphics memory or off-screen buffers for a 2368target, &ie; visible graphics memory or off-screen buffers for a
@@ -2360,16 +2371,16 @@ overlay, scaling or color-keying, or the clipping functions of the
2360video capture hardware, always in sync with drawing operations or 2371video capture hardware, always in sync with drawing operations or
2361windows moving or changing their stacking order.</para> 2372windows moving or changing their stacking order.</para>
2362 2373
2363 <para>To combine the advantages of Xv and V4L a special Xv 2374 <para>To combine the advantages of Xv and V4L a special Xv
2364driver exists in XFree86 and XOrg, just programming any overlay capable 2375driver exists in XFree86 and XOrg, just programming any overlay capable
2365Video4Linux device it finds. To enable it 2376Video4Linux device it finds. To enable it
2366<filename>/etc/X11/XF86Config</filename> must contain these lines:</para> 2377<filename>/etc/X11/XF86Config</filename> must contain these lines:</para>
2367 <para><screen> 2378 <para><screen>
2368Section "Module" 2379Section "Module"
2369 Load "v4l" 2380 Load "v4l"
2370EndSection</screen></para> 2381EndSection</screen></para>
2371 2382
2372 <para>As of XFree86 4.2 this driver still supports only V4L 2383 <para>As of XFree86 4.2 this driver still supports only V4L
2373ioctls, however it should work just fine with all V4L2 devices through 2384ioctls, however it should work just fine with all V4L2 devices through
2374the V4L2 backward-compatibility layer. Since V4L2 permits multiple 2385the V4L2 backward-compatibility layer. Since V4L2 permits multiple
2375opens it is possible (if supported by the V4L2 driver) to capture 2386opens it is possible (if supported by the V4L2 driver) to capture
@@ -2377,83 +2388,84 @@ video while an X client requested video overlay. Restrictions of
2377simultaneous capturing and overlay are discussed in <xref 2388simultaneous capturing and overlay are discussed in <xref
2378 linkend="overlay" /> apply.</para> 2389 linkend="overlay" /> apply.</para>
2379 2390
2380 <para>Only marginally related to V4L2, XFree86 extended Xv to 2391 <para>Only marginally related to V4L2, XFree86 extended Xv to
2381support hardware YUV to RGB conversion and scaling for faster video 2392support hardware YUV to RGB conversion and scaling for faster video
2382playback, and added an interface to MPEG-2 decoding hardware. This API 2393playback, and added an interface to MPEG-2 decoding hardware. This API
2383is useful to display images captured with V4L2 devices.</para> 2394is useful to display images captured with V4L2 devices.</para>
2384 </section> 2395 </section>
2385 2396
2386 <section> 2397 <section>
2387 <title>Digital Video</title> 2398 <title>Digital Video</title>
2388 2399
2389 <para>V4L2 does not support digital terrestrial, cable or 2400 <para>V4L2 does not support digital terrestrial, cable or
2390satellite broadcast. A separate project aiming at digital receivers 2401satellite broadcast. A separate project aiming at digital receivers
2391exists. You can find its homepage at <ulink 2402exists. You can find its homepage at <ulink
2392url="http://linuxtv.org">http://linuxtv.org</ulink>. The Linux DVB API 2403url="http://linuxtv.org">http://linuxtv.org</ulink>. The Linux DVB API
2393has no connection to the V4L2 API except that drivers for hybrid 2404has no connection to the V4L2 API except that drivers for hybrid
2394hardware may support both.</para> 2405hardware may support both.</para>
2395 </section> 2406 </section>
2396 2407
2397 <section> 2408 <section>
2398 <title>Audio Interfaces</title> 2409 <title>Audio Interfaces</title>
2399 2410
2400 <para>[to do - OSS/ALSA]</para> 2411 <para>[to do - OSS/ALSA]</para>
2412 </section>
2401 </section> 2413 </section>
2402 </section>
2403 2414
2404 <section id="experimental"> 2415 <section id="experimental">
2405 <title>Experimental API Elements</title> 2416 <title>Experimental API Elements</title>
2406 2417
2407 <para>The following V4L2 API elements are currently experimental 2418 <para>The following V4L2 API elements are currently experimental
2408and may change in the future.</para> 2419and may change in the future.</para>
2409 2420
2410 <itemizedlist> 2421 <itemizedlist>
2411 <listitem> 2422 <listitem>
2412 <para>Video Output Overlay (OSD) Interface, <xref 2423 <para>Video Output Overlay (OSD) Interface, <xref
2413 linkend="osd" />.</para> 2424 linkend="osd" />.</para>
2414 </listitem> 2425 </listitem>
2415 <listitem> 2426 <listitem>
2416 <para><constant>V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY</constant>, 2427 <para><constant>V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY</constant>,
2417 &v4l2-buf-type;, <xref linkend="v4l2-buf-type" />.</para> 2428 &v4l2-buf-type;, <xref linkend="v4l2-buf-type" />.</para>
2418 </listitem> 2429 </listitem>
2419 <listitem> 2430 <listitem>
2420 <para><constant>V4L2_CAP_VIDEO_OUTPUT_OVERLAY</constant>, 2431 <para><constant>V4L2_CAP_VIDEO_OUTPUT_OVERLAY</constant>,
2421&VIDIOC-QUERYCAP; ioctl, <xref linkend="device-capabilities" />.</para> 2432&VIDIOC-QUERYCAP; ioctl, <xref linkend="device-capabilities" />.</para>
2422 </listitem> 2433 </listitem>
2423 <listitem> 2434 <listitem>
2424 <para>&VIDIOC-ENUM-FRAMESIZES; and 2435 <para>&VIDIOC-ENUM-FRAMESIZES; and
2425&VIDIOC-ENUM-FRAMEINTERVALS; ioctls.</para> 2436&VIDIOC-ENUM-FRAMEINTERVALS; ioctls.</para>
2426 </listitem> 2437 </listitem>
2427 <listitem> 2438 <listitem>
2428 <para>&VIDIOC-G-ENC-INDEX; ioctl.</para> 2439 <para>&VIDIOC-G-ENC-INDEX; ioctl.</para>
2429 </listitem> 2440 </listitem>
2430 <listitem> 2441 <listitem>
2431 <para>&VIDIOC-ENCODER-CMD; and &VIDIOC-TRY-ENCODER-CMD; 2442 <para>&VIDIOC-ENCODER-CMD; and &VIDIOC-TRY-ENCODER-CMD;
2432ioctls.</para> 2443ioctls.</para>
2433 </listitem> 2444 </listitem>
2434 <listitem> 2445 <listitem>
2435 <para>&VIDIOC-DBG-G-REGISTER; and &VIDIOC-DBG-S-REGISTER; 2446 <para>&VIDIOC-DBG-G-REGISTER; and &VIDIOC-DBG-S-REGISTER;
2436ioctls.</para> 2447ioctls.</para>
2437 </listitem> 2448 </listitem>
2438 <listitem> 2449 <listitem>
2439 <para>&VIDIOC-DBG-G-CHIP-IDENT; ioctl.</para> 2450 <para>&VIDIOC-DBG-G-CHIP-IDENT; ioctl.</para>
2440 </listitem> 2451 </listitem>
2441 </itemizedlist> 2452 </itemizedlist>
2442 </section> 2453 </section>
2443 2454
2444 <section id="obsolete"> 2455 <section id="obsolete">
2445 <title>Obsolete API Elements</title> 2456 <title>Obsolete API Elements</title>
2446 2457
2447 <para>The following V4L2 API elements were superseded by new 2458 <para>The following V4L2 API elements were superseded by new
2448interfaces and should not be implemented in new drivers.</para> 2459interfaces and should not be implemented in new drivers.</para>
2449 2460
2450 <itemizedlist> 2461 <itemizedlist>
2451 <listitem> 2462 <listitem>
2452 <para><constant>VIDIOC_G_MPEGCOMP</constant> and 2463 <para><constant>VIDIOC_G_MPEGCOMP</constant> and
2453<constant>VIDIOC_S_MPEGCOMP</constant> ioctls. Use Extended Controls, 2464<constant>VIDIOC_S_MPEGCOMP</constant> ioctls. Use Extended Controls,
2454<xref linkend="extended-controls" />.</para> 2465<xref linkend="extended-controls" />.</para>
2455 </listitem> 2466 </listitem>
2456 </itemizedlist> 2467 </itemizedlist>
2468 </section>
2457 </section> 2469 </section>
2458 2470
2459 <!-- 2471 <!--
diff --git a/Documentation/DocBook/v4l/controls.xml b/Documentation/DocBook/v4l/controls.xml
index f46450610412..8408caaee276 100644
--- a/Documentation/DocBook/v4l/controls.xml
+++ b/Documentation/DocBook/v4l/controls.xml
@@ -267,6 +267,12 @@ minimum value disables backlight compensation.</entry>
267 <entry>Chroma automatic gain control.</entry> 267 <entry>Chroma automatic gain control.</entry>
268 </row> 268 </row>
269 <row> 269 <row>
270 <entry><constant>V4L2_CID_CHROMA_GAIN</constant></entry>
271 <entry>integer</entry>
272 <entry>Adjusts the Chroma gain control (for use when chroma AGC
273 is disabled).</entry>
274 </row>
275 <row>
270 <entry><constant>V4L2_CID_COLOR_KILLER</constant></entry> 276 <entry><constant>V4L2_CID_COLOR_KILLER</constant></entry>
271 <entry>boolean</entry> 277 <entry>boolean</entry>
272 <entry>Enable the color killer (&ie; force a black &amp; white image in case of a weak video signal).</entry> 278 <entry>Enable the color killer (&ie; force a black &amp; white image in case of a weak video signal).</entry>
@@ -277,8 +283,15 @@ minimum value disables backlight compensation.</entry>
277 <entry>Selects a color effect. Possible values for 283 <entry>Selects a color effect. Possible values for
278<constant>enum v4l2_colorfx</constant> are: 284<constant>enum v4l2_colorfx</constant> are:
279<constant>V4L2_COLORFX_NONE</constant> (0), 285<constant>V4L2_COLORFX_NONE</constant> (0),
280<constant>V4L2_COLORFX_BW</constant> (1) and 286<constant>V4L2_COLORFX_BW</constant> (1),
281<constant>V4L2_COLORFX_SEPIA</constant> (2).</entry> 287<constant>V4L2_COLORFX_SEPIA</constant> (2),
288<constant>V4L2_COLORFX_NEGATIVE</constant> (3),
289<constant>V4L2_COLORFX_EMBOSS</constant> (4),
290<constant>V4L2_COLORFX_SKETCH</constant> (5),
291<constant>V4L2_COLORFX_SKY_BLUE</constant> (6),
292<constant>V4L2_COLORFX_GRASS_GREEN</constant> (7),
293<constant>V4L2_COLORFX_SKIN_WHITEN</constant> (8) and
294<constant>V4L2_COLORFX_VIVID</constant> (9).</entry>
282 </row> 295 </row>
283 <row> 296 <row>
284 <entry><constant>V4L2_CID_ROTATE</constant></entry> 297 <entry><constant>V4L2_CID_ROTATE</constant></entry>
@@ -1825,6 +1838,25 @@ wide-angle direction. The zoom speed unit is driver-specific.</entry>
1825 <row><entry></entry></row> 1838 <row><entry></entry></row>
1826 1839
1827 <row> 1840 <row>
1841 <entry spanname="id"><constant>V4L2_CID_IRIS_ABSOLUTE</constant>&nbsp;</entry>
1842 <entry>integer</entry>
1843 </row><row><entry spanname="descr">This control sets the
1844camera's aperture to the specified value. The unit is undefined.
1845Larger values open the iris wider, smaller values close it.</entry>
1846 </row>
1847 <row><entry></entry></row>
1848
1849 <row>
1850 <entry spanname="id"><constant>V4L2_CID_IRIS_RELATIVE</constant>&nbsp;</entry>
1851 <entry>integer</entry>
1852 </row><row><entry spanname="descr">This control modifies the
1853camera's aperture by the specified amount. The unit is undefined.
1854Positive values open the iris one step further, negative values close
1855it one step further. This is a write-only control.</entry>
1856 </row>
1857 <row><entry></entry></row>
1858
1859 <row>
1828 <entry spanname="id"><constant>V4L2_CID_PRIVACY</constant>&nbsp;</entry> 1860 <entry spanname="id"><constant>V4L2_CID_PRIVACY</constant>&nbsp;</entry>
1829 <entry>boolean</entry> 1861 <entry>boolean</entry>
1830 </row><row><entry spanname="descr">Prevent video from being acquired 1862 </row><row><entry spanname="descr">Prevent video from being acquired
diff --git a/Documentation/DocBook/v4l/dev-event.xml b/Documentation/DocBook/v4l/dev-event.xml
new file mode 100644
index 000000000000..be5a98fb4fab
--- /dev/null
+++ b/Documentation/DocBook/v4l/dev-event.xml
@@ -0,0 +1,31 @@
1 <title>Event Interface</title>
2
3 <para>The V4L2 event interface provides means for user to get
4 immediately notified on certain conditions taking place on a device.
5 This might include start of frame or loss of signal events, for
6 example.
7 </para>
8
9 <para>To receive events, the events the user is interested in first must
10 be subscribed using the &VIDIOC-SUBSCRIBE-EVENT; ioctl. Once an event is
11 subscribed, the events of subscribed types are dequeueable using the
12 &VIDIOC-DQEVENT; ioctl. Events may be unsubscribed using
13 VIDIOC_UNSUBSCRIBE_EVENT ioctl. The special event type V4L2_EVENT_ALL may
14 be used to unsubscribe all the events the driver supports.</para>
15
16 <para>The event subscriptions and event queues are specific to file
17 handles. Subscribing an event on one file handle does not affect
18 other file handles.
19 </para>
20
21 <para>The information on dequeueable events is obtained by using select or
22 poll system calls on video devices. The V4L2 events use POLLPRI events on
23 poll system call and exceptions on select system call. </para>
24
25 <!--
26Local Variables:
27mode: sgml
28sgml-parent-document: "v4l2.sgml"
29indent-tabs-mode: nil
30End:
31 -->
diff --git a/Documentation/DocBook/v4l/io.xml b/Documentation/DocBook/v4l/io.xml
index f92f24323b2a..d424886beda0 100644
--- a/Documentation/DocBook/v4l/io.xml
+++ b/Documentation/DocBook/v4l/io.xml
@@ -589,7 +589,8 @@ number of a video input as in &v4l2-input; field
589 <entry></entry> 589 <entry></entry>
590 <entry>A place holder for future extensions and custom 590 <entry>A place holder for future extensions and custom
591(driver defined) buffer types 591(driver defined) buffer types
592<constant>V4L2_BUF_TYPE_PRIVATE</constant> and higher.</entry> 592<constant>V4L2_BUF_TYPE_PRIVATE</constant> and higher. Applications
593should set this to 0.</entry>
593 </row> 594 </row>
594 </tbody> 595 </tbody>
595 </tgroup> 596 </tgroup>
@@ -701,6 +702,16 @@ They can be both cleared however, then the buffer is in "dequeued"
701state, in the application domain to say so.</entry> 702state, in the application domain to say so.</entry>
702 </row> 703 </row>
703 <row> 704 <row>
705 <entry><constant>V4L2_BUF_FLAG_ERROR</constant></entry>
706 <entry>0x0040</entry>
707 <entry>When this flag is set, the buffer has been dequeued
708 successfully, although the data might have been corrupted.
709 This is recoverable, streaming may continue as normal and
710 the buffer may be reused normally.
711 Drivers set this flag when the <constant>VIDIOC_DQBUF</constant>
712 ioctl is called.</entry>
713 </row>
714 <row>
704 <entry><constant>V4L2_BUF_FLAG_KEYFRAME</constant></entry> 715 <entry><constant>V4L2_BUF_FLAG_KEYFRAME</constant></entry>
705 <entry>0x0008</entry> 716 <entry>0x0008</entry>
706 <entry>Drivers set or clear this flag when calling the 717 <entry>Drivers set or clear this flag when calling the
@@ -917,8 +928,8 @@ order</emphasis>.</para>
917 928
918 <para>When the driver provides or accepts images field by field 929 <para>When the driver provides or accepts images field by field
919rather than interleaved, it is also important applications understand 930rather than interleaved, it is also important applications understand
920how the fields combine to frames. We distinguish between top and 931how the fields combine to frames. We distinguish between top (aka odd) and
921bottom fields, the <emphasis>spatial order</emphasis>: The first line 932bottom (aka even) fields, the <emphasis>spatial order</emphasis>: The first line
922of the top field is the first line of an interlaced frame, the first 933of the top field is the first line of an interlaced frame, the first
923line of the bottom field is the second line of that frame.</para> 934line of the bottom field is the second line of that frame.</para>
924 935
@@ -971,12 +982,12 @@ between <constant>V4L2_FIELD_TOP</constant> and
971 <row> 982 <row>
972 <entry><constant>V4L2_FIELD_TOP</constant></entry> 983 <entry><constant>V4L2_FIELD_TOP</constant></entry>
973 <entry>2</entry> 984 <entry>2</entry>
974 <entry>Images consist of the top field only.</entry> 985 <entry>Images consist of the top (aka odd) field only.</entry>
975 </row> 986 </row>
976 <row> 987 <row>
977 <entry><constant>V4L2_FIELD_BOTTOM</constant></entry> 988 <entry><constant>V4L2_FIELD_BOTTOM</constant></entry>
978 <entry>3</entry> 989 <entry>3</entry>
979 <entry>Images consist of the bottom field only. 990 <entry>Images consist of the bottom (aka even) field only.
980Applications may wish to prevent a device from capturing interlaced 991Applications may wish to prevent a device from capturing interlaced
981images because they will have "comb" or "feathering" artefacts around 992images because they will have "comb" or "feathering" artefacts around
982moving objects.</entry> 993moving objects.</entry>
diff --git a/Documentation/DocBook/v4l/pixfmt.xml b/Documentation/DocBook/v4l/pixfmt.xml
index 885968d6a2fc..c4ad0a8e42dc 100644
--- a/Documentation/DocBook/v4l/pixfmt.xml
+++ b/Documentation/DocBook/v4l/pixfmt.xml
@@ -792,6 +792,18 @@ http://www.thedirks.org/winnov/</ulink></para></entry>
792 <entry>'YYUV'</entry> 792 <entry>'YYUV'</entry>
793 <entry>unknown</entry> 793 <entry>unknown</entry>
794 </row> 794 </row>
795 <row id="V4L2-PIX-FMT-Y4">
796 <entry><constant>V4L2_PIX_FMT_Y4</constant></entry>
797 <entry>'Y04 '</entry>
798 <entry>Old 4-bit greyscale format. Only the least significant 4 bits of each byte are used,
799the other bits are set to 0.</entry>
800 </row>
801 <row id="V4L2-PIX-FMT-Y6">
802 <entry><constant>V4L2_PIX_FMT_Y6</constant></entry>
803 <entry>'Y06 '</entry>
804 <entry>Old 6-bit greyscale format. Only the least significant 6 bits of each byte are used,
805the other bits are set to 0.</entry>
806 </row>
795 </tbody> 807 </tbody>
796 </tgroup> 808 </tgroup>
797 </table> 809 </table>
diff --git a/Documentation/DocBook/v4l/v4l2.xml b/Documentation/DocBook/v4l/v4l2.xml
index 060105af49e5..9737243377a3 100644
--- a/Documentation/DocBook/v4l/v4l2.xml
+++ b/Documentation/DocBook/v4l/v4l2.xml
@@ -401,6 +401,7 @@ and discussions on the V4L mailing list.</revremark>
401 <section id="ttx"> &sub-dev-teletext; </section> 401 <section id="ttx"> &sub-dev-teletext; </section>
402 <section id="radio"> &sub-dev-radio; </section> 402 <section id="radio"> &sub-dev-radio; </section>
403 <section id="rds"> &sub-dev-rds; </section> 403 <section id="rds"> &sub-dev-rds; </section>
404 <section id="event"> &sub-dev-event; </section>
404 </chapter> 405 </chapter>
405 406
406 <chapter id="driver"> 407 <chapter id="driver">
@@ -426,6 +427,7 @@ and discussions on the V4L mailing list.</revremark>
426 &sub-cropcap; 427 &sub-cropcap;
427 &sub-dbg-g-chip-ident; 428 &sub-dbg-g-chip-ident;
428 &sub-dbg-g-register; 429 &sub-dbg-g-register;
430 &sub-dqevent;
429 &sub-encoder-cmd; 431 &sub-encoder-cmd;
430 &sub-enumaudio; 432 &sub-enumaudio;
431 &sub-enumaudioout; 433 &sub-enumaudioout;
@@ -467,6 +469,7 @@ and discussions on the V4L mailing list.</revremark>
467 &sub-reqbufs; 469 &sub-reqbufs;
468 &sub-s-hw-freq-seek; 470 &sub-s-hw-freq-seek;
469 &sub-streamon; 471 &sub-streamon;
472 &sub-subscribe-event;
470 <!-- End of ioctls. --> 473 <!-- End of ioctls. -->
471 &sub-mmap; 474 &sub-mmap;
472 &sub-munmap; 475 &sub-munmap;
diff --git a/Documentation/DocBook/v4l/videodev2.h.xml b/Documentation/DocBook/v4l/videodev2.h.xml
index 068325940658..865b06d9e679 100644
--- a/Documentation/DocBook/v4l/videodev2.h.xml
+++ b/Documentation/DocBook/v4l/videodev2.h.xml
@@ -1018,6 +1018,13 @@ enum <link linkend="v4l2-colorfx">v4l2_colorfx</link> {
1018 V4L2_COLORFX_NONE = 0, 1018 V4L2_COLORFX_NONE = 0,
1019 V4L2_COLORFX_BW = 1, 1019 V4L2_COLORFX_BW = 1,
1020 V4L2_COLORFX_SEPIA = 2, 1020 V4L2_COLORFX_SEPIA = 2,
1021 V4L2_COLORFX_NEGATIVE = 3,
1022 V4L2_COLORFX_EMBOSS = 4,
1023 V4L2_COLORFX_SKETCH = 5,
1024 V4L2_COLORFX_SKY_BLUE = 6,
1025 V4L2_COLORFX_GRASS_GREEN = 7,
1026 V4L2_COLORFX_SKIN_WHITEN = 8,
1027 V4L2_COLORFX_VIVID = 9.
1021}; 1028};
1022#define V4L2_CID_AUTOBRIGHTNESS (V4L2_CID_BASE+32) 1029#define V4L2_CID_AUTOBRIGHTNESS (V4L2_CID_BASE+32)
1023#define V4L2_CID_BAND_STOP_FILTER (V4L2_CID_BASE+33) 1030#define V4L2_CID_BAND_STOP_FILTER (V4L2_CID_BASE+33)
@@ -1271,6 +1278,9 @@ enum <link linkend="v4l2-exposure-auto-type">v4l2_exposure_auto_type</link> {
1271 1278
1272#define V4L2_CID_PRIVACY (V4L2_CID_CAMERA_CLASS_BASE+16) 1279#define V4L2_CID_PRIVACY (V4L2_CID_CAMERA_CLASS_BASE+16)
1273 1280
1281#define V4L2_CID_IRIS_ABSOLUTE (V4L2_CID_CAMERA_CLASS_BASE+17)
1282#define V4L2_CID_IRIS_RELATIVE (V4L2_CID_CAMERA_CLASS_BASE+18)
1283
1274/* FM Modulator class control IDs */ 1284/* FM Modulator class control IDs */
1275#define V4L2_CID_FM_TX_CLASS_BASE (V4L2_CTRL_CLASS_FM_TX | 0x900) 1285#define V4L2_CID_FM_TX_CLASS_BASE (V4L2_CTRL_CLASS_FM_TX | 0x900)
1276#define V4L2_CID_FM_TX_CLASS (V4L2_CTRL_CLASS_FM_TX | 1) 1286#define V4L2_CID_FM_TX_CLASS (V4L2_CTRL_CLASS_FM_TX | 1)
diff --git a/Documentation/DocBook/v4l/vidioc-dqevent.xml b/Documentation/DocBook/v4l/vidioc-dqevent.xml
new file mode 100644
index 000000000000..4e0a7cc30812
--- /dev/null
+++ b/Documentation/DocBook/v4l/vidioc-dqevent.xml
@@ -0,0 +1,131 @@
1<refentry id="vidioc-dqevent">
2 <refmeta>
3 <refentrytitle>ioctl VIDIOC_DQEVENT</refentrytitle>
4 &manvol;
5 </refmeta>
6
7 <refnamediv>
8 <refname>VIDIOC_DQEVENT</refname>
9 <refpurpose>Dequeue event</refpurpose>
10 </refnamediv>
11
12 <refsynopsisdiv>
13 <funcsynopsis>
14 <funcprototype>
15 <funcdef>int <function>ioctl</function></funcdef>
16 <paramdef>int <parameter>fd</parameter></paramdef>
17 <paramdef>int <parameter>request</parameter></paramdef>
18 <paramdef>struct v4l2_event
19*<parameter>argp</parameter></paramdef>
20 </funcprototype>
21 </funcsynopsis>
22 </refsynopsisdiv>
23
24 <refsect1>
25 <title>Arguments</title>
26
27 <variablelist>
28 <varlistentry>
29 <term><parameter>fd</parameter></term>
30 <listitem>
31 <para>&fd;</para>
32 </listitem>
33 </varlistentry>
34 <varlistentry>
35 <term><parameter>request</parameter></term>
36 <listitem>
37 <para>VIDIOC_DQEVENT</para>
38 </listitem>
39 </varlistentry>
40 <varlistentry>
41 <term><parameter>argp</parameter></term>
42 <listitem>
43 <para></para>
44 </listitem>
45 </varlistentry>
46 </variablelist>
47 </refsect1>
48
49 <refsect1>
50 <title>Description</title>
51
52 <para>Dequeue an event from a video device. No input is required
53 for this ioctl. All the fields of the &v4l2-event; structure are
54 filled by the driver. The file handle will also receive exceptions
55 which the application may get by e.g. using the select system
56 call.</para>
57
58 <table frame="none" pgwide="1" id="v4l2-event">
59 <title>struct <structname>v4l2_event</structname></title>
60 <tgroup cols="4">
61 &cs-str;
62 <tbody valign="top">
63 <row>
64 <entry>__u32</entry>
65 <entry><structfield>type</structfield></entry>
66 <entry></entry>
67 <entry>Type of the event.</entry>
68 </row>
69 <row>
70 <entry>union</entry>
71 <entry><structfield>u</structfield></entry>
72 <entry></entry>
73 <entry></entry>
74 </row>
75 <row>
76 <entry></entry>
77 <entry>&v4l2-event-vsync;</entry>
78 <entry><structfield>vsync</structfield></entry>
79 <entry>Event data for event V4L2_EVENT_VSYNC.
80 </entry>
81 </row>
82 <row>
83 <entry></entry>
84 <entry>__u8</entry>
85 <entry><structfield>data</structfield>[64]</entry>
86 <entry>Event data. Defined by the event type. The union
87 should be used to define easily accessible type for
88 events.</entry>
89 </row>
90 <row>
91 <entry>__u32</entry>
92 <entry><structfield>pending</structfield></entry>
93 <entry></entry>
94 <entry>Number of pending events excluding this one.</entry>
95 </row>
96 <row>
97 <entry>__u32</entry>
98 <entry><structfield>sequence</structfield></entry>
99 <entry></entry>
100 <entry>Event sequence number. The sequence number is
101 incremented for every subscribed event that takes place.
102 If sequence numbers are not contiguous it means that
103 events have been lost.
104 </entry>
105 </row>
106 <row>
107 <entry>struct timespec</entry>
108 <entry><structfield>timestamp</structfield></entry>
109 <entry></entry>
110 <entry>Event timestamp.</entry>
111 </row>
112 <row>
113 <entry>__u32</entry>
114 <entry><structfield>reserved</structfield>[9]</entry>
115 <entry></entry>
116 <entry>Reserved for future extensions. Drivers must set
117 the array to zero.</entry>
118 </row>
119 </tbody>
120 </tgroup>
121 </table>
122
123 </refsect1>
124</refentry>
125<!--
126Local Variables:
127mode: sgml
128sgml-parent-document: "v4l2.sgml"
129indent-tabs-mode: nil
130End:
131-->
diff --git a/Documentation/DocBook/v4l/vidioc-enuminput.xml b/Documentation/DocBook/v4l/vidioc-enuminput.xml
index 71b868e2fb8f..476fe1d2bba0 100644
--- a/Documentation/DocBook/v4l/vidioc-enuminput.xml
+++ b/Documentation/DocBook/v4l/vidioc-enuminput.xml
@@ -283,7 +283,7 @@ input/output interface to linux-media@vger.kernel.org on 19 Oct 2009.
283 <entry>This input supports setting DV presets by using VIDIOC_S_DV_PRESET.</entry> 283 <entry>This input supports setting DV presets by using VIDIOC_S_DV_PRESET.</entry>
284 </row> 284 </row>
285 <row> 285 <row>
286 <entry><constant>V4L2_OUT_CAP_CUSTOM_TIMINGS</constant></entry> 286 <entry><constant>V4L2_IN_CAP_CUSTOM_TIMINGS</constant></entry>
287 <entry>0x00000002</entry> 287 <entry>0x00000002</entry>
288 <entry>This input supports setting custom video timings by using VIDIOC_S_DV_TIMINGS.</entry> 288 <entry>This input supports setting custom video timings by using VIDIOC_S_DV_TIMINGS.</entry>
289 </row> 289 </row>
diff --git a/Documentation/DocBook/v4l/vidioc-g-parm.xml b/Documentation/DocBook/v4l/vidioc-g-parm.xml
index 78332d365ce9..392aa9e5571e 100644
--- a/Documentation/DocBook/v4l/vidioc-g-parm.xml
+++ b/Documentation/DocBook/v4l/vidioc-g-parm.xml
@@ -55,7 +55,7 @@ captured or output, applications can request frame skipping or
55duplicating on the driver side. This is especially useful when using 55duplicating on the driver side. This is especially useful when using
56the <function>read()</function> or <function>write()</function>, which 56the <function>read()</function> or <function>write()</function>, which
57are not augmented by timestamps or sequence counters, and to avoid 57are not augmented by timestamps or sequence counters, and to avoid
58unneccessary data copying.</para> 58unnecessary data copying.</para>
59 59
60 <para>Further these ioctls can be used to determine the number of 60 <para>Further these ioctls can be used to determine the number of
61buffers used internally by a driver in read/write mode. For 61buffers used internally by a driver in read/write mode. For
diff --git a/Documentation/DocBook/v4l/vidioc-qbuf.xml b/Documentation/DocBook/v4l/vidioc-qbuf.xml
index 187081778154..ab691ebf3b93 100644
--- a/Documentation/DocBook/v4l/vidioc-qbuf.xml
+++ b/Documentation/DocBook/v4l/vidioc-qbuf.xml
@@ -54,12 +54,10 @@ to enqueue an empty (capturing) or filled (output) buffer in the
54driver's incoming queue. The semantics depend on the selected I/O 54driver's incoming queue. The semantics depend on the selected I/O
55method.</para> 55method.</para>
56 56
57 <para>To enqueue a <link linkend="mmap">memory mapped</link> 57 <para>To enqueue a buffer applications set the <structfield>type</structfield>
58buffer applications set the <structfield>type</structfield> field of a 58field of a &v4l2-buffer; to the same buffer type as was previously used
59&v4l2-buffer; to the same buffer type as previously &v4l2-format; 59with &v4l2-format; <structfield>type</structfield> and &v4l2-requestbuffers;
60<structfield>type</structfield> and &v4l2-requestbuffers; 60<structfield>type</structfield>. Applications must also set the
61<structfield>type</structfield>, the <structfield>memory</structfield>
62field to <constant>V4L2_MEMORY_MMAP</constant> and the
63<structfield>index</structfield> field. Valid index numbers range from 61<structfield>index</structfield> field. Valid index numbers range from
64zero to the number of buffers allocated with &VIDIOC-REQBUFS; 62zero to the number of buffers allocated with &VIDIOC-REQBUFS;
65(&v4l2-requestbuffers; <structfield>count</structfield>) minus one. The 63(&v4l2-requestbuffers; <structfield>count</structfield>) minus one. The
@@ -70,8 +68,19 @@ intended for output (<structfield>type</structfield> is
70<constant>V4L2_BUF_TYPE_VBI_OUTPUT</constant>) applications must also 68<constant>V4L2_BUF_TYPE_VBI_OUTPUT</constant>) applications must also
71initialize the <structfield>bytesused</structfield>, 69initialize the <structfield>bytesused</structfield>,
72<structfield>field</structfield> and 70<structfield>field</structfield> and
73<structfield>timestamp</structfield> fields. See <xref 71<structfield>timestamp</structfield> fields, see <xref
74 linkend="buffer" /> for details. When 72linkend="buffer" /> for details.
73Applications must also set <structfield>flags</structfield> to 0. If a driver
74supports capturing from specific video inputs and you want to specify a video
75input, then <structfield>flags</structfield> should be set to
76<constant>V4L2_BUF_FLAG_INPUT</constant> and the field
77<structfield>input</structfield> must be initialized to the desired input.
78The <structfield>reserved</structfield> field must be set to 0.
79</para>
80
81 <para>To enqueue a <link linkend="mmap">memory mapped</link>
82buffer applications set the <structfield>memory</structfield>
83field to <constant>V4L2_MEMORY_MMAP</constant>. When
75<constant>VIDIOC_QBUF</constant> is called with a pointer to this 84<constant>VIDIOC_QBUF</constant> is called with a pointer to this
76structure the driver sets the 85structure the driver sets the
77<constant>V4L2_BUF_FLAG_MAPPED</constant> and 86<constant>V4L2_BUF_FLAG_MAPPED</constant> and
@@ -81,14 +90,10 @@ structure the driver sets the
81&EINVAL;.</para> 90&EINVAL;.</para>
82 91
83 <para>To enqueue a <link linkend="userp">user pointer</link> 92 <para>To enqueue a <link linkend="userp">user pointer</link>
84buffer applications set the <structfield>type</structfield> field of a 93buffer applications set the <structfield>memory</structfield>
85&v4l2-buffer; to the same buffer type as previously &v4l2-format; 94field to <constant>V4L2_MEMORY_USERPTR</constant>, the
86<structfield>type</structfield> and &v4l2-requestbuffers;
87<structfield>type</structfield>, the <structfield>memory</structfield>
88field to <constant>V4L2_MEMORY_USERPTR</constant> and the
89<structfield>m.userptr</structfield> field to the address of the 95<structfield>m.userptr</structfield> field to the address of the
90buffer and <structfield>length</structfield> to its size. When the 96buffer and <structfield>length</structfield> to its size.
91buffer is intended for output additional fields must be set as above.
92When <constant>VIDIOC_QBUF</constant> is called with a pointer to this 97When <constant>VIDIOC_QBUF</constant> is called with a pointer to this
93structure the driver sets the <constant>V4L2_BUF_FLAG_QUEUED</constant> 98structure the driver sets the <constant>V4L2_BUF_FLAG_QUEUED</constant>
94flag and clears the <constant>V4L2_BUF_FLAG_MAPPED</constant> and 99flag and clears the <constant>V4L2_BUF_FLAG_MAPPED</constant> and
@@ -96,16 +101,21 @@ flag and clears the <constant>V4L2_BUF_FLAG_MAPPED</constant> and
96<structfield>flags</structfield> field, or it returns an error code. 101<structfield>flags</structfield> field, or it returns an error code.
97This ioctl locks the memory pages of the buffer in physical memory, 102This ioctl locks the memory pages of the buffer in physical memory,
98they cannot be swapped out to disk. Buffers remain locked until 103they cannot be swapped out to disk. Buffers remain locked until
99dequeued, until the &VIDIOC-STREAMOFF; or &VIDIOC-REQBUFS; ioctl are 104dequeued, until the &VIDIOC-STREAMOFF; or &VIDIOC-REQBUFS; ioctl is
100called, or until the device is closed.</para> 105called, or until the device is closed.</para>
101 106
102 <para>Applications call the <constant>VIDIOC_DQBUF</constant> 107 <para>Applications call the <constant>VIDIOC_DQBUF</constant>
103ioctl to dequeue a filled (capturing) or displayed (output) buffer 108ioctl to dequeue a filled (capturing) or displayed (output) buffer
104from the driver's outgoing queue. They just set the 109from the driver's outgoing queue. They just set the
105<structfield>type</structfield> and <structfield>memory</structfield> 110<structfield>type</structfield>, <structfield>memory</structfield>
111and <structfield>reserved</structfield>
106fields of a &v4l2-buffer; as above, when <constant>VIDIOC_DQBUF</constant> 112fields of a &v4l2-buffer; as above, when <constant>VIDIOC_DQBUF</constant>
107is called with a pointer to this structure the driver fills the 113is called with a pointer to this structure the driver fills the
108remaining fields or returns an error code.</para> 114remaining fields or returns an error code. The driver may also set
115<constant>V4L2_BUF_FLAG_ERROR</constant> in the <structfield>flags</structfield>
116field. It indicates a non-critical (recoverable) streaming error. In such case
117the application may continue as normal, but should be aware that data in the
118dequeued buffer might be corrupted.</para>
109 119
110 <para>By default <constant>VIDIOC_DQBUF</constant> blocks when no 120 <para>By default <constant>VIDIOC_DQBUF</constant> blocks when no
111buffer is in the outgoing queue. When the 121buffer is in the outgoing queue. When the
@@ -152,7 +162,13 @@ enqueue a user pointer buffer.</para>
152 <para><constant>VIDIOC_DQBUF</constant> failed due to an 162 <para><constant>VIDIOC_DQBUF</constant> failed due to an
153internal error. Can also indicate temporary problems like signal 163internal error. Can also indicate temporary problems like signal
154loss. Note the driver might dequeue an (empty) buffer despite 164loss. Note the driver might dequeue an (empty) buffer despite
155returning an error, or even stop capturing.</para> 165returning an error, or even stop capturing. Reusing such buffer may be unsafe
166though and its details (e.g. <structfield>index</structfield>) may not be
167returned either. It is recommended that drivers indicate recoverable errors
168by setting the <constant>V4L2_BUF_FLAG_ERROR</constant> and returning 0 instead.
169In that case the application should be able to safely reuse the buffer and
170continue streaming.
171 </para>
156 </listitem> 172 </listitem>
157 </varlistentry> 173 </varlistentry>
158 </variablelist> 174 </variablelist>
diff --git a/Documentation/DocBook/v4l/vidioc-querybuf.xml b/Documentation/DocBook/v4l/vidioc-querybuf.xml
index d834993e6191..e649805a4908 100644
--- a/Documentation/DocBook/v4l/vidioc-querybuf.xml
+++ b/Documentation/DocBook/v4l/vidioc-querybuf.xml
@@ -54,12 +54,13 @@ buffer at any time after buffers have been allocated with the
54&VIDIOC-REQBUFS; ioctl.</para> 54&VIDIOC-REQBUFS; ioctl.</para>
55 55
56 <para>Applications set the <structfield>type</structfield> field 56 <para>Applications set the <structfield>type</structfield> field
57 of a &v4l2-buffer; to the same buffer type as previously 57 of a &v4l2-buffer; to the same buffer type as was previously used with
58&v4l2-format; <structfield>type</structfield> and &v4l2-requestbuffers; 58&v4l2-format; <structfield>type</structfield> and &v4l2-requestbuffers;
59<structfield>type</structfield>, and the <structfield>index</structfield> 59<structfield>type</structfield>, and the <structfield>index</structfield>
60 field. Valid index numbers range from zero 60 field. Valid index numbers range from zero
61to the number of buffers allocated with &VIDIOC-REQBUFS; 61to the number of buffers allocated with &VIDIOC-REQBUFS;
62 (&v4l2-requestbuffers; <structfield>count</structfield>) minus one. 62 (&v4l2-requestbuffers; <structfield>count</structfield>) minus one.
63The <structfield>reserved</structfield> field should to set to 0.
63After calling <constant>VIDIOC_QUERYBUF</constant> with a pointer to 64After calling <constant>VIDIOC_QUERYBUF</constant> with a pointer to
64 this structure drivers return an error code or fill the rest of 65 this structure drivers return an error code or fill the rest of
65the structure.</para> 66the structure.</para>
@@ -68,8 +69,8 @@ the structure.</para>
68<constant>V4L2_BUF_FLAG_MAPPED</constant>, 69<constant>V4L2_BUF_FLAG_MAPPED</constant>,
69<constant>V4L2_BUF_FLAG_QUEUED</constant> and 70<constant>V4L2_BUF_FLAG_QUEUED</constant> and
70<constant>V4L2_BUF_FLAG_DONE</constant> flags will be valid. The 71<constant>V4L2_BUF_FLAG_DONE</constant> flags will be valid. The
71<structfield>memory</structfield> field will be set to 72<structfield>memory</structfield> field will be set to the current
72<constant>V4L2_MEMORY_MMAP</constant>, the <structfield>m.offset</structfield> 73I/O method, the <structfield>m.offset</structfield>
73contains the offset of the buffer from the start of the device memory, 74contains the offset of the buffer from the start of the device memory,
74the <structfield>length</structfield> field its size. The driver may 75the <structfield>length</structfield> field its size. The driver may
75or may not set the remaining fields and flags, they are meaningless in 76or may not set the remaining fields and flags, they are meaningless in
diff --git a/Documentation/DocBook/v4l/vidioc-queryctrl.xml b/Documentation/DocBook/v4l/vidioc-queryctrl.xml
index 4876ff1a1a04..8e0e055ac934 100644
--- a/Documentation/DocBook/v4l/vidioc-queryctrl.xml
+++ b/Documentation/DocBook/v4l/vidioc-queryctrl.xml
@@ -325,7 +325,7 @@ should be part of the control documentation.</entry>
325 <entry>n/a</entry> 325 <entry>n/a</entry>
326 <entry>This is not a control. When 326 <entry>This is not a control. When
327<constant>VIDIOC_QUERYCTRL</constant> is called with a control ID 327<constant>VIDIOC_QUERYCTRL</constant> is called with a control ID
328equal to a control class code (see <xref linkend="ctrl-class" />), the 328equal to a control class code (see <xref linkend="ctrl-class" />) + 1, the
329ioctl returns the name of the control class and this control type. 329ioctl returns the name of the control class and this control type.
330Older drivers which do not support this feature return an 330Older drivers which do not support this feature return an
331&EINVAL;.</entry> 331&EINVAL;.</entry>
diff --git a/Documentation/DocBook/v4l/vidioc-reqbufs.xml b/Documentation/DocBook/v4l/vidioc-reqbufs.xml
index bab38084454f..69800ae23348 100644
--- a/Documentation/DocBook/v4l/vidioc-reqbufs.xml
+++ b/Documentation/DocBook/v4l/vidioc-reqbufs.xml
@@ -54,23 +54,23 @@ I/O. Memory mapped buffers are located in device memory and must be
54allocated with this ioctl before they can be mapped into the 54allocated with this ioctl before they can be mapped into the
55application's address space. User buffers are allocated by 55application's address space. User buffers are allocated by
56applications themselves, and this ioctl is merely used to switch the 56applications themselves, and this ioctl is merely used to switch the
57driver into user pointer I/O mode.</para> 57driver into user pointer I/O mode and to setup some internal structures.</para>
58 58
59 <para>To allocate device buffers applications initialize three 59 <para>To allocate device buffers applications initialize all
60fields of a <structname>v4l2_requestbuffers</structname> structure. 60fields of the <structname>v4l2_requestbuffers</structname> structure.
61They set the <structfield>type</structfield> field to the respective 61They set the <structfield>type</structfield> field to the respective
62stream or buffer type, the <structfield>count</structfield> field to 62stream or buffer type, the <structfield>count</structfield> field to
63the desired number of buffers, and <structfield>memory</structfield> 63the desired number of buffers, <structfield>memory</structfield>
64must be set to <constant>V4L2_MEMORY_MMAP</constant>. When the ioctl 64must be set to the requested I/O method and the <structfield>reserved</structfield> array
65is called with a pointer to this structure the driver attempts to 65must be zeroed. When the ioctl
66allocate the requested number of buffers and stores the actual number 66is called with a pointer to this structure the driver will attempt to allocate
67the requested number of buffers and it stores the actual number
67allocated in the <structfield>count</structfield> field. It can be 68allocated in the <structfield>count</structfield> field. It can be
68smaller than the number requested, even zero, when the driver runs out 69smaller than the number requested, even zero, when the driver runs out
69of free memory. A larger number is possible when the driver requires 70of free memory. A larger number is also possible when the driver requires
70more buffers to function correctly.<footnote> 71more buffers to function correctly. For example video output requires at least two buffers,
71 <para>For example video output requires at least two buffers,
72one displayed and one filled by the application.</para> 72one displayed and one filled by the application.</para>
73 </footnote> When memory mapping I/O is not supported the ioctl 73 <para>When the I/O method is not supported the ioctl
74returns an &EINVAL;.</para> 74returns an &EINVAL;.</para>
75 75
76 <para>Applications can call <constant>VIDIOC_REQBUFS</constant> 76 <para>Applications can call <constant>VIDIOC_REQBUFS</constant>
@@ -81,14 +81,6 @@ in progress, an implicit &VIDIOC-STREAMOFF;. <!-- mhs: I see no
81reason why munmap()ping one or even all buffers must imply 81reason why munmap()ping one or even all buffers must imply
82streamoff.--></para> 82streamoff.--></para>
83 83
84 <para>To negotiate user pointer I/O, applications initialize only
85the <structfield>type</structfield> field and set
86<structfield>memory</structfield> to
87<constant>V4L2_MEMORY_USERPTR</constant>. When the ioctl is called
88with a pointer to this structure the driver prepares for user pointer
89I/O, when this I/O method is not supported the ioctl returns an
90&EINVAL;.</para>
91
92 <table pgwide="1" frame="none" id="v4l2-requestbuffers"> 84 <table pgwide="1" frame="none" id="v4l2-requestbuffers">
93 <title>struct <structname>v4l2_requestbuffers</structname></title> 85 <title>struct <structname>v4l2_requestbuffers</structname></title>
94 <tgroup cols="3"> 86 <tgroup cols="3">
@@ -97,9 +89,7 @@ I/O, when this I/O method is not supported the ioctl returns an
97 <row> 89 <row>
98 <entry>__u32</entry> 90 <entry>__u32</entry>
99 <entry><structfield>count</structfield></entry> 91 <entry><structfield>count</structfield></entry>
100 <entry>The number of buffers requested or granted. This 92 <entry>The number of buffers requested or granted.</entry>
101field is only used when <structfield>memory</structfield> is set to
102<constant>V4L2_MEMORY_MMAP</constant>.</entry>
103 </row> 93 </row>
104 <row> 94 <row>
105 <entry>&v4l2-buf-type;</entry> 95 <entry>&v4l2-buf-type;</entry>
@@ -120,7 +110,7 @@ as the &v4l2-format; <structfield>type</structfield> field. See <xref
120 <entry><structfield>reserved</structfield>[2]</entry> 110 <entry><structfield>reserved</structfield>[2]</entry>
121 <entry>A place holder for future extensions and custom 111 <entry>A place holder for future extensions and custom
122(driver defined) buffer types <constant>V4L2_BUF_TYPE_PRIVATE</constant> and 112(driver defined) buffer types <constant>V4L2_BUF_TYPE_PRIVATE</constant> and
123higher.</entry> 113higher. This array should be zeroed by applications.</entry>
124 </row> 114 </row>
125 </tbody> 115 </tbody>
126 </tgroup> 116 </tgroup>
diff --git a/Documentation/DocBook/v4l/vidioc-subscribe-event.xml b/Documentation/DocBook/v4l/vidioc-subscribe-event.xml
new file mode 100644
index 000000000000..8b501791aa68
--- /dev/null
+++ b/Documentation/DocBook/v4l/vidioc-subscribe-event.xml
@@ -0,0 +1,133 @@
1<refentry id="vidioc-subscribe-event">
2 <refmeta>
3 <refentrytitle>ioctl VIDIOC_SUBSCRIBE_EVENT, VIDIOC_UNSUBSCRIBE_EVENT</refentrytitle>
4 &manvol;
5 </refmeta>
6
7 <refnamediv>
8 <refname>VIDIOC_SUBSCRIBE_EVENT, VIDIOC_UNSUBSCRIBE_EVENT</refname>
9 <refpurpose>Subscribe or unsubscribe event</refpurpose>
10 </refnamediv>
11
12 <refsynopsisdiv>
13 <funcsynopsis>
14 <funcprototype>
15 <funcdef>int <function>ioctl</function></funcdef>
16 <paramdef>int <parameter>fd</parameter></paramdef>
17 <paramdef>int <parameter>request</parameter></paramdef>
18 <paramdef>struct v4l2_event_subscription
19*<parameter>argp</parameter></paramdef>
20 </funcprototype>
21 </funcsynopsis>
22 </refsynopsisdiv>
23
24 <refsect1>
25 <title>Arguments</title>
26
27 <variablelist>
28 <varlistentry>
29 <term><parameter>fd</parameter></term>
30 <listitem>
31 <para>&fd;</para>
32 </listitem>
33 </varlistentry>
34 <varlistentry>
35 <term><parameter>request</parameter></term>
36 <listitem>
37 <para>VIDIOC_SUBSCRIBE_EVENT, VIDIOC_UNSUBSCRIBE_EVENT</para>
38 </listitem>
39 </varlistentry>
40 <varlistentry>
41 <term><parameter>argp</parameter></term>
42 <listitem>
43 <para></para>
44 </listitem>
45 </varlistentry>
46 </variablelist>
47 </refsect1>
48
49 <refsect1>
50 <title>Description</title>
51
52 <para>Subscribe or unsubscribe V4L2 event. Subscribed events are
53 dequeued by using the &VIDIOC-DQEVENT; ioctl.</para>
54
55 <table frame="none" pgwide="1" id="v4l2-event-subscription">
56 <title>struct <structname>v4l2_event_subscription</structname></title>
57 <tgroup cols="3">
58 &cs-str;
59 <tbody valign="top">
60 <row>
61 <entry>__u32</entry>
62 <entry><structfield>type</structfield></entry>
63 <entry>Type of the event.</entry>
64 </row>
65 <row>
66 <entry>__u32</entry>
67 <entry><structfield>reserved</structfield>[7]</entry>
68 <entry>Reserved for future extensions. Drivers and applications
69 must set the array to zero.</entry>
70 </row>
71 </tbody>
72 </tgroup>
73 </table>
74
75 <table frame="none" pgwide="1" id="event-type">
76 <title>Event Types</title>
77 <tgroup cols="3">
78 &cs-def;
79 <tbody valign="top">
80 <row>
81 <entry><constant>V4L2_EVENT_ALL</constant></entry>
82 <entry>0</entry>
83 <entry>All events. V4L2_EVENT_ALL is valid only for
84 VIDIOC_UNSUBSCRIBE_EVENT for unsubscribing all events at once.
85 </entry>
86 </row>
87 <row>
88 <entry><constant>V4L2_EVENT_VSYNC</constant></entry>
89 <entry>1</entry>
90 <entry>This event is triggered on the vertical sync.
91 This event has &v4l2-event-vsync; associated with it.
92 </entry>
93 </row>
94 <row>
95 <entry><constant>V4L2_EVENT_EOS</constant></entry>
96 <entry>2</entry>
97 <entry>This event is triggered when the end of a stream is reached.
98 This is typically used with MPEG decoders to report to the application
99 when the last of the MPEG stream has been decoded.
100 </entry>
101 </row>
102 <row>
103 <entry><constant>V4L2_EVENT_PRIVATE_START</constant></entry>
104 <entry>0x08000000</entry>
105 <entry>Base event number for driver-private events.</entry>
106 </row>
107 </tbody>
108 </tgroup>
109 </table>
110
111 <table frame="none" pgwide="1" id="v4l2-event-vsync">
112 <title>struct <structname>v4l2_event_vsync</structname></title>
113 <tgroup cols="3">
114 &cs-str;
115 <tbody valign="top">
116 <row>
117 <entry>__u8</entry>
118 <entry><structfield>field</structfield></entry>
119 <entry>The upcoming field. See &v4l2-field;.</entry>
120 </row>
121 </tbody>
122 </tgroup>
123 </table>
124
125 </refsect1>
126</refentry>
127<!--
128Local Variables:
129mode: sgml
130sgml-parent-document: "v4l2.sgml"
131indent-tabs-mode: nil
132End:
133-->
diff --git a/Documentation/DocBook/writing-an-alsa-driver.tmpl b/Documentation/DocBook/writing-an-alsa-driver.tmpl
index 0d0f7b4d4b1a..0ba149de2608 100644
--- a/Documentation/DocBook/writing-an-alsa-driver.tmpl
+++ b/Documentation/DocBook/writing-an-alsa-driver.tmpl
@@ -5518,34 +5518,41 @@ struct _snd_pcm_runtime {
5518]]> 5518]]>
5519 </programlisting> 5519 </programlisting>
5520 </informalexample> 5520 </informalexample>
5521
5522 For the raw data, <structfield>size</structfield> field must be
5523 set properly. This specifies the maximum size of the proc file access.
5521 </para> 5524 </para>
5522 5525
5523 <para> 5526 <para>
5524 The callback is much more complicated than the text-file 5527 The read/write callbacks of raw mode are more direct than the text mode.
5525 version. You need to use a low-level I/O functions such as 5528 You need to use a low-level I/O functions such as
5526 <function>copy_from/to_user()</function> to transfer the 5529 <function>copy_from/to_user()</function> to transfer the
5527 data. 5530 data.
5528 5531
5529 <informalexample> 5532 <informalexample>
5530 <programlisting> 5533 <programlisting>
5531<![CDATA[ 5534<![CDATA[
5532 static long my_file_io_read(struct snd_info_entry *entry, 5535 static ssize_t my_file_io_read(struct snd_info_entry *entry,
5533 void *file_private_data, 5536 void *file_private_data,
5534 struct file *file, 5537 struct file *file,
5535 char *buf, 5538 char *buf,
5536 unsigned long count, 5539 size_t count,
5537 unsigned long pos) 5540 loff_t pos)
5538 { 5541 {
5539 long size = count; 5542 if (copy_to_user(buf, local_data + pos, count))
5540 if (pos + size > local_max_size)
5541 size = local_max_size - pos;
5542 if (copy_to_user(buf, local_data + pos, size))
5543 return -EFAULT; 5543 return -EFAULT;
5544 return size; 5544 return count;
5545 } 5545 }
5546]]> 5546]]>
5547 </programlisting> 5547 </programlisting>
5548 </informalexample> 5548 </informalexample>
5549
5550 If the size of the info entry has been set up properly,
5551 <structfield>count</structfield> and <structfield>pos</structfield> are
5552 guaranteed to fit within 0 and the given size.
5553 You don't have to check the range in the callbacks unless any
5554 other condition is required.
5555
5549 </para> 5556 </para>
5550 5557
5551 </chapter> 5558 </chapter>
diff --git a/Documentation/DocBook/writing_usb_driver.tmpl b/Documentation/DocBook/writing_usb_driver.tmpl
index eeff19ca831b..bd97a13fa5ae 100644
--- a/Documentation/DocBook/writing_usb_driver.tmpl
+++ b/Documentation/DocBook/writing_usb_driver.tmpl
@@ -342,7 +342,7 @@ static inline void skel_delete (struct usb_skel *dev)
342{ 342{
343 kfree (dev->bulk_in_buffer); 343 kfree (dev->bulk_in_buffer);
344 if (dev->bulk_out_buffer != NULL) 344 if (dev->bulk_out_buffer != NULL)
345 usb_buffer_free (dev->udev, dev->bulk_out_size, 345 usb_free_coherent (dev->udev, dev->bulk_out_size,
346 dev->bulk_out_buffer, 346 dev->bulk_out_buffer,
347 dev->write_urb->transfer_dma); 347 dev->write_urb->transfer_dma);
348 usb_free_urb (dev->write_urb); 348 usb_free_urb (dev->write_urb);
diff --git a/Documentation/HOWTO b/Documentation/HOWTO
index 8495fc970391..40ada93b820a 100644
--- a/Documentation/HOWTO
+++ b/Documentation/HOWTO
@@ -221,8 +221,8 @@ branches. These different branches are:
221 - main 2.6.x kernel tree 221 - main 2.6.x kernel tree
222 - 2.6.x.y -stable kernel tree 222 - 2.6.x.y -stable kernel tree
223 - 2.6.x -git kernel patches 223 - 2.6.x -git kernel patches
224 - 2.6.x -mm kernel patches
225 - subsystem specific kernel trees and patches 224 - subsystem specific kernel trees and patches
225 - the 2.6.x -next kernel tree for integration tests
226 226
2272.6.x kernel tree 2272.6.x kernel tree
228----------------- 228-----------------
@@ -232,9 +232,9 @@ process is as follows:
232 - As soon as a new kernel is released a two weeks window is open, 232 - As soon as a new kernel is released a two weeks window is open,
233 during this period of time maintainers can submit big diffs to 233 during this period of time maintainers can submit big diffs to
234 Linus, usually the patches that have already been included in the 234 Linus, usually the patches that have already been included in the
235 -mm kernel for a few weeks. The preferred way to submit big changes 235 -next kernel for a few weeks. The preferred way to submit big changes
236 is using git (the kernel's source management tool, more information 236 is using git (the kernel's source management tool, more information
237 can be found at http://git.or.cz/) but plain patches are also just 237 can be found at http://git-scm.com/) but plain patches are also just
238 fine. 238 fine.
239 - After two weeks a -rc1 kernel is released it is now possible to push 239 - After two weeks a -rc1 kernel is released it is now possible to push
240 only patches that do not include new features that could affect the 240 only patches that do not include new features that could affect the
@@ -293,84 +293,43 @@ daily and represent the current state of Linus' tree. They are more
293experimental than -rc kernels since they are generated automatically 293experimental than -rc kernels since they are generated automatically
294without even a cursory glance to see if they are sane. 294without even a cursory glance to see if they are sane.
295 295
2962.6.x -mm kernel patches
297------------------------
298These are experimental kernel patches released by Andrew Morton. Andrew
299takes all of the different subsystem kernel trees and patches and mushes
300them together, along with a lot of patches that have been plucked from
301the linux-kernel mailing list. This tree serves as a proving ground for
302new features and patches. Once a patch has proved its worth in -mm for
303a while Andrew or the subsystem maintainer pushes it on to Linus for
304inclusion in mainline.
305
306It is heavily encouraged that all new patches get tested in the -mm tree
307before they are sent to Linus for inclusion in the main kernel tree. Code
308which does not make an appearance in -mm before the opening of the merge
309window will prove hard to merge into the mainline.
310
311These kernels are not appropriate for use on systems that are supposed
312to be stable and they are more risky to run than any of the other
313branches.
314
315If you wish to help out with the kernel development process, please test
316and use these kernel releases and provide feedback to the linux-kernel
317mailing list if you have any problems, and if everything works properly.
318
319In addition to all the other experimental patches, these kernels usually
320also contain any changes in the mainline -git kernels available at the
321time of release.
322
323The -mm kernels are not released on a fixed schedule, but usually a few
324-mm kernels are released in between each -rc kernel (1 to 3 is common).
325
326Subsystem Specific kernel trees and patches 296Subsystem Specific kernel trees and patches
327------------------------------------------- 297-------------------------------------------
328A number of the different kernel subsystem developers expose their 298The maintainers of the various kernel subsystems --- and also many
329development trees so that others can see what is happening in the 299kernel subsystem developers --- expose their current state of
330different areas of the kernel. These trees are pulled into the -mm 300development in source repositories. That way, others can see what is
331kernel releases as described above. 301happening in the different areas of the kernel. In areas where
332 302development is rapid, a developer may be asked to base his submissions
333Here is a list of some of the different kernel trees available: 303onto such a subsystem kernel tree so that conflicts between the
334 git trees: 304submission and other already ongoing work are avoided.
335 - Kbuild development tree, Sam Ravnborg <sam@ravnborg.org> 305
336 git.kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git 306Most of these repositories are git trees, but there are also other SCMs
337 307in use, or patch queues being published as quilt series. Addresses of
338 - ACPI development tree, Len Brown <len.brown@intel.com> 308these subsystem repositories are listed in the MAINTAINERS file. Many
339 git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git 309of them can be browsed at http://git.kernel.org/.
340 310
341 - Block development tree, Jens Axboe <jens.axboe@oracle.com> 311Before a proposed patch is committed to such a subsystem tree, it is
342 git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git 312subject to review which primarily happens on mailing lists (see the
343 313respective section below). For several kernel subsystems, this review
344 - DRM development tree, Dave Airlie <airlied@linux.ie> 314process is tracked with the tool patchwork. Patchwork offers a web
345 git.kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git 315interface which shows patch postings, any comments on a patch or
346 316revisions to it, and maintainers can mark patches as under review,
347 - ia64 development tree, Tony Luck <tony.luck@intel.com> 317accepted, or rejected. Most of these patchwork sites are listed at
348 git.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git 318http://patchwork.kernel.org/ or http://patchwork.ozlabs.org/.
349 319
350 - infiniband, Roland Dreier <rolandd@cisco.com> 3202.6.x -next kernel tree for integration tests
351 git.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git 321---------------------------------------------
352 322Before updates from subsystem trees are merged into the mainline 2.6.x
353 - libata, Jeff Garzik <jgarzik@pobox.com> 323tree, they need to be integration-tested. For this purpose, a special
354 git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git 324testing repository exists into which virtually all subsystem trees are
355 325pulled on an almost daily basis:
356 - network drivers, Jeff Garzik <jgarzik@pobox.com> 326 http://git.kernel.org/?p=linux/kernel/git/sfr/linux-next.git
357 git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git 327 http://linux.f-seidel.de/linux-next/pmwiki/
358 328
359 - pcmcia, Dominik Brodowski <linux@dominikbrodowski.net> 329This way, the -next kernel gives a summary outlook onto what will be
360 git.kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git 330expected to go into the mainline kernel at the next merge period.
361 331Adventurous testers are very welcome to runtime-test the -next kernel.
362 - SCSI, James Bottomley <James.Bottomley@hansenpartnership.com>
363 git.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git
364
365 - x86, Ingo Molnar <mingo@elte.hu>
366 git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86.git
367
368 quilt trees:
369 - USB, Driver Core, and I2C, Greg Kroah-Hartman <gregkh@suse.de>
370 kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
371 332
372 Other kernel trees can be found listed at http://git.kernel.org/ and in
373 the MAINTAINERS file.
374 333
375Bug Reporting 334Bug Reporting
376------------- 335-------------
diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt
index bc38283379f0..69dd29ed824e 100644
--- a/Documentation/IPMI.txt
+++ b/Documentation/IPMI.txt
@@ -365,6 +365,7 @@ You can change this at module load time (for a module) with:
365 regshifts=<shift1>,<shift2>,... 365 regshifts=<shift1>,<shift2>,...
366 slave_addrs=<addr1>,<addr2>,... 366 slave_addrs=<addr1>,<addr2>,...
367 force_kipmid=<enable1>,<enable2>,... 367 force_kipmid=<enable1>,<enable2>,...
368 kipmid_max_busy_us=<ustime1>,<ustime2>,...
368 unload_when_empty=[0|1] 369 unload_when_empty=[0|1]
369 370
370Each of these except si_trydefaults is a list, the first item for the 371Each of these except si_trydefaults is a list, the first item for the
@@ -433,6 +434,7 @@ kernel command line as:
433 ipmi_si.regshifts=<shift1>,<shift2>,... 434 ipmi_si.regshifts=<shift1>,<shift2>,...
434 ipmi_si.slave_addrs=<addr1>,<addr2>,... 435 ipmi_si.slave_addrs=<addr1>,<addr2>,...
435 ipmi_si.force_kipmid=<enable1>,<enable2>,... 436 ipmi_si.force_kipmid=<enable1>,<enable2>,...
437 ipmi_si.kipmid_max_busy_us=<ustime1>,<ustime2>,...
436 438
437It works the same as the module parameters of the same names. 439It works the same as the module parameters of the same names.
438 440
@@ -450,6 +452,16 @@ force this thread on or off. If you force it off and don't have
450interrupts, the driver will run VERY slowly. Don't blame me, 452interrupts, the driver will run VERY slowly. Don't blame me,
451these interfaces suck. 453these interfaces suck.
452 454
455Unfortunately, this thread can use a lot of CPU depending on the
456interface's performance. This can waste a lot of CPU and cause
457various issues with detecting idle CPU and using extra power. To
458avoid this, the kipmid_max_busy_us sets the maximum amount of time, in
459microseconds, that kipmid will spin before sleeping for a tick. This
460value sets a balance between performance and CPU waste and needs to be
461tuned to your needs. Maybe, someday, auto-tuning will be added, but
462that's not a simple thing and even the auto-tuning would need to be
463tuned to the user's desired performance.
464
453The driver supports a hot add and remove of interfaces. This way, 465The driver supports a hot add and remove of interfaces. This way,
454interfaces can be added or removed after the kernel is up and running. 466interfaces can be added or removed after the kernel is up and running.
455This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a 467This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a
diff --git a/Documentation/Makefile b/Documentation/Makefile
index 94b945733534..6fc7ea1d1f9d 100644
--- a/Documentation/Makefile
+++ b/Documentation/Makefile
@@ -1,3 +1,3 @@
1obj-m := DocBook/ accounting/ auxdisplay/ connector/ \ 1obj-m := DocBook/ accounting/ auxdisplay/ connector/ \
2 filesystems/configfs/ ia64/ networking/ \ 2 filesystems/ filesystems/configfs/ ia64/ laptops/ networking/ \
3 pcmcia/ spi/ video4linux/ vm/ watchdog/src/ 3 pcmcia/ spi/ timers/ video4linux/ vm/ watchdog/src/
diff --git a/Documentation/PCI/pci-error-recovery.txt b/Documentation/PCI/pci-error-recovery.txt
index e83f2ea76415..898ded24510d 100644
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -216,7 +216,7 @@ The driver should return one of the following result codes:
216 216
217 - PCI_ERS_RESULT_NEED_RESET 217 - PCI_ERS_RESULT_NEED_RESET
218 Driver returns this if it thinks the device is not 218 Driver returns this if it thinks the device is not
219 recoverable in it's current state and it needs a slot 219 recoverable in its current state and it needs a slot
220 reset to proceed. 220 reset to proceed.
221 221
222 - PCI_ERS_RESULT_DISCONNECT 222 - PCI_ERS_RESULT_DISCONNECT
@@ -241,7 +241,7 @@ in working condition.
241 241
242The driver is not supposed to restart normal driver I/O operations 242The driver is not supposed to restart normal driver I/O operations
243at this point. It should limit itself to "probing" the device to 243at this point. It should limit itself to "probing" the device to
244check it's recoverability status. If all is right, then the platform 244check its recoverability status. If all is right, then the platform
245will call resume() once all drivers have ack'd link_reset(). 245will call resume() once all drivers have ack'd link_reset().
246 246
247 Result codes: 247 Result codes:
diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt
index be21001ab144..26d3d945c3c2 100644
--- a/Documentation/PCI/pcieaer-howto.txt
+++ b/Documentation/PCI/pcieaer-howto.txt
@@ -13,7 +13,7 @@ Reporting (AER) driver and provides information on how to use it, as
13well as how to enable the drivers of endpoint devices to conform with 13well as how to enable the drivers of endpoint devices to conform with
14PCI Express AER driver. 14PCI Express AER driver.
15 15
161.2 Copyright © Intel Corporation 2006. 161.2 Copyright (C) Intel Corporation 2006.
17 17
181.3 What is the PCI Express AER Driver? 181.3 What is the PCI Express AER Driver?
19 19
@@ -71,15 +71,11 @@ console. If it's a correctable error, it is outputed as a warning.
71Otherwise, it is printed as an error. So users could choose different 71Otherwise, it is printed as an error. So users could choose different
72log level to filter out correctable error messages. 72log level to filter out correctable error messages.
73 73
74Below shows an example. 74Below shows an example:
75+------ PCI-Express Device Error -----+ 750000:50:00.0: PCIe Bus Error: severity=Uncorrected (Fatal), type=Transaction Layer, id=0500(Requester ID)
76Error Severity : Uncorrected (Fatal) 760000:50:00.0: device [8086:0329] error status/mask=00100000/00000000
77PCIE Bus Error type : Transaction Layer 770000:50:00.0: [20] Unsupported Request (First)
78Unsupported Request : First 780000:50:00.0: TLP Header: 04000001 00200a03 05010000 00050100
79Requester ID : 0500
80VendorID=8086h, DeviceID=0329h, Bus=05h, Device=00h, Function=00h
81TLB Header:
8204000001 00200a03 05010000 00050100
83 79
84In the example, 'Requester ID' means the ID of the device who sends 80In the example, 'Requester ID' means the ID of the device who sends
85the error message to root port. Pls. refer to pci express specs for 81the error message to root port. Pls. refer to pci express specs for
@@ -112,7 +108,7 @@ but the PCI Express link itself is fully functional. Fatal errors, on
112the other hand, cause the link to be unreliable. 108the other hand, cause the link to be unreliable.
113 109
114When AER is enabled, a PCI Express device will automatically send an 110When AER is enabled, a PCI Express device will automatically send an
115error message to the PCIE root port above it when the device captures 111error message to the PCIe root port above it when the device captures
116an error. The Root Port, upon receiving an error reporting message, 112an error. The Root Port, upon receiving an error reporting message,
117internally processes and logs the error message in its PCI Express 113internally processes and logs the error message in its PCI Express
118capability structure. Error information being logged includes storing 114capability structure. Error information being logged includes storing
@@ -198,8 +194,9 @@ to reset link, AER port service driver is required to provide the
198function to reset link. Firstly, kernel looks for if the upstream 194function to reset link. Firstly, kernel looks for if the upstream
199component has an aer driver. If it has, kernel uses the reset_link 195component has an aer driver. If it has, kernel uses the reset_link
200callback of the aer driver. If the upstream component has no aer driver 196callback of the aer driver. If the upstream component has no aer driver
201and the port is downstream port, we will use the aer driver of the 197and the port is downstream port, we will perform a hot reset as the
202root port who reports the AER error. As for upstream ports, 198default by setting the Secondary Bus Reset bit of the Bridge Control
199register associated with the downstream port. As for upstream ports,
203they should provide their own aer service drivers with reset_link 200they should provide their own aer service drivers with reset_link
204function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and 201function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and
205reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes 202reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes
@@ -253,11 +250,11 @@ cleanup uncorrectable status register. Pls. refer to section 3.3.
253 250
2544. Software error injection 2514. Software error injection
255 252
256Debugging PCIE AER error recovery code is quite difficult because it 253Debugging PCIe AER error recovery code is quite difficult because it
257is hard to trigger real hardware errors. Software based error 254is hard to trigger real hardware errors. Software based error
258injection can be used to fake various kinds of PCIE errors. 255injection can be used to fake various kinds of PCIe errors.
259 256
260First you should enable PCIE AER software error injection in kernel 257First you should enable PCIe AER software error injection in kernel
261configuration, that is, following item should be in your .config. 258configuration, that is, following item should be in your .config.
262 259
263CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m 260CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
index 9bb62f7b89c3..71b6f500ddb9 100644
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -6,16 +6,22 @@ checklist.txt
6 - Review Checklist for RCU Patches 6 - Review Checklist for RCU Patches
7listRCU.txt 7listRCU.txt
8 - Using RCU to Protect Read-Mostly Linked Lists 8 - Using RCU to Protect Read-Mostly Linked Lists
9lockdep.txt
10 - RCU and lockdep checking
9NMI-RCU.txt 11NMI-RCU.txt
10 - Using RCU to Protect Dynamic NMI Handlers 12 - Using RCU to Protect Dynamic NMI Handlers
13rcubarrier.txt
14 - RCU and Unloadable Modules
15rculist_nulls.txt
16 - RCU list primitives for use with SLAB_DESTROY_BY_RCU
11rcuref.txt 17rcuref.txt
12 - Reference-count design for elements of lists/arrays protected by RCU 18 - Reference-count design for elements of lists/arrays protected by RCU
13rcu.txt 19rcu.txt
14 - RCU Concepts 20 - RCU Concepts
15rcubarrier.txt
16 - Unloading modules that use RCU callbacks
17RTFP.txt 21RTFP.txt
18 - List of RCU papers (bibliography) going back to 1980. 22 - List of RCU papers (bibliography) going back to 1980.
23stallwarn.txt
24 - RCU CPU stall warnings (CONFIG_RCU_CPU_STALL_DETECTOR)
19torture.txt 25torture.txt
20 - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST) 26 - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
21trace.txt 27trace.txt
diff --git a/Documentation/RCU/NMI-RCU.txt b/Documentation/RCU/NMI-RCU.txt
index a6d32e65d222..a8536cb88091 100644
--- a/Documentation/RCU/NMI-RCU.txt
+++ b/Documentation/RCU/NMI-RCU.txt
@@ -34,7 +34,7 @@ NMI handler.
34 cpu = smp_processor_id(); 34 cpu = smp_processor_id();
35 ++nmi_count(cpu); 35 ++nmi_count(cpu);
36 36
37 if (!rcu_dereference(nmi_callback)(regs, cpu)) 37 if (!rcu_dereference_sched(nmi_callback)(regs, cpu))
38 default_do_nmi(regs); 38 default_do_nmi(regs);
39 39
40 nmi_exit(); 40 nmi_exit();
@@ -47,12 +47,13 @@ function pointer. If this handler returns zero, do_nmi() invokes the
47default_do_nmi() function to handle a machine-specific NMI. Finally, 47default_do_nmi() function to handle a machine-specific NMI. Finally,
48preemption is restored. 48preemption is restored.
49 49
50Strictly speaking, rcu_dereference() is not needed, since this code runs 50In theory, rcu_dereference_sched() is not needed, since this code runs
51only on i386, which does not need rcu_dereference() anyway. However, 51only on i386, which in theory does not need rcu_dereference_sched()
52it is a good documentation aid, particularly for anyone attempting to 52anyway. However, in practice it is a good documentation aid, particularly
53do something similar on Alpha. 53for anyone attempting to do something similar on Alpha or on systems
54with aggressive optimizing compilers.
54 55
55Quick Quiz: Why might the rcu_dereference() be necessary on Alpha, 56Quick Quiz: Why might the rcu_dereference_sched() be necessary on Alpha,
56 given that the code referenced by the pointer is read-only? 57 given that the code referenced by the pointer is read-only?
57 58
58 59
@@ -99,17 +100,21 @@ invoke irq_enter() and irq_exit() on NMI entry and exit, respectively.
99 100
100Answer to Quick Quiz 101Answer to Quick Quiz
101 102
102 Why might the rcu_dereference() be necessary on Alpha, given 103 Why might the rcu_dereference_sched() be necessary on Alpha, given
103 that the code referenced by the pointer is read-only? 104 that the code referenced by the pointer is read-only?
104 105
105 Answer: The caller to set_nmi_callback() might well have 106 Answer: The caller to set_nmi_callback() might well have
106 initialized some data that is to be used by the 107 initialized some data that is to be used by the new NMI
107 new NMI handler. In this case, the rcu_dereference() 108 handler. In this case, the rcu_dereference_sched() would
108 would be needed, because otherwise a CPU that received 109 be needed, because otherwise a CPU that received an NMI
109 an NMI just after the new handler was set might see 110 just after the new handler was set might see the pointer
110 the pointer to the new NMI handler, but the old 111 to the new NMI handler, but the old pre-initialized
111 pre-initialized version of the handler's data. 112 version of the handler's data.
112 113
113 More important, the rcu_dereference() makes it clear 114 This same sad story can happen on other CPUs when using
114 to someone reading the code that the pointer is being 115 a compiler with aggressive pointer-value speculation
115 protected by RCU. 116 optimizations.
117
118 More important, the rcu_dereference_sched() makes it
119 clear to someone reading the code that the pointer is
120 being protected by RCU-sched.
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt
index d2b85237c76e..5aea459e3dd6 100644
--- a/Documentation/RCU/RTFP.txt
+++ b/Documentation/RCU/RTFP.txt
@@ -25,10 +25,10 @@ to be referencing the data structure. However, this mechanism was not
25optimized for modern computer systems, which is not surprising given 25optimized for modern computer systems, which is not surprising given
26that these overheads were not so expensive in the mid-80s. Nonetheless, 26that these overheads were not so expensive in the mid-80s. Nonetheless,
27passive serialization appears to be the first deferred-destruction 27passive serialization appears to be the first deferred-destruction
28mechanism to be used in production. Furthermore, the relevant patent has 28mechanism to be used in production. Furthermore, the relevant patent
29lapsed, so this approach may be used in non-GPL software, if desired. 29has lapsed, so this approach may be used in non-GPL software, if desired.
30(In contrast, use of RCU is permitted only in software licensed under 30(In contrast, implementation of RCU is permitted only in software licensed
31GPL. Sorry!!!) 31under either GPL or LGPL. Sorry!!!)
32 32
33In 1990, Pugh [Pugh90] noted that explicitly tracking which threads 33In 1990, Pugh [Pugh90] noted that explicitly tracking which threads
34were reading a given data structure permitted deferred free to operate 34were reading a given data structure permitted deferred free to operate
@@ -150,6 +150,18 @@ preemptible RCU [PaulEMcKenney2007PreemptibleRCU], and the three-part
150LWN "What is RCU?" series [PaulEMcKenney2007WhatIsRCUFundamentally, 150LWN "What is RCU?" series [PaulEMcKenney2007WhatIsRCUFundamentally,
151PaulEMcKenney2008WhatIsRCUUsage, and PaulEMcKenney2008WhatIsRCUAPI]. 151PaulEMcKenney2008WhatIsRCUUsage, and PaulEMcKenney2008WhatIsRCUAPI].
152 152
1532008 saw a journal paper on real-time RCU [DinakarGuniguntala2008IBMSysJ],
154a history of how Linux changed RCU more than RCU changed Linux
155[PaulEMcKenney2008RCUOSR], and a design overview of hierarchical RCU
156[PaulEMcKenney2008HierarchicalRCU].
157
1582009 introduced user-level RCU algorithms [PaulEMcKenney2009MaliciousURCU],
159which Mathieu Desnoyers is now maintaining [MathieuDesnoyers2009URCU]
160[MathieuDesnoyersPhD]. TINY_RCU [PaulEMcKenney2009BloatWatchRCU] made
161its appearance, as did expedited RCU [PaulEMcKenney2009expeditedRCU].
162The problem of resizeable RCU-protected hash tables may now be on a path
163to a solution [JoshTriplett2009RPHash].
164
153Bibtex Entries 165Bibtex Entries
154 166
155@article{Kung80 167@article{Kung80
@@ -730,6 +742,11 @@ Revised:
730" 742"
731} 743}
732 744
745#
746# "What is RCU?" LWN series.
747#
748########################################################################
749
733@article{DinakarGuniguntala2008IBMSysJ 750@article{DinakarGuniguntala2008IBMSysJ
734,author="D. Guniguntala and P. E. McKenney and J. Triplett and J. Walpole" 751,author="D. Guniguntala and P. E. McKenney and J. Triplett and J. Walpole"
735,title="The read-copy-update mechanism for supporting real-time applications on shared-memory multiprocessor systems with {Linux}" 752,title="The read-copy-update mechanism for supporting real-time applications on shared-memory multiprocessor systems with {Linux}"
@@ -820,3 +837,39 @@ Revised:
820 Uniprocessor assumptions allow simplified RCU implementation. 837 Uniprocessor assumptions allow simplified RCU implementation.
821" 838"
822} 839}
840
841@unpublished{PaulEMcKenney2009expeditedRCU
842,Author="Paul E. McKenney"
843,Title="[{PATCH} -tip 0/3] expedited 'big hammer' {RCU} grace periods"
844,month="June"
845,day="25"
846,year="2009"
847,note="Available:
848\url{http://lkml.org/lkml/2009/6/25/306}
849[Viewed August 16, 2009]"
850,annotation="
851 First posting of expedited RCU to be accepted into -tip.
852"
853}
854
855@unpublished{JoshTriplett2009RPHash
856,Author="Josh Triplett"
857,Title="Scalable concurrent hash tables via relativistic programming"
858,month="September"
859,year="2009"
860,note="Linux Plumbers Conference presentation"
861,annotation="
862 RP fun with hash tables.
863"
864}
865
866@phdthesis{MathieuDesnoyersPhD
867, title = "Low-Impact Operating System Tracing"
868, author = "Mathieu Desnoyers"
869, school = "Ecole Polytechnique de Montr\'{e}al"
870, month = "December"
871, year = 2009
872,note="Available:
873\url{http://www.lttng.org/pub/thesis/desnoyers-dissertation-2009-12.pdf}
874[Viewed December 9, 2009]"
875}
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 51525a30e8b4..790d1a812376 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -8,13 +8,12 @@ would cause. This list is based on experiences reviewing such patches
8over a rather long period of time, but improvements are always welcome! 8over a rather long period of time, but improvements are always welcome!
9 9
100. Is RCU being applied to a read-mostly situation? If the data 100. Is RCU being applied to a read-mostly situation? If the data
11 structure is updated more than about 10% of the time, then 11 structure is updated more than about 10% of the time, then you
12 you should strongly consider some other approach, unless 12 should strongly consider some other approach, unless detailed
13 detailed performance measurements show that RCU is nonetheless 13 performance measurements show that RCU is nonetheless the right
14 the right tool for the job. Yes, you might think of RCU 14 tool for the job. Yes, RCU does reduce read-side overhead by
15 as simply cutting overhead off of the readers and imposing it 15 increasing write-side overhead, which is exactly why normal uses
16 on the writers. That is exactly why normal uses of RCU will 16 of RCU will do much more reading than updating.
17 do much more reading than updating.
18 17
19 Another exception is where performance is not an issue, and RCU 18 Another exception is where performance is not an issue, and RCU
20 provides a simpler implementation. An example of this situation 19 provides a simpler implementation. An example of this situation
@@ -35,13 +34,13 @@ over a rather long period of time, but improvements are always welcome!
35 34
36 If you choose #b, be prepared to describe how you have handled 35 If you choose #b, be prepared to describe how you have handled
37 memory barriers on weakly ordered machines (pretty much all of 36 memory barriers on weakly ordered machines (pretty much all of
38 them -- even x86 allows reads to be reordered), and be prepared 37 them -- even x86 allows later loads to be reordered to precede
39 to explain why this added complexity is worthwhile. If you 38 earlier stores), and be prepared to explain why this added
40 choose #c, be prepared to explain how this single task does not 39 complexity is worthwhile. If you choose #c, be prepared to
41 become a major bottleneck on big multiprocessor machines (for 40 explain how this single task does not become a major bottleneck on
42 example, if the task is updating information relating to itself 41 big multiprocessor machines (for example, if the task is updating
43 that other tasks can read, there by definition can be no 42 information relating to itself that other tasks can read, there
44 bottleneck). 43 by definition can be no bottleneck).
45 44
462. Do the RCU read-side critical sections make proper use of 452. Do the RCU read-side critical sections make proper use of
47 rcu_read_lock() and friends? These primitives are needed 46 rcu_read_lock() and friends? These primitives are needed
@@ -51,8 +50,10 @@ over a rather long period of time, but improvements are always welcome!
51 actuarial risk of your kernel. 50 actuarial risk of your kernel.
52 51
53 As a rough rule of thumb, any dereference of an RCU-protected 52 As a rough rule of thumb, any dereference of an RCU-protected
54 pointer must be covered by rcu_read_lock() or rcu_read_lock_bh() 53 pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(),
55 or by the appropriate update-side lock. 54 rcu_read_lock_sched(), or by the appropriate update-side lock.
55 Disabling of preemption can serve as rcu_read_lock_sched(), but
56 is less readable.
56 57
573. Does the update code tolerate concurrent accesses? 583. Does the update code tolerate concurrent accesses?
58 59
@@ -62,25 +63,27 @@ over a rather long period of time, but improvements are always welcome!
62 of ways to handle this concurrency, depending on the situation: 63 of ways to handle this concurrency, depending on the situation:
63 64
64 a. Use the RCU variants of the list and hlist update 65 a. Use the RCU variants of the list and hlist update
65 primitives to add, remove, and replace elements on an 66 primitives to add, remove, and replace elements on
66 RCU-protected list. Alternatively, use the RCU-protected 67 an RCU-protected list. Alternatively, use the other
67 trees that have been added to the Linux kernel. 68 RCU-protected data structures that have been added to
69 the Linux kernel.
68 70
69 This is almost always the best approach. 71 This is almost always the best approach.
70 72
71 b. Proceed as in (a) above, but also maintain per-element 73 b. Proceed as in (a) above, but also maintain per-element
72 locks (that are acquired by both readers and writers) 74 locks (that are acquired by both readers and writers)
73 that guard per-element state. Of course, fields that 75 that guard per-element state. Of course, fields that
74 the readers refrain from accessing can be guarded by the 76 the readers refrain from accessing can be guarded by
75 update-side lock. 77 some other lock acquired only by updaters, if desired.
76 78
77 This works quite well, also. 79 This works quite well, also.
78 80
79 c. Make updates appear atomic to readers. For example, 81 c. Make updates appear atomic to readers. For example,
80 pointer updates to properly aligned fields will appear 82 pointer updates to properly aligned fields will
81 atomic, as will individual atomic primitives. Operations 83 appear atomic, as will individual atomic primitives.
82 performed under a lock and sequences of multiple atomic 84 Sequences of perations performed under a lock will -not-
83 primitives will -not- appear to be atomic. 85 appear to be atomic to RCU readers, nor will sequences
86 of multiple atomic primitives.
84 87
85 This can work, but is starting to get a bit tricky. 88 This can work, but is starting to get a bit tricky.
86 89
@@ -98,9 +101,9 @@ over a rather long period of time, but improvements are always welcome!
98 a new structure containing updated values. 101 a new structure containing updated values.
99 102
1004. Weakly ordered CPUs pose special challenges. Almost all CPUs 1034. Weakly ordered CPUs pose special challenges. Almost all CPUs
101 are weakly ordered -- even i386 CPUs allow reads to be reordered. 104 are weakly ordered -- even x86 CPUs allow later loads to be
102 RCU code must take all of the following measures to prevent 105 reordered to precede earlier stores. RCU code must take all of
103 memory-corruption problems: 106 the following measures to prevent memory-corruption problems:
104 107
105 a. Readers must maintain proper ordering of their memory 108 a. Readers must maintain proper ordering of their memory
106 accesses. The rcu_dereference() primitive ensures that 109 accesses. The rcu_dereference() primitive ensures that
@@ -113,14 +116,25 @@ over a rather long period of time, but improvements are always welcome!
113 The rcu_dereference() primitive is also an excellent 116 The rcu_dereference() primitive is also an excellent
114 documentation aid, letting the person reading the code 117 documentation aid, letting the person reading the code
115 know exactly which pointers are protected by RCU. 118 know exactly which pointers are protected by RCU.
116 119 Please note that compilers can also reorder code, and
117 The rcu_dereference() primitive is used by the various 120 they are becoming increasingly aggressive about doing
118 "_rcu()" list-traversal primitives, such as the 121 just that. The rcu_dereference() primitive therefore
119 list_for_each_entry_rcu(). Note that it is perfectly 122 also prevents destructive compiler optimizations.
120 legal (if redundant) for update-side code to use 123
121 rcu_dereference() and the "_rcu()" list-traversal 124 The rcu_dereference() primitive is used by the
122 primitives. This is particularly useful in code 125 various "_rcu()" list-traversal primitives, such
123 that is common to readers and updaters. 126 as the list_for_each_entry_rcu(). Note that it is
127 perfectly legal (if redundant) for update-side code to
128 use rcu_dereference() and the "_rcu()" list-traversal
129 primitives. This is particularly useful in code that
130 is common to readers and updaters. However, lockdep
131 will complain if you access rcu_dereference() outside
132 of an RCU read-side critical section. See lockdep.txt
133 to learn what to do about this.
134
135 Of course, neither rcu_dereference() nor the "_rcu()"
136 list-traversal primitives can substitute for a good
137 concurrency design coordinating among multiple updaters.
124 138
125 b. If the list macros are being used, the list_add_tail_rcu() 139 b. If the list macros are being used, the list_add_tail_rcu()
126 and list_add_rcu() primitives must be used in order 140 and list_add_rcu() primitives must be used in order
@@ -135,11 +149,14 @@ over a rather long period of time, but improvements are always welcome!
135 readers. Similarly, if the hlist macros are being used, 149 readers. Similarly, if the hlist macros are being used,
136 the hlist_del_rcu() primitive is required. 150 the hlist_del_rcu() primitive is required.
137 151
138 The list_replace_rcu() primitive may be used to 152 The list_replace_rcu() and hlist_replace_rcu() primitives
139 replace an old structure with a new one in an 153 may be used to replace an old structure with a new one
140 RCU-protected list. 154 in their respective types of RCU-protected lists.
155
156 d. Rules similar to (4b) and (4c) apply to the "hlist_nulls"
157 type of RCU-protected linked lists.
141 158
142 d. Updates must ensure that initialization of a given 159 e. Updates must ensure that initialization of a given
143 structure happens before pointers to that structure are 160 structure happens before pointers to that structure are
144 publicized. Use the rcu_assign_pointer() primitive 161 publicized. Use the rcu_assign_pointer() primitive
145 when publicizing a pointer to a structure that can 162 when publicizing a pointer to a structure that can
@@ -151,16 +168,31 @@ over a rather long period of time, but improvements are always welcome!
151 it cannot block. 168 it cannot block.
152 169
1536. Since synchronize_rcu() can block, it cannot be called from 1706. Since synchronize_rcu() can block, it cannot be called from
154 any sort of irq context. Ditto for synchronize_sched() and 171 any sort of irq context. The same rule applies for
155 synchronize_srcu(). 172 synchronize_rcu_bh(), synchronize_sched(), synchronize_srcu(),
156 173 synchronize_rcu_expedited(), synchronize_rcu_bh_expedited(),
1577. If the updater uses call_rcu(), then the corresponding readers 174 synchronize_sched_expedite(), and synchronize_srcu_expedited().
158 must use rcu_read_lock() and rcu_read_unlock(). If the updater 175
159 uses call_rcu_bh(), then the corresponding readers must use 176 The expedited forms of these primitives have the same semantics
160 rcu_read_lock_bh() and rcu_read_unlock_bh(). If the updater 177 as the non-expedited forms, but expediting is both expensive
161 uses call_rcu_sched(), then the corresponding readers must 178 and unfriendly to real-time workloads. Use of the expedited
162 disable preemption. Mixing things up will result in confusion 179 primitives should be restricted to rare configuration-change
163 and broken kernels. 180 operations that would not normally be undertaken while a real-time
181 workload is running.
182
1837. If the updater uses call_rcu() or synchronize_rcu(), then the
184 corresponding readers must use rcu_read_lock() and
185 rcu_read_unlock(). If the updater uses call_rcu_bh() or
186 synchronize_rcu_bh(), then the corresponding readers must
187 use rcu_read_lock_bh() and rcu_read_unlock_bh(). If the
188 updater uses call_rcu_sched() or synchronize_sched(), then
189 the corresponding readers must disable preemption, possibly
190 by calling rcu_read_lock_sched() and rcu_read_unlock_sched().
191 If the updater uses synchronize_srcu(), the the corresponding
192 readers must use srcu_read_lock() and srcu_read_unlock(),
193 and with the same srcu_struct. The rules for the expedited
194 primitives are the same as for their non-expedited counterparts.
195 Mixing things up will result in confusion and broken kernels.
164 196
165 One exception to this rule: rcu_read_lock() and rcu_read_unlock() 197 One exception to this rule: rcu_read_lock() and rcu_read_unlock()
166 may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh() 198 may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh()
@@ -212,6 +244,8 @@ over a rather long period of time, but improvements are always welcome!
212 e. Periodically invoke synchronize_rcu(), permitting a limited 244 e. Periodically invoke synchronize_rcu(), permitting a limited
213 number of updates per grace period. 245 number of updates per grace period.
214 246
247 The same cautions apply to call_rcu_bh() and call_rcu_sched().
248
2159. All RCU list-traversal primitives, which include 2499. All RCU list-traversal primitives, which include
216 rcu_dereference(), list_for_each_entry_rcu(), 250 rcu_dereference(), list_for_each_entry_rcu(),
217 list_for_each_continue_rcu(), and list_for_each_safe_rcu(), 251 list_for_each_continue_rcu(), and list_for_each_safe_rcu(),
@@ -219,17 +253,21 @@ over a rather long period of time, but improvements are always welcome!
219 must be protected by appropriate update-side locks. RCU 253 must be protected by appropriate update-side locks. RCU
220 read-side critical sections are delimited by rcu_read_lock() 254 read-side critical sections are delimited by rcu_read_lock()
221 and rcu_read_unlock(), or by similar primitives such as 255 and rcu_read_unlock(), or by similar primitives such as
222 rcu_read_lock_bh() and rcu_read_unlock_bh(). 256 rcu_read_lock_bh() and rcu_read_unlock_bh(), in which case
257 the matching rcu_dereference() primitive must be used in order
258 to keep lockdep happy, in this case, rcu_dereference_bh().
223 259
224 The reason that it is permissible to use RCU list-traversal 260 The reason that it is permissible to use RCU list-traversal
225 primitives when the update-side lock is held is that doing so 261 primitives when the update-side lock is held is that doing so
226 can be quite helpful in reducing code bloat when common code is 262 can be quite helpful in reducing code bloat when common code is
227 shared between readers and updaters. 263 shared between readers and updaters. Additional primitives
264 are provided for this case, as discussed in lockdep.txt.
228 265
22910. Conversely, if you are in an RCU read-side critical section, 26610. Conversely, if you are in an RCU read-side critical section,
230 and you don't hold the appropriate update-side lock, you -must- 267 and you don't hold the appropriate update-side lock, you -must-
231 use the "_rcu()" variants of the list macros. Failing to do so 268 use the "_rcu()" variants of the list macros. Failing to do so
232 will break Alpha and confuse people reading your code. 269 will break Alpha, cause aggressive compilers to generate bad code,
270 and confuse people trying to read your code.
233 271
23411. Note that synchronize_rcu() -only- guarantees to wait until 27211. Note that synchronize_rcu() -only- guarantees to wait until
235 all currently executing rcu_read_lock()-protected RCU read-side 273 all currently executing rcu_read_lock()-protected RCU read-side
@@ -239,15 +277,21 @@ over a rather long period of time, but improvements are always welcome!
239 rcu_read_lock()-protected read-side critical sections, do -not- 277 rcu_read_lock()-protected read-side critical sections, do -not-
240 use synchronize_rcu(). 278 use synchronize_rcu().
241 279
242 If you want to wait for some of these other things, you might 280 Similarly, disabling preemption is not an acceptable substitute
243 instead need to use synchronize_irq() or synchronize_sched(). 281 for rcu_read_lock(). Code that attempts to use preemption
282 disabling where it should be using rcu_read_lock() will break
283 in real-time kernel builds.
284
285 If you want to wait for interrupt handlers, NMI handlers, and
286 code under the influence of preempt_disable(), you instead
287 need to use synchronize_irq() or synchronize_sched().
244 288
24512. Any lock acquired by an RCU callback must be acquired elsewhere 28912. Any lock acquired by an RCU callback must be acquired elsewhere
246 with softirq disabled, e.g., via spin_lock_irqsave(), 290 with softirq disabled, e.g., via spin_lock_irqsave(),
247 spin_lock_bh(), etc. Failing to disable irq on a given 291 spin_lock_bh(), etc. Failing to disable irq on a given
248 acquisition of that lock will result in deadlock as soon as the 292 acquisition of that lock will result in deadlock as soon as
249 RCU callback happens to interrupt that acquisition's critical 293 the RCU softirq handler happens to run your RCU callback while
250 section. 294 interrupting that acquisition's critical section.
251 295
25213. RCU callbacks can be and are executed in parallel. In many cases, 29613. RCU callbacks can be and are executed in parallel. In many cases,
253 the callback code simply wrappers around kfree(), so that this 297 the callback code simply wrappers around kfree(), so that this
@@ -265,29 +309,30 @@ over a rather long period of time, but improvements are always welcome!
265 not the case, a self-spawning RCU callback would prevent the 309 not the case, a self-spawning RCU callback would prevent the
266 victim CPU from ever going offline.) 310 victim CPU from ever going offline.)
267 311
26814. SRCU (srcu_read_lock(), srcu_read_unlock(), and synchronize_srcu()) 31214. SRCU (srcu_read_lock(), srcu_read_unlock(), srcu_dereference(),
269 may only be invoked from process context. Unlike other forms of 313 synchronize_srcu(), and synchronize_srcu_expedited()) may only
270 RCU, it -is- permissible to block in an SRCU read-side critical 314 be invoked from process context. Unlike other forms of RCU, it
271 section (demarked by srcu_read_lock() and srcu_read_unlock()), 315 -is- permissible to block in an SRCU read-side critical section
272 hence the "SRCU": "sleepable RCU". Please note that if you 316 (demarked by srcu_read_lock() and srcu_read_unlock()), hence the
273 don't need to sleep in read-side critical sections, you should 317 "SRCU": "sleepable RCU". Please note that if you don't need
274 be using RCU rather than SRCU, because RCU is almost always 318 to sleep in read-side critical sections, you should be using
275 faster and easier to use than is SRCU. 319 RCU rather than SRCU, because RCU is almost always faster and
320 easier to use than is SRCU.
276 321
277 Also unlike other forms of RCU, explicit initialization 322 Also unlike other forms of RCU, explicit initialization
278 and cleanup is required via init_srcu_struct() and 323 and cleanup is required via init_srcu_struct() and
279 cleanup_srcu_struct(). These are passed a "struct srcu_struct" 324 cleanup_srcu_struct(). These are passed a "struct srcu_struct"
280 that defines the scope of a given SRCU domain. Once initialized, 325 that defines the scope of a given SRCU domain. Once initialized,
281 the srcu_struct is passed to srcu_read_lock(), srcu_read_unlock() 326 the srcu_struct is passed to srcu_read_lock(), srcu_read_unlock()
282 and synchronize_srcu(). A given synchronize_srcu() waits only 327 synchronize_srcu(), and synchronize_srcu_expedited(). A given
283 for SRCU read-side critical sections governed by srcu_read_lock() 328 synchronize_srcu() waits only for SRCU read-side critical
284 and srcu_read_unlock() calls that have been passd the same 329 sections governed by srcu_read_lock() and srcu_read_unlock()
285 srcu_struct. This property is what makes sleeping read-side 330 calls that have been passed the same srcu_struct. This property
286 critical sections tolerable -- a given subsystem delays only 331 is what makes sleeping read-side critical sections tolerable --
287 its own updates, not those of other subsystems using SRCU. 332 a given subsystem delays only its own updates, not those of other
288 Therefore, SRCU is less prone to OOM the system than RCU would 333 subsystems using SRCU. Therefore, SRCU is less prone to OOM the
289 be if RCU's read-side critical sections were permitted to 334 system than RCU would be if RCU's read-side critical sections
290 sleep. 335 were permitted to sleep.
291 336
292 The ability to sleep in read-side critical sections does not 337 The ability to sleep in read-side critical sections does not
293 come for free. First, corresponding srcu_read_lock() and 338 come for free. First, corresponding srcu_read_lock() and
@@ -300,8 +345,8 @@ over a rather long period of time, but improvements are always welcome!
300 requiring SRCU's read-side deadlock immunity or low read-side 345 requiring SRCU's read-side deadlock immunity or low read-side
301 realtime latency. 346 realtime latency.
302 347
303 Note that, rcu_assign_pointer() and rcu_dereference() relate to 348 Note that, rcu_assign_pointer() relates to SRCU just as they do
304 SRCU just as they do to other forms of RCU. 349 to other forms of RCU.
305 350
30615. The whole point of call_rcu(), synchronize_rcu(), and friends 35115. The whole point of call_rcu(), synchronize_rcu(), and friends
307 is to wait until all pre-existing readers have finished before 352 is to wait until all pre-existing readers have finished before
@@ -311,12 +356,12 @@ over a rather long period of time, but improvements are always welcome!
311 destructive operation, and -only- -then- invoke call_rcu(), 356 destructive operation, and -only- -then- invoke call_rcu(),
312 synchronize_rcu(), or friends. 357 synchronize_rcu(), or friends.
313 358
314 Because these primitives only wait for pre-existing readers, 359 Because these primitives only wait for pre-existing readers, it
315 it is the caller's responsibility to guarantee safety to 360 is the caller's responsibility to guarantee that any subsequent
316 any subsequent readers. 361 readers will execute safely.
317 362
31816. The various RCU read-side primitives do -not- contain memory 36316. The various RCU read-side primitives do -not- necessarily contain
319 barriers. The CPU (and in some cases, the compiler) is free 364 memory barriers. You should therefore plan for the CPU
320 to reorder code into and out of RCU read-side critical sections. 365 and the compiler to freely reorder code into and out of RCU
321 It is the responsibility of the RCU update-side primitives to 366 read-side critical sections. It is the responsibility of the
322 deal with this. 367 RCU update-side primitives to deal with this.
diff --git a/Documentation/RCU/lockdep.txt b/Documentation/RCU/lockdep.txt
new file mode 100644
index 000000000000..d7a49b2f6994
--- /dev/null
+++ b/Documentation/RCU/lockdep.txt
@@ -0,0 +1,91 @@
1RCU and lockdep checking
2
3All flavors of RCU have lockdep checking available, so that lockdep is
4aware of when each task enters and leaves any flavor of RCU read-side
5critical section. Each flavor of RCU is tracked separately (but note
6that this is not the case in 2.6.32 and earlier). This allows lockdep's
7tracking to include RCU state, which can sometimes help when debugging
8deadlocks and the like.
9
10In addition, RCU provides the following primitives that check lockdep's
11state:
12
13 rcu_read_lock_held() for normal RCU.
14 rcu_read_lock_bh_held() for RCU-bh.
15 rcu_read_lock_sched_held() for RCU-sched.
16 srcu_read_lock_held() for SRCU.
17
18These functions are conservative, and will therefore return 1 if they
19aren't certain (for example, if CONFIG_DEBUG_LOCK_ALLOC is not set).
20This prevents things like WARN_ON(!rcu_read_lock_held()) from giving false
21positives when lockdep is disabled.
22
23In addition, a separate kernel config parameter CONFIG_PROVE_RCU enables
24checking of rcu_dereference() primitives:
25
26 rcu_dereference(p):
27 Check for RCU read-side critical section.
28 rcu_dereference_bh(p):
29 Check for RCU-bh read-side critical section.
30 rcu_dereference_sched(p):
31 Check for RCU-sched read-side critical section.
32 srcu_dereference(p, sp):
33 Check for SRCU read-side critical section.
34 rcu_dereference_check(p, c):
35 Use explicit check expression "c". This is useful in
36 code that is invoked by both readers and updaters.
37 rcu_dereference_raw(p)
38 Don't check. (Use sparingly, if at all.)
39 rcu_dereference_protected(p, c):
40 Use explicit check expression "c", and omit all barriers
41 and compiler constraints. This is useful when the data
42 structure cannot change, for example, in code that is
43 invoked only by updaters.
44 rcu_access_pointer(p):
45 Return the value of the pointer and omit all barriers,
46 but retain the compiler constraints that prevent duplicating
47 or coalescsing. This is useful when when testing the
48 value of the pointer itself, for example, against NULL.
49
50The rcu_dereference_check() check expression can be any boolean
51expression, but would normally include one of the rcu_read_lock_held()
52family of functions and a lockdep expression. However, any boolean
53expression can be used. For a moderately ornate example, consider
54the following:
55
56 file = rcu_dereference_check(fdt->fd[fd],
57 rcu_read_lock_held() ||
58 lockdep_is_held(&files->file_lock) ||
59 atomic_read(&files->count) == 1);
60
61This expression picks up the pointer "fdt->fd[fd]" in an RCU-safe manner,
62and, if CONFIG_PROVE_RCU is configured, verifies that this expression
63is used in:
64
651. An RCU read-side critical section, or
662. with files->file_lock held, or
673. on an unshared files_struct.
68
69In case (1), the pointer is picked up in an RCU-safe manner for vanilla
70RCU read-side critical sections, in case (2) the ->file_lock prevents
71any change from taking place, and finally, in case (3) the current task
72is the only task accessing the file_struct, again preventing any change
73from taking place. If the above statement was invoked only from updater
74code, it could instead be written as follows:
75
76 file = rcu_dereference_protected(fdt->fd[fd],
77 lockdep_is_held(&files->file_lock) ||
78 atomic_read(&files->count) == 1);
79
80This would verify cases #2 and #3 above, and furthermore lockdep would
81complain if this was used in an RCU read-side critical section unless one
82of these two cases held. Because rcu_dereference_protected() omits all
83barriers and compiler constraints, it generates better code than do the
84other flavors of rcu_dereference(). On the other hand, it is illegal
85to use rcu_dereference_protected() if either the RCU-protected pointer
86or the RCU-protected data that it points to can change concurrently.
87
88There are currently only "universal" versions of the rcu_assign_pointer()
89and RCU list-/tree-traversal primitives, which do not (yet) check for
90being in an RCU read-side critical section. In the future, separate
91versions of these primitives might be created.
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt
index 2a23523ce471..31852705b586 100644
--- a/Documentation/RCU/rcu.txt
+++ b/Documentation/RCU/rcu.txt
@@ -75,6 +75,8 @@ o I hear that RCU is patented? What is with that?
75 search for the string "Patent" in RTFP.txt to find them. 75 search for the string "Patent" in RTFP.txt to find them.
76 Of these, one was allowed to lapse by the assignee, and the 76 Of these, one was allowed to lapse by the assignee, and the
77 others have been contributed to the Linux kernel under GPL. 77 others have been contributed to the Linux kernel under GPL.
78 There are now also LGPL implementations of user-level RCU
79 available (http://lttng.org/?q=node/18).
78 80
79o I hear that RCU needs work in order to support realtime kernels? 81o I hear that RCU needs work in order to support realtime kernels?
80 82
@@ -91,48 +93,4 @@ o Where can I find more information on RCU?
91 93
92o What are all these files in this directory? 94o What are all these files in this directory?
93 95
94 96 See 00-INDEX for the list.
95 NMI-RCU.txt
96
97 Describes how to use RCU to implement dynamic
98 NMI handlers, which can be revectored on the fly,
99 without rebooting.
100
101 RTFP.txt
102
103 List of RCU-related publications and web sites.
104
105 UP.txt
106
107 Discussion of RCU usage in UP kernels.
108
109 arrayRCU.txt
110
111 Describes how to use RCU to protect arrays, with
112 resizeable arrays whose elements reference other
113 data structures being of the most interest.
114
115 checklist.txt
116
117 Lists things to check for when inspecting code that
118 uses RCU.
119
120 listRCU.txt
121
122 Describes how to use RCU to protect linked lists.
123 This is the simplest and most common use of RCU
124 in the Linux kernel.
125
126 rcu.txt
127
128 You are reading it!
129
130 rcuref.txt
131
132 Describes how to combine use of reference counts
133 with RCU.
134
135 whatisRCU.txt
136
137 Overview of how the RCU implementation works. Along
138 the way, presents a conceptual view of RCU.
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
new file mode 100644
index 000000000000..44c6dcc93d6d
--- /dev/null
+++ b/Documentation/RCU/stallwarn.txt
@@ -0,0 +1,106 @@
1Using RCU's CPU Stall Detector
2
3The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables
4RCU's CPU stall detector, which detects conditions that unduly delay
5RCU grace periods. The stall detector's idea of what constitutes
6"unduly delayed" is controlled by a set of C preprocessor macros:
7
8RCU_SECONDS_TILL_STALL_CHECK
9
10 This macro defines the period of time that RCU will wait from
11 the beginning of a grace period until it issues an RCU CPU
12 stall warning. This time period is normally ten seconds.
13
14RCU_SECONDS_TILL_STALL_RECHECK
15
16 This macro defines the period of time that RCU will wait after
17 issuing a stall warning until it issues another stall warning
18 for the same stall. This time period is normally set to thirty
19 seconds.
20
21RCU_STALL_RAT_DELAY
22
23 The CPU stall detector tries to make the offending CPU print its
24 own warnings, as this often gives better-quality stack traces.
25 However, if the offending CPU does not detect its own stall in
26 the number of jiffies specified by RCU_STALL_RAT_DELAY, then
27 some other CPU will complain. This delay is normally set to
28 two jiffies.
29
30When a CPU detects that it is stalling, it will print a message similar
31to the following:
32
33INFO: rcu_sched_state detected stall on CPU 5 (t=2500 jiffies)
34
35This message indicates that CPU 5 detected that it was causing a stall,
36and that the stall was affecting RCU-sched. This message will normally be
37followed by a stack dump of the offending CPU. On TREE_RCU kernel builds,
38RCU and RCU-sched are implemented by the same underlying mechanism,
39while on TREE_PREEMPT_RCU kernel builds, RCU is instead implemented
40by rcu_preempt_state.
41
42On the other hand, if the offending CPU fails to print out a stall-warning
43message quickly enough, some other CPU will print a message similar to
44the following:
45
46INFO: rcu_bh_state detected stalls on CPUs/tasks: { 3 5 } (detected by 2, 2502 jiffies)
47
48This message indicates that CPU 2 detected that CPUs 3 and 5 were both
49causing stalls, and that the stall was affecting RCU-bh. This message
50will normally be followed by stack dumps for each CPU. Please note that
51TREE_PREEMPT_RCU builds can be stalled by tasks as well as by CPUs,
52and that the tasks will be indicated by PID, for example, "P3421".
53It is even possible for a rcu_preempt_state stall to be caused by both
54CPUs -and- tasks, in which case the offending CPUs and tasks will all
55be called out in the list.
56
57Finally, if the grace period ends just as the stall warning starts
58printing, there will be a spurious stall-warning message:
59
60INFO: rcu_bh_state detected stalls on CPUs/tasks: { } (detected by 4, 2502 jiffies)
61
62This is rare, but does happen from time to time in real life.
63
64So your kernel printed an RCU CPU stall warning. The next question is
65"What caused it?" The following problems can result in RCU CPU stall
66warnings:
67
68o A CPU looping in an RCU read-side critical section.
69
70o A CPU looping with interrupts disabled. This condition can
71 result in RCU-sched and RCU-bh stalls.
72
73o A CPU looping with preemption disabled. This condition can
74 result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
75 stalls.
76
77o A CPU looping with bottom halves disabled. This condition can
78 result in RCU-sched and RCU-bh stalls.
79
80o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
81 without invoking schedule().
82
83o A bug in the RCU implementation.
84
85o A hardware failure. This is quite unlikely, but has occurred
86 at least once in real life. A CPU failed in a running system,
87 becoming unresponsive, but not causing an immediate crash.
88 This resulted in a series of RCU CPU stall warnings, eventually
89 leading the realization that the CPU had failed.
90
91The RCU, RCU-sched, and RCU-bh implementations have CPU stall
92warning. SRCU does not have its own CPU stall warnings, but its
93calls to synchronize_sched() will result in RCU-sched detecting
94RCU-sched-related CPU stalls. Please note that RCU only detects
95CPU stalls when there is a grace period in progress. No grace period,
96no CPU stall warnings.
97
98To diagnose the cause of the stall, inspect the stack traces.
99The offending function will usually be near the top of the stack.
100If you have a series of stall warnings from a single extended stall,
101comparing the stack traces can often help determine where the stall
102is occurring, which will usually be in the function nearest the top of
103that portion of the stack which remains the same from trace to trace.
104If you can reliably trigger the stall, ftrace can be quite helpful.
105
106RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE.
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index 9dba3bb90e60..5d9016795fd8 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -30,6 +30,18 @@ MODULE PARAMETERS
30 30
31This module has the following parameters: 31This module has the following parameters:
32 32
33fqs_duration Duration (in microseconds) of artificially induced bursts
34 of force_quiescent_state() invocations. In RCU
35 implementations having force_quiescent_state(), these
36 bursts help force races between forcing a given grace
37 period and that grace period ending on its own.
38
39fqs_holdoff Holdoff time (in microseconds) between consecutive calls
40 to force_quiescent_state() within a burst.
41
42fqs_stutter Wait time (in seconds) between consecutive bursts
43 of calls to force_quiescent_state().
44
33irqreaders Says to invoke RCU readers from irq level. This is currently 45irqreaders Says to invoke RCU readers from irq level. This is currently
34 done via timers. Defaults to "1" for variants of RCU that 46 done via timers. Defaults to "1" for variants of RCU that
35 permit this. (Or, more accurately, variants of RCU that do 47 permit this. (Or, more accurately, variants of RCU that do
@@ -170,16 +182,6 @@ Similarly, sched_expedited RCU provides the following:
170 sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0 182 sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0
171 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0 183 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0
172 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0 184 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
173 state: -1 / 0:0 3:0 4:0
174
175As before, the first four lines are similar to those for RCU.
176The last line shows the task-migration state. The first number is
177-1 if synchronize_sched_expedited() is idle, -2 if in the process of
178posting wakeups to the migration kthreads, and N when waiting on CPU N.
179Each of the colon-separated fields following the "/" is a CPU:state pair.
180Valid states are "0" for idle, "1" for waiting for quiescent state,
181"2" for passed through quiescent state, and "3" when a race with a
182CPU-hotplug event forces use of the synchronize_sched() primitive.
183 185
184 186
185USAGE 187USAGE
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 8608fd85e921..efd8cc95c06b 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -256,23 +256,23 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
256The output of "cat rcu/rcu_pending" looks as follows: 256The output of "cat rcu/rcu_pending" looks as follows:
257 257
258rcu_sched: 258rcu_sched:
259 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 259 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
260 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 260 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
261 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 261 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
262 3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 262 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723
263 4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 263 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110
264 5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 264 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456
265 6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 265 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834
266 7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 266 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888
267rcu_bh: 267rcu_bh:
268 0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 268 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314
269 1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 269 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180
270 2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 270 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936
271 3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 271 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863
272 4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 272 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671
273 5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 273 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235
274 6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 274 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921
275 7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 275 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542
276 276
277As always, this is once again split into "rcu_sched" and "rcu_bh" 277As always, this is once again split into "rcu_sched" and "rcu_bh"
278portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional 278portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional
@@ -284,6 +284,9 @@ o "np" is the number of times that __rcu_pending() has been invoked
284o "qsp" is the number of times that the RCU was waiting for a 284o "qsp" is the number of times that the RCU was waiting for a
285 quiescent state from this CPU. 285 quiescent state from this CPU.
286 286
287o "rpq" is the number of times that the CPU had passed through
288 a quiescent state, but not yet reported it to RCU.
289
287o "cbr" is the number of times that this CPU had RCU callbacks 290o "cbr" is the number of times that this CPU had RCU callbacks
288 that had passed through a grace period, and were thus ready 291 that had passed through a grace period, and were thus ready
289 to be invoked. 292 to be invoked.
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index d542ca243b80..cfaac34c4557 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -323,14 +323,17 @@ used as follows:
323 Defer Protect 323 Defer Protect
324 324
325a. synchronize_rcu() rcu_read_lock() / rcu_read_unlock() 325a. synchronize_rcu() rcu_read_lock() / rcu_read_unlock()
326 call_rcu() 326 call_rcu() rcu_dereference()
327 327
328b. call_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh() 328b. call_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh()
329 rcu_dereference_bh()
329 330
330c. synchronize_sched() preempt_disable() / preempt_enable() 331c. synchronize_sched() rcu_read_lock_sched() / rcu_read_unlock_sched()
332 preempt_disable() / preempt_enable()
331 local_irq_save() / local_irq_restore() 333 local_irq_save() / local_irq_restore()
332 hardirq enter / hardirq exit 334 hardirq enter / hardirq exit
333 NMI enter / NMI exit 335 NMI enter / NMI exit
336 rcu_dereference_sched()
334 337
335These three mechanisms are used as follows: 338These three mechanisms are used as follows:
336 339
@@ -780,9 +783,8 @@ Linux-kernel source code, but it helps to have a full list of the
780APIs, since there does not appear to be a way to categorize them 783APIs, since there does not appear to be a way to categorize them
781in docbook. Here is the list, by category. 784in docbook. Here is the list, by category.
782 785
783RCU pointer/list traversal: 786RCU list traversal:
784 787
785 rcu_dereference
786 list_for_each_entry_rcu 788 list_for_each_entry_rcu
787 hlist_for_each_entry_rcu 789 hlist_for_each_entry_rcu
788 hlist_nulls_for_each_entry_rcu 790 hlist_nulls_for_each_entry_rcu
@@ -808,7 +810,7 @@ RCU: Critical sections Grace period Barrier
808 810
809 rcu_read_lock synchronize_net rcu_barrier 811 rcu_read_lock synchronize_net rcu_barrier
810 rcu_read_unlock synchronize_rcu 812 rcu_read_unlock synchronize_rcu
811 synchronize_rcu_expedited 813 rcu_dereference synchronize_rcu_expedited
812 call_rcu 814 call_rcu
813 815
814 816
@@ -816,7 +818,7 @@ bh: Critical sections Grace period Barrier
816 818
817 rcu_read_lock_bh call_rcu_bh rcu_barrier_bh 819 rcu_read_lock_bh call_rcu_bh rcu_barrier_bh
818 rcu_read_unlock_bh synchronize_rcu_bh 820 rcu_read_unlock_bh synchronize_rcu_bh
819 synchronize_rcu_bh_expedited 821 rcu_dereference_bh synchronize_rcu_bh_expedited
820 822
821 823
822sched: Critical sections Grace period Barrier 824sched: Critical sections Grace period Barrier
@@ -825,17 +827,25 @@ sched: Critical sections Grace period Barrier
825 rcu_read_unlock_sched call_rcu_sched 827 rcu_read_unlock_sched call_rcu_sched
826 [preempt_disable] synchronize_sched_expedited 828 [preempt_disable] synchronize_sched_expedited
827 [and friends] 829 [and friends]
830 rcu_dereference_sched
828 831
829 832
830SRCU: Critical sections Grace period Barrier 833SRCU: Critical sections Grace period Barrier
831 834
832 srcu_read_lock synchronize_srcu N/A 835 srcu_read_lock synchronize_srcu N/A
833 srcu_read_unlock synchronize_srcu_expedited 836 srcu_read_unlock synchronize_srcu_expedited
837 srcu_dereference
834 838
835SRCU: Initialization/cleanup 839SRCU: Initialization/cleanup
836 init_srcu_struct 840 init_srcu_struct
837 cleanup_srcu_struct 841 cleanup_srcu_struct
838 842
843All: lockdep-checked RCU-protected pointer access
844
845 rcu_dereference_check
846 rcu_dereference_protected
847 rcu_access_pointer
848
839See the comment headers in the source code (or the docbook generated 849See the comment headers in the source code (or the docbook generated
840from them) for more information. 850from them) for more information.
841 851
diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt
index 34614b4c708e..e9dab41c0fe0 100644
--- a/Documentation/Smack.txt
+++ b/Documentation/Smack.txt
@@ -73,7 +73,7 @@ NOTE: Smack labels are limited to 23 characters. The attr command
73If you don't do anything special all users will get the floor ("_") 73If you don't do anything special all users will get the floor ("_")
74label when they log in. If you do want to log in via the hacked ssh 74label when they log in. If you do want to log in via the hacked ssh
75at other labels use the attr command to set the smack value on the 75at other labels use the attr command to set the smack value on the
76home directory and it's contents. 76home directory and its contents.
77 77
78You can add access rules in /etc/smack/accesses. They take the form: 78You can add access rules in /etc/smack/accesses. They take the form:
79 79
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist
index 1053a56be3b1..da0382daa395 100644
--- a/Documentation/SubmitChecklist
+++ b/Documentation/SubmitChecklist
@@ -9,10 +9,16 @@ Documentation/SubmittingPatches and elsewhere regarding submitting Linux
9kernel patches. 9kernel patches.
10 10
11 11
121: Builds cleanly with applicable or modified CONFIG options =y, =m, and 121: If you use a facility then #include the file that defines/declares
13 that facility. Don't depend on other header files pulling in ones
14 that you use.
15
162: Builds cleanly with applicable or modified CONFIG options =y, =m, and
13 =n. No gcc warnings/errors, no linker warnings/errors. 17 =n. No gcc warnings/errors, no linker warnings/errors.
14 18
152: Passes allnoconfig, allmodconfig 192b: Passes allnoconfig, allmodconfig
20
212c: Builds successfully when using O=builddir
16 22
173: Builds on multiple CPU architectures by using local cross-compile tools 233: Builds on multiple CPU architectures by using local cross-compile tools
18 or some other build farm. 24 or some other build farm.
@@ -91,3 +97,13 @@ kernel patches.
91 97
9225: If any ioctl's are added by the patch, then also update 9825: If any ioctl's are added by the patch, then also update
93 Documentation/ioctl/ioctl-number.txt. 99 Documentation/ioctl/ioctl-number.txt.
100
10126: If your modified source code depends on or uses any of the kernel
102 APIs or features that are related to the following kconfig symbols,
103 then test multiple builds with the related kconfig symbols disabled
104 and/or =m (if that option is available) [not all of these at the
105 same time, just various/random combinations of them]:
106
107 CONFIG_SMP, CONFIG_SYSFS, CONFIG_PROC_FS, CONFIG_INPUT, CONFIG_PCI,
108 CONFIG_BLOCK, CONFIG_PM, CONFIG_HOTPLUG, CONFIG_MAGIC_SYSRQ,
109 CONFIG_NET, CONFIG_INET=n (but latter with CONFIG_NET=y)
diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers
index 99e72a81fa2f..4947fd8fb182 100644
--- a/Documentation/SubmittingDrivers
+++ b/Documentation/SubmittingDrivers
@@ -130,6 +130,8 @@ Linux kernel master tree:
130 ftp.??.kernel.org:/pub/linux/kernel/... 130 ftp.??.kernel.org:/pub/linux/kernel/...
131 ?? == your country code, such as "us", "uk", "fr", etc. 131 ?? == your country code, such as "us", "uk", "fr", etc.
132 132
133 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git
134
133Linux kernel mailing list: 135Linux kernel mailing list:
134 linux-kernel@vger.kernel.org 136 linux-kernel@vger.kernel.org
135 [mail majordomo@vger.kernel.org to subscribe] 137 [mail majordomo@vger.kernel.org to subscribe]
@@ -160,3 +162,6 @@ How to NOT write kernel driver by Arjan van de Ven:
160 162
161Kernel Janitor: 163Kernel Janitor:
162 http://janitor.kernelnewbies.org/ 164 http://janitor.kernelnewbies.org/
165
166GIT, Fast Version Control System:
167 http://git-scm.com/
diff --git a/Documentation/arm/00-INDEX b/Documentation/arm/00-INDEX
index 82e418d648d0..7f5fc3ba9c91 100644
--- a/Documentation/arm/00-INDEX
+++ b/Documentation/arm/00-INDEX
@@ -20,6 +20,8 @@ Samsung-S3C24XX
20 - S3C24XX ARM Linux Overview 20 - S3C24XX ARM Linux Overview
21Sharp-LH 21Sharp-LH
22 - Linux on Sharp LH79524 and LH7A40X System On a Chip (SOC) 22 - Linux on Sharp LH79524 and LH7A40X System On a Chip (SOC)
23SPEAr
24 - ST SPEAr platform Linux Overview
23VFP/ 25VFP/
24 - Release notes for Linux Kernel Vector Floating Point support code 26 - Release notes for Linux Kernel Vector Floating Point support code
25empeg/ 27empeg/
diff --git a/Documentation/arm/SA1100/ADSBitsy b/Documentation/arm/SA1100/ADSBitsy
index 7197a9e958ee..f9f62e8c0719 100644
--- a/Documentation/arm/SA1100/ADSBitsy
+++ b/Documentation/arm/SA1100/ADSBitsy
@@ -32,7 +32,7 @@ Notes:
32 32
33- The flash on board is divided into 3 partitions. 33- The flash on board is divided into 3 partitions.
34 You should be careful to use flash on board. 34 You should be careful to use flash on board.
35 It's partition is different from GraphicsClient Plus and GraphicsMaster 35 Its partition is different from GraphicsClient Plus and GraphicsMaster
36 36
37- 16bpp mode requires a different cable than what ships with the board. 37- 16bpp mode requires a different cable than what ships with the board.
38 Contact ADS or look through the manual to wire your own. Currently, 38 Contact ADS or look through the manual to wire your own. Currently,
diff --git a/Documentation/arm/SPEAr/overview.txt b/Documentation/arm/SPEAr/overview.txt
new file mode 100644
index 000000000000..253a35c6f782
--- /dev/null
+++ b/Documentation/arm/SPEAr/overview.txt
@@ -0,0 +1,60 @@
1 SPEAr ARM Linux Overview
2 ==========================
3
4Introduction
5------------
6
7 SPEAr (Structured Processor Enhanced Architecture).
8 weblink : http://www.st.com/spear
9
10 The ST Microelectronics SPEAr range of ARM9/CortexA9 System-on-Chip CPUs are
11 supported by the 'spear' platform of ARM Linux. Currently SPEAr300,
12 SPEAr310, SPEAr320 and SPEAr600 SOCs are supported. Support for the SPEAr13XX
13 series is in progress.
14
15 Hierarchy in SPEAr is as follows:
16
17 SPEAr (Platform)
18 - SPEAr3XX (3XX SOC series, based on ARM9)
19 - SPEAr300 (SOC)
20 - SPEAr300_EVB (Evaluation Board)
21 - SPEAr310 (SOC)
22 - SPEAr310_EVB (Evaluation Board)
23 - SPEAr320 (SOC)
24 - SPEAr320_EVB (Evaluation Board)
25 - SPEAr6XX (6XX SOC series, based on ARM9)
26 - SPEAr600 (SOC)
27 - SPEAr600_EVB (Evaluation Board)
28 - SPEAr13XX (13XX SOC series, based on ARM CORTEXA9)
29 - SPEAr1300 (SOC)
30
31 Configuration
32 -------------
33
34 A generic configuration is provided for each machine, and can be used as the
35 default by
36 make spear600_defconfig
37 make spear300_defconfig
38 make spear310_defconfig
39 make spear320_defconfig
40
41 Layout
42 ------
43
44 The common files for multiple machine families (SPEAr3XX, SPEAr6XX and
45 SPEAr13XX) are located in the platform code contained in arch/arm/plat-spear
46 with headers in plat/.
47
48 Each machine series have a directory with name arch/arm/mach-spear followed by
49 series name. Like mach-spear3xx, mach-spear6xx and mach-spear13xx.
50
51 Common file for machines of spear3xx family is mach-spear3xx/spear3xx.c and for
52 spear6xx is mach-spear6xx/spear6xx.c. mach-spear* also contain soc/machine
53 specific files, like spear300.c, spear310.c, spear320.c and spear600.c.
54 mach-spear* also contains board specific files for each machine type.
55
56
57 Document Author
58 ---------------
59
60 Viresh Kumar, (c) 2010 ST Microelectronics
diff --git a/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt b/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt
index 76b3a11e90be..fa968aa99d67 100644
--- a/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt
+++ b/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt
@@ -14,8 +14,8 @@ Introduction
14 how the clocks are arranged. The first implementation used as single 14 how the clocks are arranged. The first implementation used as single
15 PLL to feed the ARM, memory and peripherals via a series of dividers 15 PLL to feed the ARM, memory and peripherals via a series of dividers
16 and muxes and this is the implementation that is documented here. A 16 and muxes and this is the implementation that is documented here. A
17 newer version where there is a seperate PLL and clock divider for the 17 newer version where there is a separate PLL and clock divider for the
18 ARM core is available as a seperate driver. 18 ARM core is available as a separate driver.
19 19
20 20
21Layout 21Layout
diff --git a/Documentation/arm/Samsung/Overview.txt b/Documentation/arm/Samsung/Overview.txt
new file mode 100644
index 000000000000..7cced1fea9c3
--- /dev/null
+++ b/Documentation/arm/Samsung/Overview.txt
@@ -0,0 +1,86 @@
1 Samsung ARM Linux Overview
2 ==========================
3
4Introduction
5------------
6
7 The Samsung range of ARM SoCs spans many similar devices, from the initial
8 ARM9 through to the newest ARM cores. This document shows an overview of
9 the current kernel support, how to use it and where to find the code
10 that supports this.
11
12 The currently supported SoCs are:
13
14 - S3C24XX: See Documentation/arm/Samsung-S3C24XX/Overview.txt for full list
15 - S3C64XX: S3C6400 and S3C6410
16 - S5PC6440
17
18 S5PC100 and S5PC110 support is currently being merged
19
20
21S3C24XX Systems
22---------------
23
24 There is still documentation in Documnetation/arm/Samsung-S3C24XX/ which
25 deals with the architecture and drivers specific to these devices.
26
27 See Documentation/arm/Samsung-S3C24XX/Overview.txt for more information
28 on the implementation details and specific support.
29
30
31Configuration
32-------------
33
34 A number of configurations are supplied, as there is no current way of
35 unifying all the SoCs into one kernel.
36
37 s5p6440_defconfig - S5P6440 specific default configuration
38 s5pc100_defconfig - S5PC100 specific default configuration
39
40
41Layout
42------
43
44 The directory layout is currently being restructured, and consists of
45 several platform directories and then the machine specific directories
46 of the CPUs being built for.
47
48 plat-samsung provides the base for all the implementations, and is the
49 last in the line of include directories that are processed for the build
50 specific information. It contains the base clock, GPIO and device definitions
51 to get the system running.
52
53 plat-s3c is the s3c24xx/s3c64xx platform directory, although it is currently
54 involved in other builds this will be phased out once the relevant code is
55 moved elsewhere.
56
57 plat-s3c24xx is for s3c24xx specific builds, see the S3C24XX docs.
58
59 plat-s3c64xx is for the s3c64xx specific bits, see the S3C24XX docs.
60
61 plat-s5p is for s5p specific builds, more to be added.
62
63
64 [ to finish ]
65
66
67Port Contributors
68-----------------
69
70 Ben Dooks (BJD)
71 Vincent Sanders
72 Herbert Potzl
73 Arnaud Patard (RTP)
74 Roc Wu
75 Klaus Fetscher
76 Dimitry Andric
77 Shannon Holland
78 Guillaume Gourat (NexVision)
79 Christer Weinigel (wingel) (Acer N30)
80 Lucas Correia Villa Real (S3C2400 port)
81
82
83Document Author
84---------------
85
86Copyright 2009-2010 Ben Dooks <ben-linux@fluff.org>
diff --git a/Documentation/arm/Samsung/clksrc-change-registers.awk b/Documentation/arm/Samsung/clksrc-change-registers.awk
new file mode 100755
index 000000000000..0c50220851fb
--- /dev/null
+++ b/Documentation/arm/Samsung/clksrc-change-registers.awk
@@ -0,0 +1,167 @@
1#!/usr/bin/awk -f
2#
3# Copyright 2010 Ben Dooks <ben-linux@fluff.org>
4#
5# Released under GPLv2
6
7# example usage
8# ./clksrc-change-registers.awk arch/arm/plat-s5pc1xx/include/plat/regs-clock.h < src > dst
9
10function extract_value(s)
11{
12 eqat = index(s, "=")
13 comat = index(s, ",")
14 return substr(s, eqat+2, (comat-eqat)-2)
15}
16
17function remove_brackets(b)
18{
19 return substr(b, 2, length(b)-2)
20}
21
22function splitdefine(l, p)
23{
24 r = split(l, tp)
25
26 p[0] = tp[2]
27 p[1] = remove_brackets(tp[3])
28}
29
30function find_length(f)
31{
32 if (0)
33 printf "find_length " f "\n" > "/dev/stderr"
34
35 if (f ~ /0x1/)
36 return 1
37 else if (f ~ /0x3/)
38 return 2
39 else if (f ~ /0x7/)
40 return 3
41 else if (f ~ /0xf/)
42 return 4
43
44 printf "unknown legnth " f "\n" > "/dev/stderr"
45 exit
46}
47
48function find_shift(s)
49{
50 id = index(s, "<")
51 if (id <= 0) {
52 printf "cannot find shift " s "\n" > "/dev/stderr"
53 exit
54 }
55
56 return substr(s, id+2)
57}
58
59
60BEGIN {
61 if (ARGC < 2) {
62 print "too few arguments" > "/dev/stderr"
63 exit
64 }
65
66# read the header file and find the mask values that we will need
67# to replace and create an associative array of values
68
69 while (getline line < ARGV[1] > 0) {
70 if (line ~ /\#define.*_MASK/ &&
71 !(line ~ /S5PC100_EPLL_MASK/) &&
72 !(line ~ /USB_SIG_MASK/)) {
73 splitdefine(line, fields)
74 name = fields[0]
75 if (0)
76 printf "MASK " line "\n" > "/dev/stderr"
77 dmask[name,0] = find_length(fields[1])
78 dmask[name,1] = find_shift(fields[1])
79 if (0)
80 printf "=> '" name "' LENGTH=" dmask[name,0] " SHIFT=" dmask[name,1] "\n" > "/dev/stderr"
81 } else {
82 }
83 }
84
85 delete ARGV[1]
86}
87
88/clksrc_clk.*=.*{/ {
89 shift=""
90 mask=""
91 divshift=""
92 reg_div=""
93 reg_src=""
94 indent=1
95
96 print $0
97
98 for(; indent >= 1;) {
99 if ((getline line) <= 0) {
100 printf "unexpected end of file" > "/dev/stderr"
101 exit 1;
102 }
103
104 if (line ~ /\.shift/) {
105 shift = extract_value(line)
106 } else if (line ~ /\.mask/) {
107 mask = extract_value(line)
108 } else if (line ~ /\.reg_divider/) {
109 reg_div = extract_value(line)
110 } else if (line ~ /\.reg_source/) {
111 reg_src = extract_value(line)
112 } else if (line ~ /\.divider_shift/) {
113 divshift = extract_value(line)
114 } else if (line ~ /{/) {
115 indent++
116 print line
117 } else if (line ~ /}/) {
118 indent--
119
120 if (indent == 0) {
121 if (0) {
122 printf "shift '" shift "' ='" dmask[shift,0] "'\n" > "/dev/stderr"
123 printf "mask '" mask "'\n" > "/dev/stderr"
124 printf "dshft '" divshift "'\n" > "/dev/stderr"
125 printf "rdiv '" reg_div "'\n" > "/dev/stderr"
126 printf "rsrc '" reg_src "'\n" > "/dev/stderr"
127 }
128
129 generated = mask
130 sub(reg_src, reg_div, generated)
131
132 if (0) {
133 printf "/* rsrc " reg_src " */\n"
134 printf "/* rdiv " reg_div " */\n"
135 printf "/* shift " shift " */\n"
136 printf "/* mask " mask " */\n"
137 printf "/* generated " generated " */\n"
138 }
139
140 if (reg_div != "") {
141 printf "\t.reg_div = { "
142 printf ".reg = " reg_div ", "
143 printf ".shift = " dmask[generated,1] ", "
144 printf ".size = " dmask[generated,0] ", "
145 printf "},\n"
146 }
147
148 printf "\t.reg_src = { "
149 printf ".reg = " reg_src ", "
150 printf ".shift = " dmask[mask,1] ", "
151 printf ".size = " dmask[mask,0] ", "
152
153 printf "},\n"
154
155 }
156
157 print line
158 } else {
159 print line
160 }
161
162 if (0)
163 printf indent ":" line "\n" > "/dev/stderr"
164 }
165}
166
167// && ! /clksrc_clk.*=.*{/ { print $0 }
diff --git a/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen b/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen
index 1e6a23fdf2fc..dc460f055647 100644
--- a/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen
+++ b/Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen
@@ -7,7 +7,7 @@ The driver only implements a four-wire touch panel protocol.
7 7
8The touchscreen driver is maintenance free except for the pen-down or 8The touchscreen driver is maintenance free except for the pen-down or
9touch threshold. Some resistive displays and board combinations may 9touch threshold. Some resistive displays and board combinations may
10require tuning of this threshold. The driver exposes some of it's 10require tuning of this threshold. The driver exposes some of its
11internal state in the sys filesystem. If the kernel is configured 11internal state in the sys filesystem. If the kernel is configured
12with it, CONFIG_SYSFS, and sysfs is mounted at /sys, there will be a 12with it, CONFIG_SYSFS, and sysfs is mounted at /sys, there will be a
13directory 13directory
diff --git a/Documentation/arm/memory.txt b/Documentation/arm/memory.txt
index 9d58c7c5eddd..eb0fae18ffb1 100644
--- a/Documentation/arm/memory.txt
+++ b/Documentation/arm/memory.txt
@@ -59,7 +59,11 @@ PAGE_OFFSET high_memory-1 Kernel direct-mapped RAM region.
59 This maps the platforms RAM, and typically 59 This maps the platforms RAM, and typically
60 maps all platform RAM in a 1:1 relationship. 60 maps all platform RAM in a 1:1 relationship.
61 61
62TASK_SIZE PAGE_OFFSET-1 Kernel module space 62PKMAP_BASE PAGE_OFFSET-1 Permanent kernel mappings
63 One way of mapping HIGHMEM pages into kernel
64 space.
65
66MODULES_VADDR MODULES_END-1 Kernel module space
63 Kernel modules inserted via insmod are 67 Kernel modules inserted via insmod are
64 placed here using dynamic mappings. 68 placed here using dynamic mappings.
65 69
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index 396bec3b74ed..ac4d47187122 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -320,7 +320,7 @@ counter decrement would not become globally visible until the
320obj->active update does. 320obj->active update does.
321 321
322As a historical note, 32-bit Sparc used to only allow usage of 322As a historical note, 32-bit Sparc used to only allow usage of
32324-bits of it's atomic_t type. This was because it used 8 bits 32324-bits of its atomic_t type. This was because it used 8 bits
324as a spinlock for SMP safety. Sparc32 lacked a "compare and swap" 324as a spinlock for SMP safety. Sparc32 lacked a "compare and swap"
325type instruction. However, 32-bit Sparc has since been moved over 325type instruction. However, 32-bit Sparc has since been moved over
326to a "hash table of spinlocks" scheme, that allows the full 32-bit 326to a "hash table of spinlocks" scheme, that allows the full 32-bit
diff --git a/Documentation/blackfin/bfin-gpio-notes.txt b/Documentation/blackfin/bfin-gpio-notes.txt
index 9898c7ded7d3..f731c1e56475 100644
--- a/Documentation/blackfin/bfin-gpio-notes.txt
+++ b/Documentation/blackfin/bfin-gpio-notes.txt
@@ -43,7 +43,7 @@
43 void bfin_gpio_irq_free(unsigned gpio); 43 void bfin_gpio_irq_free(unsigned gpio);
44 44
45 The request functions will record the function state for a certain pin, 45 The request functions will record the function state for a certain pin,
46 the free functions will clear it's function state. 46 the free functions will clear its function state.
47 Once a pin is requested, it can't be requested again before it is freed by 47 Once a pin is requested, it can't be requested again before it is freed by
48 previous caller, otherwise kernel will dump stacks, and the request 48 previous caller, otherwise kernel will dump stacks, and the request
49 function fail. 49 function fail.
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 6fab97ea7e6b..508b5b2b0289 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -1162,8 +1162,8 @@ where a driver received a request ala this before:
1162 1162
1163As mentioned, there is no virtual mapping of a bio. For DMA, this is 1163As mentioned, there is no virtual mapping of a bio. For DMA, this is
1164not a problem as the driver probably never will need a virtual mapping. 1164not a problem as the driver probably never will need a virtual mapping.
1165Instead it needs a bus mapping (pci_map_page for a single segment or 1165Instead it needs a bus mapping (dma_map_page for a single segment or
1166use blk_rq_map_sg for scatter gather) to be able to ship it to the driver. For 1166use dma_map_sg for scatter gather) to be able to ship it to the driver. For
1167PIO drivers (or drivers that need to revert to PIO transfer once in a 1167PIO drivers (or drivers that need to revert to PIO transfer once in a
1168while (IDE for example)), where the CPU is doing the actual data 1168while (IDE for example)), where the CPU is doing the actual data
1169transfer a virtual mapping is needed. If the driver supports highmem I/O, 1169transfer a virtual mapping is needed. If the driver supports highmem I/O,
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index e164403f60e1..f65274081c8d 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -25,11 +25,11 @@ size allowed by the hardware.
25 25
26nomerges (RW) 26nomerges (RW)
27------------- 27-------------
28This enables the user to disable the lookup logic involved with IO merging 28This enables the user to disable the lookup logic involved with IO
29requests in the block layer. Merging may still occur through a direct 29merging requests in the block layer. By default (0) all merges are
301-hit cache, since that comes for (almost) free. The IO scheduler will not 30enabled. When set to 1 only simple one-hit merges will be tried. When
31waste cycles doing tree/hash lookups for merges if nomerges is 1. Defaults 31set to 2 no merge algorithms will be tried (including one-hit or more
32to 0, enabling all merges. 32complex tree/hash lookups).
33 33
34nr_requests (RW) 34nr_requests (RW)
35---------------- 35----------------
diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt
index da42ab414c48..9164ae3b83bc 100644
--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -5,7 +5,7 @@
5 5
6This document describes the cache/tlb flushing interfaces called 6This document describes the cache/tlb flushing interfaces called
7by the Linux VM subsystem. It enumerates over each interface, 7by the Linux VM subsystem. It enumerates over each interface,
8describes it's intended purpose, and what side effect is expected 8describes its intended purpose, and what side effect is expected
9after the interface is invoked. 9after the interface is invoked.
10 10
11The side effects described below are stated for a uniprocessor 11The side effects described below are stated for a uniprocessor
@@ -88,12 +88,12 @@ changes occur:
88 This is used primarily during fault processing. 88 This is used primarily during fault processing.
89 89
905) void update_mmu_cache(struct vm_area_struct *vma, 905) void update_mmu_cache(struct vm_area_struct *vma,
91 unsigned long address, pte_t pte) 91 unsigned long address, pte_t *ptep)
92 92
93 At the end of every page fault, this routine is invoked to 93 At the end of every page fault, this routine is invoked to
94 tell the architecture specific code that a translation 94 tell the architecture specific code that a translation
95 described by "pte" now exists at virtual address "address" 95 now exists at virtual address "address" for address space
96 for address space "vma->vm_mm", in the software page tables. 96 "vma->vm_mm", in the software page tables.
97 97
98 A port may use this information in any way it so chooses. 98 A port may use this information in any way it so chooses.
99 For example, it could use this event to pre-load TLB 99 For example, it could use this event to pre-load TLB
@@ -231,7 +231,7 @@ require a whole different set of interfaces to handle properly.
231The biggest problem is that of virtual aliasing in the data cache 231The biggest problem is that of virtual aliasing in the data cache
232of a processor. 232of a processor.
233 233
234Is your port susceptible to virtual aliasing in it's D-cache? 234Is your port susceptible to virtual aliasing in its D-cache?
235Well, if your D-cache is virtually indexed, is larger in size than 235Well, if your D-cache is virtually indexed, is larger in size than
236PAGE_SIZE, and does not prevent multiple cache lines for the same 236PAGE_SIZE, and does not prevent multiple cache lines for the same
237physical address from existing at once, you have this problem. 237physical address from existing at once, you have this problem.
@@ -249,7 +249,7 @@ one way to solve this (in particular SPARC_FLAG_MMAPSHARED).
249Next, you have to solve the D-cache aliasing issue for all 249Next, you have to solve the D-cache aliasing issue for all
250other cases. Please keep in mind that fact that, for a given page 250other cases. Please keep in mind that fact that, for a given page
251mapped into some user address space, there is always at least one more 251mapped into some user address space, there is always at least one more
252mapping, that of the kernel in it's linear mapping starting at 252mapping, that of the kernel in its linear mapping starting at
253PAGE_OFFSET. So immediately, once the first user maps a given 253PAGE_OFFSET. So immediately, once the first user maps a given
254physical page into its address space, by implication the D-cache 254physical page into its address space, by implication the D-cache
255aliasing problem has the potential to exist since the kernel already 255aliasing problem has the potential to exist since the kernel already
@@ -377,3 +377,27 @@ maps this page at its virtual address.
377 All the functionality of flush_icache_page can be implemented in 377 All the functionality of flush_icache_page can be implemented in
378 flush_dcache_page and update_mmu_cache. In 2.7 the hope is to 378 flush_dcache_page and update_mmu_cache. In 2.7 the hope is to
379 remove this interface completely. 379 remove this interface completely.
380
381The final category of APIs is for I/O to deliberately aliased address
382ranges inside the kernel. Such aliases are set up by use of the
383vmap/vmalloc API. Since kernel I/O goes via physical pages, the I/O
384subsystem assumes that the user mapping and kernel offset mapping are
385the only aliases. This isn't true for vmap aliases, so anything in
386the kernel trying to do I/O to vmap areas must manually manage
387coherency. It must do this by flushing the vmap range before doing
388I/O and invalidating it after the I/O returns.
389
390 void flush_kernel_vmap_range(void *vaddr, int size)
391 flushes the kernel cache for a given virtual address range in
392 the vmap area. This is to make sure that any data the kernel
393 modified in the vmap range is made visible to the physical
394 page. The design is to make this area safe to perform I/O on.
395 Note that this API does *not* also flush the offset map alias
396 of the area.
397
398 void invalidate_kernel_vmap_range(void *vaddr, int size) invalidates
399 the cache for a given virtual address range in the vmap area
400 which prevents the processor from making the cache stale by
401 speculatively reading data while the I/O was occurring to the
402 physical pages. This is only necessary for data reads into the
403 vmap area.
diff --git a/Documentation/cdrom/ide-cd b/Documentation/cdrom/ide-cd
index 2c558cd6c1ef..f4dc9de2694e 100644
--- a/Documentation/cdrom/ide-cd
+++ b/Documentation/cdrom/ide-cd
@@ -159,42 +159,7 @@ two arguments: the CDROM device, and the slot number to which you wish
159to change. If the slot number is -1, the drive is unloaded. 159to change. If the slot number is -1, the drive is unloaded.
160 160
161 161
1624. Compilation options 1624. Common problems
163----------------------
164
165There are a few additional options which can be set when compiling the
166driver. Most people should not need to mess with any of these; they
167are listed here simply for completeness. A compilation option can be
168enabled by adding a line of the form `#define <option> 1' to the top
169of ide-cd.c. All these options are disabled by default.
170
171VERBOSE_IDE_CD_ERRORS
172 If this is set, ATAPI error codes will be translated into textual
173 descriptions. In addition, a dump is made of the command which
174 provoked the error. This is off by default to save the memory used
175 by the (somewhat long) table of error descriptions.
176
177STANDARD_ATAPI
178 If this is set, the code needed to deal with certain drives which do
179 not properly implement the ATAPI spec will be disabled. If you know
180 your drive implements ATAPI properly, you can turn this on to get a
181 slightly smaller kernel.
182
183NO_DOOR_LOCKING
184 If this is set, the driver will never attempt to lock the door of
185 the drive.
186
187CDROM_NBLOCKS_BUFFER
188 This sets the size of the buffer to be used for a CDROMREADAUDIO
189 ioctl. The default is 8.
190
191TEST
192 This currently enables an additional ioctl which enables a user-mode
193 program to execute an arbitrary packet command. See the source for
194 details. This should be left off unless you know what you're doing.
195
196
1975. Common problems
198------------------ 163------------------
199 164
200This section discusses some common problems encountered when trying to 165This section discusses some common problems encountered when trying to
@@ -371,7 +336,7 @@ f. Data corruption.
371 expense of low system performance. 336 expense of low system performance.
372 337
373 338
3746. cdchange.c 3395. cdchange.c
375------------- 340-------------
376 341
377/* 342/*
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 630879cd9a42..48e0b21b0059 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -17,6 +17,9 @@ HOWTO
17You can do a very simple testing of running two dd threads in two different 17You can do a very simple testing of running two dd threads in two different
18cgroups. Here is what you can do. 18cgroups. Here is what you can do.
19 19
20- Enable Block IO controller
21 CONFIG_BLK_CGROUP=y
22
20- Enable group scheduling in CFQ 23- Enable group scheduling in CFQ
21 CONFIG_CFQ_GROUP_IOSCHED=y 24 CONFIG_CFQ_GROUP_IOSCHED=y
22 25
@@ -54,32 +57,52 @@ cgroups. Here is what you can do.
54 57
55Various user visible config options 58Various user visible config options
56=================================== 59===================================
57CONFIG_CFQ_GROUP_IOSCHED
58 - Enables group scheduling in CFQ. Currently only 1 level of group
59 creation is allowed.
60
61CONFIG_DEBUG_CFQ_IOSCHED
62 - Enables some debugging messages in blktrace. Also creates extra
63 cgroup file blkio.dequeue.
64
65Config options selected automatically
66=====================================
67These config options are not user visible and are selected/deselected
68automatically based on IO scheduler configuration.
69
70CONFIG_BLK_CGROUP 60CONFIG_BLK_CGROUP
71 - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED. 61 - Block IO controller.
72 62
73CONFIG_DEBUG_BLK_CGROUP 63CONFIG_DEBUG_BLK_CGROUP
74 - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED. 64 - Debug help. Right now some additional stats file show up in cgroup
65 if this option is enabled.
66
67CONFIG_CFQ_GROUP_IOSCHED
68 - Enables group scheduling in CFQ. Currently only 1 level of group
69 creation is allowed.
75 70
76Details of cgroup files 71Details of cgroup files
77======================= 72=======================
78- blkio.weight 73- blkio.weight
79 - Specifies per cgroup weight. 74 - Specifies per cgroup weight. This is default weight of the group
80 75 on all the devices until and unless overridden by per device rule.
76 (See blkio.weight_device).
81 Currently allowed range of weights is from 100 to 1000. 77 Currently allowed range of weights is from 100 to 1000.
82 78
79- blkio.weight_device
80 - One can specify per cgroup per device rules using this interface.
81 These rules override the default value of group weight as specified
82 by blkio.weight.
83
84 Following is the format.
85
86 #echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device
87 Configure weight=300 on /dev/sdb (8:16) in this cgroup
88 # echo 8:16 300 > blkio.weight_device
89 # cat blkio.weight_device
90 dev weight
91 8:16 300
92
93 Configure weight=500 on /dev/sda (8:0) in this cgroup
94 # echo 8:0 500 > blkio.weight_device
95 # cat blkio.weight_device
96 dev weight
97 8:0 500
98 8:16 300
99
100 Remove specific weight for /dev/sda in this cgroup
101 # echo 8:0 0 > blkio.weight_device
102 # cat blkio.weight_device
103 dev weight
104 8:16 300
105
83- blkio.time 106- blkio.time
84 - disk time allocated to cgroup per device in milliseconds. First 107 - disk time allocated to cgroup per device in milliseconds. First
85 two fields specify the major and minor number of the device and 108 two fields specify the major and minor number of the device and
@@ -92,13 +115,105 @@ Details of cgroup files
92 third field specifies the number of sectors transferred by the 115 third field specifies the number of sectors transferred by the
93 group to/from the device. 116 group to/from the device.
94 117
118- blkio.io_service_bytes
119 - Number of bytes transferred to/from the disk by the group. These
120 are further divided by the type of operation - read or write, sync
121 or async. First two fields specify the major and minor number of the
122 device, third field specifies the operation type and the fourth field
123 specifies the number of bytes.
124
125- blkio.io_serviced
126 - Number of IOs completed to/from the disk by the group. These
127 are further divided by the type of operation - read or write, sync
128 or async. First two fields specify the major and minor number of the
129 device, third field specifies the operation type and the fourth field
130 specifies the number of IOs.
131
132- blkio.io_service_time
133 - Total amount of time between request dispatch and request completion
134 for the IOs done by this cgroup. This is in nanoseconds to make it
135 meaningful for flash devices too. For devices with queue depth of 1,
136 this time represents the actual service time. When queue_depth > 1,
137 that is no longer true as requests may be served out of order. This
138 may cause the service time for a given IO to include the service time
139 of multiple IOs when served out of order which may result in total
140 io_service_time > actual time elapsed. This time is further divided by
141 the type of operation - read or write, sync or async. First two fields
142 specify the major and minor number of the device, third field
143 specifies the operation type and the fourth field specifies the
144 io_service_time in ns.
145
146- blkio.io_wait_time
147 - Total amount of time the IOs for this cgroup spent waiting in the
148 scheduler queues for service. This can be greater than the total time
149 elapsed since it is cumulative io_wait_time for all IOs. It is not a
150 measure of total time the cgroup spent waiting but rather a measure of
151 the wait_time for its individual IOs. For devices with queue_depth > 1
152 this metric does not include the time spent waiting for service once
153 the IO is dispatched to the device but till it actually gets serviced
154 (there might be a time lag here due to re-ordering of requests by the
155 device). This is in nanoseconds to make it meaningful for flash
156 devices too. This time is further divided by the type of operation -
157 read or write, sync or async. First two fields specify the major and
158 minor number of the device, third field specifies the operation type
159 and the fourth field specifies the io_wait_time in ns.
160
161- blkio.io_merged
162 - Total number of bios/requests merged into requests belonging to this
163 cgroup. This is further divided by the type of operation - read or
164 write, sync or async.
165
166- blkio.io_queued
167 - Total number of requests queued up at any given instant for this
168 cgroup. This is further divided by the type of operation - read or
169 write, sync or async.
170
171- blkio.avg_queue_size
172 - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
173 The average queue size for this cgroup over the entire time of this
174 cgroup's existence. Queue size samples are taken each time one of the
175 queues of this cgroup gets a timeslice.
176
177- blkio.group_wait_time
178 - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
179 This is the amount of time the cgroup had to wait since it became busy
180 (i.e., went from 0 to 1 request queued) to get a timeslice for one of
181 its queues. This is different from the io_wait_time which is the
182 cumulative total of the amount of time spent by each IO in that cgroup
183 waiting in the scheduler queue. This is in nanoseconds. If this is
184 read when the cgroup is in a waiting (for timeslice) state, the stat
185 will only report the group_wait_time accumulated till the last time it
186 got a timeslice and will not include the current delta.
187
188- blkio.empty_time
189 - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
190 This is the amount of time a cgroup spends without any pending
191 requests when not being served, i.e., it does not include any time
192 spent idling for one of the queues of the cgroup. This is in
193 nanoseconds. If this is read when the cgroup is in an empty state,
194 the stat will only report the empty_time accumulated till the last
195 time it had a pending request and will not include the current delta.
196
197- blkio.idle_time
198 - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
199 This is the amount of time spent by the IO scheduler idling for a
200 given cgroup in anticipation of a better request than the exising ones
201 from other queues/cgroups. This is in nanoseconds. If this is read
202 when the cgroup is in an idling state, the stat will only report the
203 idle_time accumulated till the last idle period and will not include
204 the current delta.
205
95- blkio.dequeue 206- blkio.dequeue
96 - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This 207 - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
97 gives the statistics about how many a times a group was dequeued 208 gives the statistics about how many a times a group was dequeued
98 from service tree of the device. First two fields specify the major 209 from service tree of the device. First two fields specify the major
99 and minor number of the device and third field specifies the number 210 and minor number of the device and third field specifies the number
100 of times a group was dequeued from a particular device. 211 of times a group was dequeued from a particular device.
101 212
213- blkio.reset_stats
214 - Writing an int to this file will result in resetting all the stats
215 for that cgroup.
216
102CFQ sysfs tunable 217CFQ sysfs tunable
103================= 218=================
104/sys/block/<disk>/queue/iosched/group_isolation 219/sys/block/<disk>/queue/iosched/group_isolation
diff --git a/Documentation/cgroups/cgroup_event_listener.c b/Documentation/cgroups/cgroup_event_listener.c
new file mode 100644
index 000000000000..8c2bfc4a6358
--- /dev/null
+++ b/Documentation/cgroups/cgroup_event_listener.c
@@ -0,0 +1,110 @@
1/*
2 * cgroup_event_listener.c - Simple listener of cgroup events
3 *
4 * Copyright (C) Kirill A. Shutemov <kirill@shutemov.name>
5 */
6
7#include <assert.h>
8#include <errno.h>
9#include <fcntl.h>
10#include <libgen.h>
11#include <limits.h>
12#include <stdio.h>
13#include <string.h>
14#include <unistd.h>
15
16#include <sys/eventfd.h>
17
18#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>\n"
19
20int main(int argc, char **argv)
21{
22 int efd = -1;
23 int cfd = -1;
24 int event_control = -1;
25 char event_control_path[PATH_MAX];
26 char line[LINE_MAX];
27 int ret;
28
29 if (argc != 3) {
30 fputs(USAGE_STR, stderr);
31 return 1;
32 }
33
34 cfd = open(argv[1], O_RDONLY);
35 if (cfd == -1) {
36 fprintf(stderr, "Cannot open %s: %s\n", argv[1],
37 strerror(errno));
38 goto out;
39 }
40
41 ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control",
42 dirname(argv[1]));
43 if (ret >= PATH_MAX) {
44 fputs("Path to cgroup.event_control is too long\n", stderr);
45 goto out;
46 }
47
48 event_control = open(event_control_path, O_WRONLY);
49 if (event_control == -1) {
50 fprintf(stderr, "Cannot open %s: %s\n", event_control_path,
51 strerror(errno));
52 goto out;
53 }
54
55 efd = eventfd(0, 0);
56 if (efd == -1) {
57 perror("eventfd() failed");
58 goto out;
59 }
60
61 ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]);
62 if (ret >= LINE_MAX) {
63 fputs("Arguments string is too long\n", stderr);
64 goto out;
65 }
66
67 ret = write(event_control, line, strlen(line) + 1);
68 if (ret == -1) {
69 perror("Cannot write to cgroup.event_control");
70 goto out;
71 }
72
73 while (1) {
74 uint64_t result;
75
76 ret = read(efd, &result, sizeof(result));
77 if (ret == -1) {
78 if (errno == EINTR)
79 continue;
80 perror("Cannot read from eventfd");
81 break;
82 }
83 assert(ret == sizeof(result));
84
85 ret = access(event_control_path, W_OK);
86 if ((ret == -1) && (errno == ENOENT)) {
87 puts("The cgroup seems to have removed.");
88 ret = 0;
89 break;
90 }
91
92 if (ret == -1) {
93 perror("cgroup.event_control "
94 "is not accessable any more");
95 break;
96 }
97
98 printf("%s %s: crossed\n", argv[1], argv[2]);
99 }
100
101out:
102 if (efd >= 0)
103 close(efd);
104 if (event_control >= 0)
105 close(event_control);
106 if (cfd >= 0)
107 close(cfd);
108
109 return (ret != 0);
110}
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 0b33bfe7dde9..b34823ff1646 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -22,6 +22,8 @@ CONTENTS:
222. Usage Examples and Syntax 222. Usage Examples and Syntax
23 2.1 Basic Usage 23 2.1 Basic Usage
24 2.2 Attaching processes 24 2.2 Attaching processes
25 2.3 Mounting hierarchies by name
26 2.4 Notification API
253. Kernel API 273. Kernel API
26 3.1 Overview 28 3.1 Overview
27 3.2 Synchronization 29 3.2 Synchronization
@@ -233,8 +235,7 @@ containing the following files describing that cgroup:
233 - cgroup.procs: list of tgids in the cgroup. This list is not 235 - cgroup.procs: list of tgids in the cgroup. This list is not
234 guaranteed to be sorted or free of duplicate tgids, and userspace 236 guaranteed to be sorted or free of duplicate tgids, and userspace
235 should sort/uniquify the list if this property is required. 237 should sort/uniquify the list if this property is required.
236 Writing a tgid into this file moves all threads with that tgid into 238 This is a read-only file, for now.
237 this cgroup.
238 - notify_on_release flag: run the release agent on exit? 239 - notify_on_release flag: run the release agent on exit?
239 - release_agent: the path to use for release notifications (this file 240 - release_agent: the path to use for release notifications (this file
240 exists in the top cgroup only) 241 exists in the top cgroup only)
@@ -338,7 +339,7 @@ To mount a cgroup hierarchy with all available subsystems, type:
338The "xxx" is not interpreted by the cgroup code, but will appear in 339The "xxx" is not interpreted by the cgroup code, but will appear in
339/proc/mounts so may be any useful identifying string that you like. 340/proc/mounts so may be any useful identifying string that you like.
340 341
341To mount a cgroup hierarchy with just the cpuset and numtasks 342To mount a cgroup hierarchy with just the cpuset and memory
342subsystems, type: 343subsystems, type:
343# mount -t cgroup -o cpuset,memory hier1 /dev/cgroup 344# mount -t cgroup -o cpuset,memory hier1 /dev/cgroup
344 345
@@ -434,6 +435,25 @@ you give a subsystem a name.
434The name of the subsystem appears as part of the hierarchy description 435The name of the subsystem appears as part of the hierarchy description
435in /proc/mounts and /proc/<pid>/cgroups. 436in /proc/mounts and /proc/<pid>/cgroups.
436 437
4382.4 Notification API
439--------------------
440
441There is mechanism which allows to get notifications about changing
442status of a cgroup.
443
444To register new notification handler you need:
445 - create a file descriptor for event notification using eventfd(2);
446 - open a control file to be monitored (e.g. memory.usage_in_bytes);
447 - write "<event_fd> <control_fd> <args>" to cgroup.event_control.
448 Interpretation of args is defined by control file implementation;
449
450eventfd will be woken up by control file implementation or when the
451cgroup is removed.
452
453To unregister notification handler just close eventfd.
454
455NOTE: Support of notifications should be implemented for the control
456file. See documentation for the subsystem.
437 457
4383. Kernel API 4583. Kernel API
439============= 459=============
@@ -488,6 +508,11 @@ Each subsystem should:
488- add an entry in linux/cgroup_subsys.h 508- add an entry in linux/cgroup_subsys.h
489- define a cgroup_subsys object called <name>_subsys 509- define a cgroup_subsys object called <name>_subsys
490 510
511If a subsystem can be compiled as a module, it should also have in its
512module initcall a call to cgroup_load_subsys(), and in its exitcall a
513call to cgroup_unload_subsys(). It should also set its_subsys.module =
514THIS_MODULE in its .c file.
515
491Each subsystem may export the following methods. The only mandatory 516Each subsystem may export the following methods. The only mandatory
492methods are create/destroy. Any others that are null are presumed to 517methods are create/destroy. Any others that are null are presumed to
493be successful no-ops. 518be successful no-ops.
@@ -536,10 +561,21 @@ returns an error, this will abort the attach operation. If a NULL
536task is passed, then a successful result indicates that *any* 561task is passed, then a successful result indicates that *any*
537unspecified task can be moved into the cgroup. Note that this isn't 562unspecified task can be moved into the cgroup. Note that this isn't
538called on a fork. If this method returns 0 (success) then this should 563called on a fork. If this method returns 0 (success) then this should
539remain valid while the caller holds cgroup_mutex. If threadgroup is 564remain valid while the caller holds cgroup_mutex and it is ensured that either
565attach() or cancel_attach() will be called in future. If threadgroup is
540true, then a successful result indicates that all threads in the given 566true, then a successful result indicates that all threads in the given
541thread's threadgroup can be moved together. 567thread's threadgroup can be moved together.
542 568
569void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
570 struct task_struct *task, bool threadgroup)
571(cgroup_mutex held by caller)
572
573Called when a task attach operation has failed after can_attach() has succeeded.
574A subsystem whose can_attach() has some side-effects should provide this
575function, so that the subsystem can implement a rollback. If not, not necessary.
576This will be called only about subsystems whose can_attach() operation have
577succeeded.
578
543void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 579void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
544 struct cgroup *old_cgrp, struct task_struct *task, 580 struct cgroup *old_cgrp, struct task_struct *task,
545 bool threadgroup) 581 bool threadgroup)
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt
index 1d7e9784439a..51682ab2dd1a 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -42,7 +42,7 @@ Nodes to a set of tasks. In this document "Memory Node" refers to
42an on-line node that contains memory. 42an on-line node that contains memory.
43 43
44Cpusets constrain the CPU and Memory placement of tasks to only 44Cpusets constrain the CPU and Memory placement of tasks to only
45the resources within a tasks current cpuset. They form a nested 45the resources within a task's current cpuset. They form a nested
46hierarchy visible in a virtual file system. These are the essential 46hierarchy visible in a virtual file system. These are the essential
47hooks, beyond what is already present, required to manage dynamic 47hooks, beyond what is already present, required to manage dynamic
48job placement on large systems. 48job placement on large systems.
@@ -53,11 +53,11 @@ Documentation/cgroups/cgroups.txt.
53Requests by a task, using the sched_setaffinity(2) system call to 53Requests by a task, using the sched_setaffinity(2) system call to
54include CPUs in its CPU affinity mask, and using the mbind(2) and 54include CPUs in its CPU affinity mask, and using the mbind(2) and
55set_mempolicy(2) system calls to include Memory Nodes in its memory 55set_mempolicy(2) system calls to include Memory Nodes in its memory
56policy, are both filtered through that tasks cpuset, filtering out any 56policy, are both filtered through that task's cpuset, filtering out any
57CPUs or Memory Nodes not in that cpuset. The scheduler will not 57CPUs or Memory Nodes not in that cpuset. The scheduler will not
58schedule a task on a CPU that is not allowed in its cpus_allowed 58schedule a task on a CPU that is not allowed in its cpus_allowed
59vector, and the kernel page allocator will not allocate a page on a 59vector, and the kernel page allocator will not allocate a page on a
60node that is not allowed in the requesting tasks mems_allowed vector. 60node that is not allowed in the requesting task's mems_allowed vector.
61 61
62User level code may create and destroy cpusets by name in the cgroup 62User level code may create and destroy cpusets by name in the cgroup
63virtual file system, manage the attributes and permissions of these 63virtual file system, manage the attributes and permissions of these
@@ -121,9 +121,9 @@ Cpusets extends these two mechanisms as follows:
121 - Each task in the system is attached to a cpuset, via a pointer 121 - Each task in the system is attached to a cpuset, via a pointer
122 in the task structure to a reference counted cgroup structure. 122 in the task structure to a reference counted cgroup structure.
123 - Calls to sched_setaffinity are filtered to just those CPUs 123 - Calls to sched_setaffinity are filtered to just those CPUs
124 allowed in that tasks cpuset. 124 allowed in that task's cpuset.
125 - Calls to mbind and set_mempolicy are filtered to just 125 - Calls to mbind and set_mempolicy are filtered to just
126 those Memory Nodes allowed in that tasks cpuset. 126 those Memory Nodes allowed in that task's cpuset.
127 - The root cpuset contains all the systems CPUs and Memory 127 - The root cpuset contains all the systems CPUs and Memory
128 Nodes. 128 Nodes.
129 - For any cpuset, one can define child cpusets containing a subset 129 - For any cpuset, one can define child cpusets containing a subset
@@ -141,11 +141,11 @@ into the rest of the kernel, none in performance critical paths:
141 - in init/main.c, to initialize the root cpuset at system boot. 141 - in init/main.c, to initialize the root cpuset at system boot.
142 - in fork and exit, to attach and detach a task from its cpuset. 142 - in fork and exit, to attach and detach a task from its cpuset.
143 - in sched_setaffinity, to mask the requested CPUs by what's 143 - in sched_setaffinity, to mask the requested CPUs by what's
144 allowed in that tasks cpuset. 144 allowed in that task's cpuset.
145 - in sched.c migrate_live_tasks(), to keep migrating tasks within 145 - in sched.c migrate_live_tasks(), to keep migrating tasks within
146 the CPUs allowed by their cpuset, if possible. 146 the CPUs allowed by their cpuset, if possible.
147 - in the mbind and set_mempolicy system calls, to mask the requested 147 - in the mbind and set_mempolicy system calls, to mask the requested
148 Memory Nodes by what's allowed in that tasks cpuset. 148 Memory Nodes by what's allowed in that task's cpuset.
149 - in page_alloc.c, to restrict memory to allowed nodes. 149 - in page_alloc.c, to restrict memory to allowed nodes.
150 - in vmscan.c, to restrict page recovery to the current cpuset. 150 - in vmscan.c, to restrict page recovery to the current cpuset.
151 151
@@ -155,7 +155,7 @@ new system calls are added for cpusets - all support for querying and
155modifying cpusets is via this cpuset file system. 155modifying cpusets is via this cpuset file system.
156 156
157The /proc/<pid>/status file for each task has four added lines, 157The /proc/<pid>/status file for each task has four added lines,
158displaying the tasks cpus_allowed (on which CPUs it may be scheduled) 158displaying the task's cpus_allowed (on which CPUs it may be scheduled)
159and mems_allowed (on which Memory Nodes it may obtain memory), 159and mems_allowed (on which Memory Nodes it may obtain memory),
160in the two formats seen in the following example: 160in the two formats seen in the following example:
161 161
@@ -168,20 +168,20 @@ Each cpuset is represented by a directory in the cgroup file system
168containing (on top of the standard cgroup files) the following 168containing (on top of the standard cgroup files) the following
169files describing that cpuset: 169files describing that cpuset:
170 170
171 - cpus: list of CPUs in that cpuset 171 - cpuset.cpus: list of CPUs in that cpuset
172 - mems: list of Memory Nodes in that cpuset 172 - cpuset.mems: list of Memory Nodes in that cpuset
173 - memory_migrate flag: if set, move pages to cpusets nodes 173 - cpuset.memory_migrate flag: if set, move pages to cpusets nodes
174 - cpu_exclusive flag: is cpu placement exclusive? 174 - cpuset.cpu_exclusive flag: is cpu placement exclusive?
175 - mem_exclusive flag: is memory placement exclusive? 175 - cpuset.mem_exclusive flag: is memory placement exclusive?
176 - mem_hardwall flag: is memory allocation hardwalled 176 - cpuset.mem_hardwall flag: is memory allocation hardwalled
177 - memory_pressure: measure of how much paging pressure in cpuset 177 - cpuset.memory_pressure: measure of how much paging pressure in cpuset
178 - memory_spread_page flag: if set, spread page cache evenly on allowed nodes 178 - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
179 - memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes 179 - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
180 - sched_load_balance flag: if set, load balance within CPUs on that cpuset 180 - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
181 - sched_relax_domain_level: the searching range when migrating tasks 181 - cpuset.sched_relax_domain_level: the searching range when migrating tasks
182 182
183In addition, the root cpuset only has the following file: 183In addition, the root cpuset only has the following file:
184 - memory_pressure_enabled flag: compute memory_pressure? 184 - cpuset.memory_pressure_enabled flag: compute memory_pressure?
185 185
186New cpusets are created using the mkdir system call or shell 186New cpusets are created using the mkdir system call or shell
187command. The properties of a cpuset, such as its flags, allowed 187command. The properties of a cpuset, such as its flags, allowed
@@ -229,7 +229,7 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than
229a direct ancestor or descendant, may share any of the same CPUs or 229a direct ancestor or descendant, may share any of the same CPUs or
230Memory Nodes. 230Memory Nodes.
231 231
232A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", 232A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
233i.e. it restricts kernel allocations for page, buffer and other data 233i.e. it restricts kernel allocations for page, buffer and other data
234commonly shared by the kernel across multiple users. All cpusets, 234commonly shared by the kernel across multiple users. All cpusets,
235whether hardwalled or not, restrict allocations of memory for user 235whether hardwalled or not, restrict allocations of memory for user
@@ -304,15 +304,15 @@ times 1000.
304--------------------------- 304---------------------------
305There are two boolean flag files per cpuset that control where the 305There are two boolean flag files per cpuset that control where the
306kernel allocates pages for the file system buffers and related in 306kernel allocates pages for the file system buffers and related in
307kernel data structures. They are called 'memory_spread_page' and 307kernel data structures. They are called 'cpuset.memory_spread_page' and
308'memory_spread_slab'. 308'cpuset.memory_spread_slab'.
309 309
310If the per-cpuset boolean flag file 'memory_spread_page' is set, then 310If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
311the kernel will spread the file system buffers (page cache) evenly 311the kernel will spread the file system buffers (page cache) evenly
312over all the nodes that the faulting task is allowed to use, instead 312over all the nodes that the faulting task is allowed to use, instead
313of preferring to put those pages on the node where the task is running. 313of preferring to put those pages on the node where the task is running.
314 314
315If the per-cpuset boolean flag file 'memory_spread_slab' is set, 315If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
316then the kernel will spread some file system related slab caches, 316then the kernel will spread some file system related slab caches,
317such as for inodes and dentries evenly over all the nodes that the 317such as for inodes and dentries evenly over all the nodes that the
318faulting task is allowed to use, instead of preferring to put those 318faulting task is allowed to use, instead of preferring to put those
@@ -323,41 +323,41 @@ stack segment pages of a task.
323 323
324By default, both kinds of memory spreading are off, and memory 324By default, both kinds of memory spreading are off, and memory
325pages are allocated on the node local to where the task is running, 325pages are allocated on the node local to where the task is running,
326except perhaps as modified by the tasks NUMA mempolicy or cpuset 326except perhaps as modified by the task's NUMA mempolicy or cpuset
327configuration, so long as sufficient free memory pages are available. 327configuration, so long as sufficient free memory pages are available.
328 328
329When new cpusets are created, they inherit the memory spread settings 329When new cpusets are created, they inherit the memory spread settings
330of their parent. 330of their parent.
331 331
332Setting memory spreading causes allocations for the affected page 332Setting memory spreading causes allocations for the affected page
333or slab caches to ignore the tasks NUMA mempolicy and be spread 333or slab caches to ignore the task's NUMA mempolicy and be spread
334instead. Tasks using mbind() or set_mempolicy() calls to set NUMA 334instead. Tasks using mbind() or set_mempolicy() calls to set NUMA
335mempolicies will not notice any change in these calls as a result of 335mempolicies will not notice any change in these calls as a result of
336their containing tasks memory spread settings. If memory spreading 336their containing task's memory spread settings. If memory spreading
337is turned off, then the currently specified NUMA mempolicy once again 337is turned off, then the currently specified NUMA mempolicy once again
338applies to memory page allocations. 338applies to memory page allocations.
339 339
340Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag 340Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
341files. By default they contain "0", meaning that the feature is off 341files. By default they contain "0", meaning that the feature is off
342for that cpuset. If a "1" is written to that file, then that turns 342for that cpuset. If a "1" is written to that file, then that turns
343the named feature on. 343the named feature on.
344 344
345The implementation is simple. 345The implementation is simple.
346 346
347Setting the flag 'memory_spread_page' turns on a per-process flag 347Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
348PF_SPREAD_PAGE for each task that is in that cpuset or subsequently 348PF_SPREAD_PAGE for each task that is in that cpuset or subsequently
349joins that cpuset. The page allocation calls for the page cache 349joins that cpuset. The page allocation calls for the page cache
350is modified to perform an inline check for this PF_SPREAD_PAGE task 350is modified to perform an inline check for this PF_SPREAD_PAGE task
351flag, and if set, a call to a new routine cpuset_mem_spread_node() 351flag, and if set, a call to a new routine cpuset_mem_spread_node()
352returns the node to prefer for the allocation. 352returns the node to prefer for the allocation.
353 353
354Similarly, setting 'memory_spread_slab' turns on the flag 354Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
355PF_SPREAD_SLAB, and appropriately marked slab caches will allocate 355PF_SPREAD_SLAB, and appropriately marked slab caches will allocate
356pages from the node returned by cpuset_mem_spread_node(). 356pages from the node returned by cpuset_mem_spread_node().
357 357
358The cpuset_mem_spread_node() routine is also simple. It uses the 358The cpuset_mem_spread_node() routine is also simple. It uses the
359value of a per-task rotor cpuset_mem_spread_rotor to select the next 359value of a per-task rotor cpuset_mem_spread_rotor to select the next
360node in the current tasks mems_allowed to prefer for the allocation. 360node in the current task's mems_allowed to prefer for the allocation.
361 361
362This memory placement policy is also known (in other contexts) as 362This memory placement policy is also known (in other contexts) as
363round-robin or interleave. 363round-robin or interleave.
@@ -404,24 +404,24 @@ the following two situations:
404 system overhead on those CPUs, including avoiding task load 404 system overhead on those CPUs, including avoiding task load
405 balancing if that is not needed. 405 balancing if that is not needed.
406 406
407When the per-cpuset flag "sched_load_balance" is enabled (the default 407When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
408setting), it requests that all the CPUs in that cpusets allowed 'cpus' 408setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus'
409be contained in a single sched domain, ensuring that load balancing 409be contained in a single sched domain, ensuring that load balancing
410can move a task (not otherwised pinned, as by sched_setaffinity) 410can move a task (not otherwised pinned, as by sched_setaffinity)
411from any CPU in that cpuset to any other. 411from any CPU in that cpuset to any other.
412 412
413When the per-cpuset flag "sched_load_balance" is disabled, then the 413When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
414scheduler will avoid load balancing across the CPUs in that cpuset, 414scheduler will avoid load balancing across the CPUs in that cpuset,
415--except-- in so far as is necessary because some overlapping cpuset 415--except-- in so far as is necessary because some overlapping cpuset
416has "sched_load_balance" enabled. 416has "sched_load_balance" enabled.
417 417
418So, for example, if the top cpuset has the flag "sched_load_balance" 418So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
419enabled, then the scheduler will have one sched domain covering all 419enabled, then the scheduler will have one sched domain covering all
420CPUs, and the setting of the "sched_load_balance" flag in any other 420CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
421cpusets won't matter, as we're already fully load balancing. 421cpusets won't matter, as we're already fully load balancing.
422 422
423Therefore in the above two situations, the top cpuset flag 423Therefore in the above two situations, the top cpuset flag
424"sched_load_balance" should be disabled, and only some of the smaller, 424"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
425child cpusets have this flag enabled. 425child cpusets have this flag enabled.
426 426
427When doing this, you don't usually want to leave any unpinned tasks in 427When doing this, you don't usually want to leave any unpinned tasks in
@@ -433,7 +433,7 @@ scheduler might not consider the possibility of load balancing that
433task to that underused CPU. 433task to that underused CPU.
434 434
435Of course, tasks pinned to a particular CPU can be left in a cpuset 435Of course, tasks pinned to a particular CPU can be left in a cpuset
436that disables "sched_load_balance" as those tasks aren't going anywhere 436that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
437else anyway. 437else anyway.
438 438
439There is an impedance mismatch here, between cpusets and sched domains. 439There is an impedance mismatch here, between cpusets and sched domains.
@@ -443,19 +443,19 @@ overlap and each CPU is in at most one sched domain.
443It is necessary for sched domains to be flat because load balancing 443It is necessary for sched domains to be flat because load balancing
444across partially overlapping sets of CPUs would risk unstable dynamics 444across partially overlapping sets of CPUs would risk unstable dynamics
445that would be beyond our understanding. So if each of two partially 445that would be beyond our understanding. So if each of two partially
446overlapping cpusets enables the flag 'sched_load_balance', then we 446overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
447form a single sched domain that is a superset of both. We won't move 447form a single sched domain that is a superset of both. We won't move
448a task to a CPU outside it cpuset, but the scheduler load balancing 448a task to a CPU outside it cpuset, but the scheduler load balancing
449code might waste some compute cycles considering that possibility. 449code might waste some compute cycles considering that possibility.
450 450
451This mismatch is why there is not a simple one-to-one relation 451This mismatch is why there is not a simple one-to-one relation
452between which cpusets have the flag "sched_load_balance" enabled, 452between which cpusets have the flag "cpuset.sched_load_balance" enabled,
453and the sched domain configuration. If a cpuset enables the flag, it 453and the sched domain configuration. If a cpuset enables the flag, it
454will get balancing across all its CPUs, but if it disables the flag, 454will get balancing across all its CPUs, but if it disables the flag,
455it will only be assured of no load balancing if no other overlapping 455it will only be assured of no load balancing if no other overlapping
456cpuset enables the flag. 456cpuset enables the flag.
457 457
458If two cpusets have partially overlapping 'cpus' allowed, and only 458If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
459one of them has this flag enabled, then the other may find its 459one of them has this flag enabled, then the other may find its
460tasks only partially load balanced, just on the overlapping CPUs. 460tasks only partially load balanced, just on the overlapping CPUs.
461This is just the general case of the top_cpuset example given a few 461This is just the general case of the top_cpuset example given a few
@@ -468,23 +468,23 @@ load balancing to the other CPUs.
4681.7.1 sched_load_balance implementation details. 4681.7.1 sched_load_balance implementation details.
469------------------------------------------------ 469------------------------------------------------
470 470
471The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary 471The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
472to most cpuset flags.) When enabled for a cpuset, the kernel will 472to most cpuset flags.) When enabled for a cpuset, the kernel will
473ensure that it can load balance across all the CPUs in that cpuset 473ensure that it can load balance across all the CPUs in that cpuset
474(makes sure that all the CPUs in the cpus_allowed of that cpuset are 474(makes sure that all the CPUs in the cpus_allowed of that cpuset are
475in the same sched domain.) 475in the same sched domain.)
476 476
477If two overlapping cpusets both have 'sched_load_balance' enabled, 477If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
478then they will be (must be) both in the same sched domain. 478then they will be (must be) both in the same sched domain.
479 479
480If, as is the default, the top cpuset has 'sched_load_balance' enabled, 480If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
481then by the above that means there is a single sched domain covering 481then by the above that means there is a single sched domain covering
482the whole system, regardless of any other cpuset settings. 482the whole system, regardless of any other cpuset settings.
483 483
484The kernel commits to user space that it will avoid load balancing 484The kernel commits to user space that it will avoid load balancing
485where it can. It will pick as fine a granularity partition of sched 485where it can. It will pick as fine a granularity partition of sched
486domains as it can while still providing load balancing for any set 486domains as it can while still providing load balancing for any set
487of CPUs allowed to a cpuset having 'sched_load_balance' enabled. 487of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
488 488
489The internal kernel cpuset to scheduler interface passes from the 489The internal kernel cpuset to scheduler interface passes from the
490cpuset code to the scheduler code a partition of the load balanced 490cpuset code to the scheduler code a partition of the load balanced
@@ -495,9 +495,9 @@ all the CPUs that must be load balanced.
495The cpuset code builds a new such partition and passes it to the 495The cpuset code builds a new such partition and passes it to the
496scheduler sched domain setup code, to have the sched domains rebuilt 496scheduler sched domain setup code, to have the sched domains rebuilt
497as necessary, whenever: 497as necessary, whenever:
498 - the 'sched_load_balance' flag of a cpuset with non-empty CPUs changes, 498 - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
499 - or CPUs come or go from a cpuset with this flag enabled, 499 - or CPUs come or go from a cpuset with this flag enabled,
500 - or 'sched_relax_domain_level' value of a cpuset with non-empty CPUs 500 - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
501 and with this flag enabled changes, 501 and with this flag enabled changes,
502 - or a cpuset with non-empty CPUs and with this flag enabled is removed, 502 - or a cpuset with non-empty CPUs and with this flag enabled is removed,
503 - or a cpu is offlined/onlined. 503 - or a cpu is offlined/onlined.
@@ -542,7 +542,7 @@ As the result, task B on CPU X need to wait task A or wait load balance
542on the next tick. For some applications in special situation, waiting 542on the next tick. For some applications in special situation, waiting
5431 tick may be too long. 5431 tick may be too long.
544 544
545The 'sched_relax_domain_level' file allows you to request changing 545The 'cpuset.sched_relax_domain_level' file allows you to request changing
546this searching range as you like. This file takes int value which 546this searching range as you like. This file takes int value which
547indicates size of searching range in levels ideally as follows, 547indicates size of searching range in levels ideally as follows,
548otherwise initial value -1 that indicates the cpuset has no request. 548otherwise initial value -1 that indicates the cpuset has no request.
@@ -559,8 +559,8 @@ The system default is architecture dependent. The system default
559can be changed using the relax_domain_level= boot parameter. 559can be changed using the relax_domain_level= boot parameter.
560 560
561This file is per-cpuset and affect the sched domain where the cpuset 561This file is per-cpuset and affect the sched domain where the cpuset
562belongs to. Therefore if the flag 'sched_load_balance' of a cpuset 562belongs to. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
563is disabled, then 'sched_relax_domain_level' have no effect since 563is disabled, then 'cpuset.sched_relax_domain_level' have no effect since
564there is no sched domain belonging the cpuset. 564there is no sched domain belonging the cpuset.
565 565
566If multiple cpusets are overlapping and hence they form a single sched 566If multiple cpusets are overlapping and hence they form a single sched
@@ -594,7 +594,7 @@ is attached, is subtle.
594If a cpuset has its Memory Nodes modified, then for each task attached 594If a cpuset has its Memory Nodes modified, then for each task attached
595to that cpuset, the next time that the kernel attempts to allocate 595to that cpuset, the next time that the kernel attempts to allocate
596a page of memory for that task, the kernel will notice the change 596a page of memory for that task, the kernel will notice the change
597in the tasks cpuset, and update its per-task memory placement to 597in the task's cpuset, and update its per-task memory placement to
598remain within the new cpusets memory placement. If the task was using 598remain within the new cpusets memory placement. If the task was using
599mempolicy MPOL_BIND, and the nodes to which it was bound overlap with 599mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
600its new cpuset, then the task will continue to use whatever subset 600its new cpuset, then the task will continue to use whatever subset
@@ -603,13 +603,13 @@ was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
603in the new cpuset, then the task will be essentially treated as if it 603in the new cpuset, then the task will be essentially treated as if it
604was MPOL_BIND bound to the new cpuset (even though its NUMA placement, 604was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
605as queried by get_mempolicy(), doesn't change). If a task is moved 605as queried by get_mempolicy(), doesn't change). If a task is moved
606from one cpuset to another, then the kernel will adjust the tasks 606from one cpuset to another, then the kernel will adjust the task's
607memory placement, as above, the next time that the kernel attempts 607memory placement, as above, the next time that the kernel attempts
608to allocate a page of memory for that task. 608to allocate a page of memory for that task.
609 609
610If a cpuset has its 'cpus' modified, then each task in that cpuset 610If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
611will have its allowed CPU placement changed immediately. Similarly, 611will have its allowed CPU placement changed immediately. Similarly,
612if a tasks pid is written to another cpusets 'tasks' file, then its 612if a task's pid is written to another cpusets 'cpuset.tasks' file, then its
613allowed CPU placement is changed immediately. If such a task had been 613allowed CPU placement is changed immediately. If such a task had been
614bound to some subset of its cpuset using the sched_setaffinity() call, 614bound to some subset of its cpuset using the sched_setaffinity() call,
615the task will be allowed to run on any CPU allowed in its new cpuset, 615the task will be allowed to run on any CPU allowed in its new cpuset,
@@ -622,21 +622,21 @@ and the processor placement is updated immediately.
622Normally, once a page is allocated (given a physical page 622Normally, once a page is allocated (given a physical page
623of main memory) then that page stays on whatever node it 623of main memory) then that page stays on whatever node it
624was allocated, so long as it remains allocated, even if the 624was allocated, so long as it remains allocated, even if the
625cpusets memory placement policy 'mems' subsequently changes. 625cpusets memory placement policy 'cpuset.mems' subsequently changes.
626If the cpuset flag file 'memory_migrate' is set true, then when 626If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
627tasks are attached to that cpuset, any pages that task had 627tasks are attached to that cpuset, any pages that task had
628allocated to it on nodes in its previous cpuset are migrated 628allocated to it on nodes in its previous cpuset are migrated
629to the tasks new cpuset. The relative placement of the page within 629to the task's new cpuset. The relative placement of the page within
630the cpuset is preserved during these migration operations if possible. 630the cpuset is preserved during these migration operations if possible.
631For example if the page was on the second valid node of the prior cpuset 631For example if the page was on the second valid node of the prior cpuset
632then the page will be placed on the second valid node of the new cpuset. 632then the page will be placed on the second valid node of the new cpuset.
633 633
634Also if 'memory_migrate' is set true, then if that cpusets 634Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
635'mems' file is modified, pages allocated to tasks in that 635'cpuset.mems' file is modified, pages allocated to tasks in that
636cpuset, that were on nodes in the previous setting of 'mems', 636cpuset, that were on nodes in the previous setting of 'cpuset.mems',
637will be moved to nodes in the new setting of 'mems.' 637will be moved to nodes in the new setting of 'mems.'
638Pages that were not in the tasks prior cpuset, or in the cpusets 638Pages that were not in the task's prior cpuset, or in the cpuset's
639prior 'mems' setting, will not be moved. 639prior 'cpuset.mems' setting, will not be moved.
640 640
641There is an exception to the above. If hotplug functionality is used 641There is an exception to the above. If hotplug functionality is used
642to remove all the CPUs that are currently assigned to a cpuset, 642to remove all the CPUs that are currently assigned to a cpuset,
@@ -655,7 +655,7 @@ There is a second exception to the above. GFP_ATOMIC requests are
655kernel internal allocations that must be satisfied, immediately. 655kernel internal allocations that must be satisfied, immediately.
656The kernel may drop some request, in rare cases even panic, if a 656The kernel may drop some request, in rare cases even panic, if a
657GFP_ATOMIC alloc fails. If the request cannot be satisfied within 657GFP_ATOMIC alloc fails. If the request cannot be satisfied within
658the current tasks cpuset, then we relax the cpuset, and look for 658the current task's cpuset, then we relax the cpuset, and look for
659memory anywhere we can find it. It's better to violate the cpuset 659memory anywhere we can find it. It's better to violate the cpuset
660than stress the kernel. 660than stress the kernel.
661 661
@@ -678,8 +678,8 @@ and then start a subshell 'sh' in that cpuset:
678 cd /dev/cpuset 678 cd /dev/cpuset
679 mkdir Charlie 679 mkdir Charlie
680 cd Charlie 680 cd Charlie
681 /bin/echo 2-3 > cpus 681 /bin/echo 2-3 > cpuset.cpus
682 /bin/echo 1 > mems 682 /bin/echo 1 > cpuset.mems
683 /bin/echo $$ > tasks 683 /bin/echo $$ > tasks
684 sh 684 sh
685 # The subshell 'sh' is now running in cpuset Charlie 685 # The subshell 'sh' is now running in cpuset Charlie
@@ -725,10 +725,13 @@ Now you want to do something with this cpuset.
725 725
726In this directory you can find several files: 726In this directory you can find several files:
727# ls 727# ls
728cpu_exclusive memory_migrate mems tasks 728cpuset.cpu_exclusive cpuset.memory_spread_slab
729cpus memory_pressure notify_on_release 729cpuset.cpus cpuset.mems
730mem_exclusive memory_spread_page sched_load_balance 730cpuset.mem_exclusive cpuset.sched_load_balance
731mem_hardwall memory_spread_slab sched_relax_domain_level 731cpuset.mem_hardwall cpuset.sched_relax_domain_level
732cpuset.memory_migrate notify_on_release
733cpuset.memory_pressure tasks
734cpuset.memory_spread_page
732 735
733Reading them will give you information about the state of this cpuset: 736Reading them will give you information about the state of this cpuset:
734the CPUs and Memory Nodes it can use, the processes that are using 737the CPUs and Memory Nodes it can use, the processes that are using
@@ -736,13 +739,13 @@ it, its properties. By writing to these files you can manipulate
736the cpuset. 739the cpuset.
737 740
738Set some flags: 741Set some flags:
739# /bin/echo 1 > cpu_exclusive 742# /bin/echo 1 > cpuset.cpu_exclusive
740 743
741Add some cpus: 744Add some cpus:
742# /bin/echo 0-7 > cpus 745# /bin/echo 0-7 > cpuset.cpus
743 746
744Add some mems: 747Add some mems:
745# /bin/echo 0-7 > mems 748# /bin/echo 0-7 > cpuset.mems
746 749
747Now attach your shell to this cpuset: 750Now attach your shell to this cpuset:
748# /bin/echo $$ > tasks 751# /bin/echo $$ > tasks
@@ -774,28 +777,28 @@ echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent
774This is the syntax to use when writing in the cpus or mems files 777This is the syntax to use when writing in the cpus or mems files
775in cpuset directories: 778in cpuset directories:
776 779
777# /bin/echo 1-4 > cpus -> set cpus list to cpus 1,2,3,4 780# /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4
778# /bin/echo 1,2,3,4 > cpus -> set cpus list to cpus 1,2,3,4 781# /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4
779 782
780To add a CPU to a cpuset, write the new list of CPUs including the 783To add a CPU to a cpuset, write the new list of CPUs including the
781CPU to be added. To add 6 to the above cpuset: 784CPU to be added. To add 6 to the above cpuset:
782 785
783# /bin/echo 1-4,6 > cpus -> set cpus list to cpus 1,2,3,4,6 786# /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6
784 787
785Similarly to remove a CPU from a cpuset, write the new list of CPUs 788Similarly to remove a CPU from a cpuset, write the new list of CPUs
786without the CPU to be removed. 789without the CPU to be removed.
787 790
788To remove all the CPUs: 791To remove all the CPUs:
789 792
790# /bin/echo "" > cpus -> clear cpus list 793# /bin/echo "" > cpuset.cpus -> clear cpus list
791 794
7922.3 Setting flags 7952.3 Setting flags
793----------------- 796-----------------
794 797
795The syntax is very simple: 798The syntax is very simple:
796 799
797# /bin/echo 1 > cpu_exclusive -> set flag 'cpu_exclusive' 800# /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive'
798# /bin/echo 0 > cpu_exclusive -> unset flag 'cpu_exclusive' 801# /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive'
799 802
8002.4 Attaching processes 8032.4 Attaching processes
801----------------------- 804-----------------------
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt
index 72db89ed0609..b7eececfb195 100644
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -1,6 +1,6 @@
1Memory Resource Controller(Memcg) Implementation Memo. 1Memory Resource Controller(Memcg) Implementation Memo.
2Last Updated: 2009/1/20 2Last Updated: 2010/2
3Base Kernel Version: based on 2.6.29-rc2. 3Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
4 4
5Because VM is getting complex (one of reasons is memcg...), memcg's behavior 5Because VM is getting complex (one of reasons is memcg...), memcg's behavior
6is complex. This is a document for memcg's internal behavior. 6is complex. This is a document for memcg's internal behavior.
@@ -244,7 +244,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
244 we have to check if OLDPAGE/NEWPAGE is a valid page after commit(). 244 we have to check if OLDPAGE/NEWPAGE is a valid page after commit().
245 245
2468. LRU 2468. LRU
247 Each memcg has its own private LRU. Now, it's handling is under global 247 Each memcg has its own private LRU. Now, its handling is under global
248 VM's control (means that it's handled under global zone->lru_lock). 248 VM's control (means that it's handled under global zone->lru_lock).
249 Almost all routines around memcg's LRU is called by global LRU's 249 Almost all routines around memcg's LRU is called by global LRU's
250 list management functions under zone->lru_lock(). 250 list management functions under zone->lru_lock().
@@ -337,7 +337,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
337 race and lock dependency with other cgroup subsystems. 337 race and lock dependency with other cgroup subsystems.
338 338
339 example) 339 example)
340 # mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices 340 # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
341 341
342 and do task move, mkdir, rmdir etc...under this. 342 and do task move, mkdir, rmdir etc...under this.
343 343
@@ -348,7 +348,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
348 348
349 For example, test like following is good. 349 For example, test like following is good.
350 (Shell-A) 350 (Shell-A)
351 # mount -t cgroup none /cgroup -t memory 351 # mount -t cgroup none /cgroup -o memory
352 # mkdir /cgroup/test 352 # mkdir /cgroup/test
353 # echo 40M > /cgroup/test/memory.limit_in_bytes 353 # echo 40M > /cgroup/test/memory.limit_in_bytes
354 # echo 0 > /cgroup/test/tasks 354 # echo 0 > /cgroup/test/tasks
@@ -378,3 +378,42 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
378 #echo 50M > memory.limit_in_bytes 378 #echo 50M > memory.limit_in_bytes
379 #echo 50M > memory.memsw.limit_in_bytes 379 #echo 50M > memory.memsw.limit_in_bytes
380 run 51M of malloc 380 run 51M of malloc
381
382 9.9 Move charges at task migration
383 Charges associated with a task can be moved along with task migration.
384
385 (Shell-A)
386 #mkdir /cgroup/A
387 #echo $$ >/cgroup/A/tasks
388 run some programs which uses some amount of memory in /cgroup/A.
389
390 (Shell-B)
391 #mkdir /cgroup/B
392 #echo 1 >/cgroup/B/memory.move_charge_at_immigrate
393 #echo "pid of the program running in group A" >/cgroup/B/tasks
394
395 You can see charges have been moved by reading *.usage_in_bytes or
396 memory.stat of both A and B.
397 See 8.2 of Documentation/cgroups/memory.txt to see what value should be
398 written to move_charge_at_immigrate.
399
400 9.10 Memory thresholds
401 Memory controler implements memory thresholds using cgroups notification
402 API. You can use Documentation/cgroups/cgroup_event_listener.c to test
403 it.
404
405 (Shell-A) Create cgroup and run event listener
406 # mkdir /cgroup/A
407 # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
408
409 (Shell-B) Add task to cgroup and try to allocate and free memory
410 # echo $$ >/cgroup/A/tasks
411 # a="$(dd if=/dev/zero bs=1M count=10)"
412 # a=
413
414 You will see message from cgroup_event_listener every time you cross
415 the thresholds.
416
417 Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
418
419 It's good idea to test root cgroup as well.
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index b871f2552b45..7781857dc940 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -1,18 +1,15 @@
1Memory Resource Controller 1Memory Resource Controller
2 2
3NOTE: The Memory Resource Controller has been generically been referred 3NOTE: The Memory Resource Controller has been generically been referred
4to as the memory controller in this document. Do not confuse memory controller 4 to as the memory controller in this document. Do not confuse memory
5used here with the memory controller that is used in hardware. 5 controller used here with the memory controller that is used in hardware.
6 6
7Salient features 7(For editors)
8 8In this document:
9a. Enable control of Anonymous, Page Cache (mapped and unmapped) and 9 When we mention a cgroup (cgroupfs's directory) with memory controller,
10 Swap Cache memory pages. 10 we call it "memory cgroup". When you see git-log and source code, you'll
11b. The infrastructure allows easy addition of other types of memory to control 11 see patch's title and function names tend to use "memcg".
12c. Provides *zero overhead* for non memory controller users 12 In this document, we avoid using it.
13d. Provides a double LRU: global memory pressure causes reclaim from the
14 global LRU; a cgroup on hitting a limit, reclaims from the per
15 cgroup LRU
16 13
17Benefits and Purpose of the memory controller 14Benefits and Purpose of the memory controller
18 15
@@ -33,6 +30,45 @@ d. A CD/DVD burner could control the amount of memory used by the
33e. There are several other use cases, find one or use the controller just 30e. There are several other use cases, find one or use the controller just
34 for fun (to learn and hack on the VM subsystem). 31 for fun (to learn and hack on the VM subsystem).
35 32
33Current Status: linux-2.6.34-mmotm(development version of 2010/April)
34
35Features:
36 - accounting anonymous pages, file caches, swap caches usage and limiting them.
37 - private LRU and reclaim routine. (system's global LRU and private LRU
38 work independently from each other)
39 - optionally, memory+swap usage can be accounted and limited.
40 - hierarchical accounting
41 - soft limit
42 - moving(recharging) account at moving a task is selectable.
43 - usage threshold notifier
44 - oom-killer disable knob and oom-notifier
45 - Root cgroup has no limit controls.
46
47 Kernel memory and Hugepages are not under control yet. We just manage
48 pages on LRU. To add more controls, we have to take care of performance.
49
50Brief summary of control files.
51
52 tasks # attach a task(thread) and show list of threads
53 cgroup.procs # show list of processes
54 cgroup.event_control # an interface for event_fd()
55 memory.usage_in_bytes # show current memory(RSS+Cache) usage.
56 memory.memsw.usage_in_bytes # show current memory+Swap usage
57 memory.limit_in_bytes # set/show limit of memory usage
58 memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage
59 memory.failcnt # show the number of memory usage hits limits
60 memory.memsw.failcnt # show the number of memory+Swap hits limits
61 memory.max_usage_in_bytes # show max memory usage recorded
62 memory.memsw.usage_in_bytes # show max memory+Swap usage recorded
63 memory.soft_limit_in_bytes # set/show soft limit of memory usage
64 memory.stat # show various statistics
65 memory.use_hierarchy # set/show hierarchical account enabled
66 memory.force_empty # trigger forced move charge to parent
67 memory.swappiness # set/show swappiness parameter of vmscan
68 (See sysctl's vm.swappiness)
69 memory.move_charge_at_immigrate # set/show controls of moving charges
70 memory.oom_control # set/show oom controls.
71
361. History 721. History
37 73
38The memory controller has a long history. A request for comments for the memory 74The memory controller has a long history. A request for comments for the memory
@@ -106,14 +142,14 @@ the necessary data structures and check if the cgroup that is being charged
106is over its limit. If it is then reclaim is invoked on the cgroup. 142is over its limit. If it is then reclaim is invoked on the cgroup.
107More details can be found in the reclaim section of this document. 143More details can be found in the reclaim section of this document.
108If everything goes well, a page meta-data-structure called page_cgroup is 144If everything goes well, a page meta-data-structure called page_cgroup is
109allocated and associated with the page. This routine also adds the page to 145updated. page_cgroup has its own LRU on cgroup.
110the per cgroup LRU. 146(*) page_cgroup structure is allocated at boot/memory-hotplug time.
111 147
1122.2.1 Accounting details 1482.2.1 Accounting details
113 149
114All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. 150All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
115(some pages which never be reclaimable and will not be on global LRU 151Some pages which are never reclaimable and will not be on the global LRU
116 are not accounted. we just accounts pages under usual vm management.) 152are not accounted. We just account pages under usual VM management.
117 153
118RSS pages are accounted at page_fault unless they've already been accounted 154RSS pages are accounted at page_fault unless they've already been accounted
119for earlier. A file page will be accounted for as Page Cache when it's 155for earlier. A file page will be accounted for as Page Cache when it's
@@ -121,12 +157,19 @@ inserted into inode (radix-tree). While it's mapped into the page tables of
121processes, duplicate accounting is carefully avoided. 157processes, duplicate accounting is carefully avoided.
122 158
123A RSS page is unaccounted when it's fully unmapped. A PageCache page is 159A RSS page is unaccounted when it's fully unmapped. A PageCache page is
124unaccounted when it's removed from radix-tree. 160unaccounted when it's removed from radix-tree. Even if RSS pages are fully
161unmapped (by kswapd), they may exist as SwapCache in the system until they
162are really freed. Such SwapCaches also also accounted.
163A swapped-in page is not accounted until it's mapped.
164
165Note: The kernel does swapin-readahead and read multiple swaps at once.
166This means swapped-in pages may contain pages for other tasks than a task
167causing page fault. So, we avoid accounting at swap-in I/O.
125 168
126At page migration, accounting information is kept. 169At page migration, accounting information is kept.
127 170
128Note: we just account pages-on-lru because our purpose is to control amount 171Note: we just account pages-on-LRU because our purpose is to control amount
129of used pages. not-on-lru pages are tend to be out-of-control from vm view. 172of used pages; not-on-LRU pages tend to be out-of-control from VM view.
130 173
1312.3 Shared Page Accounting 1742.3 Shared Page Accounting
132 175
@@ -143,6 +186,7 @@ caller of swapoff rather than the users of shmem.
143 186
144 187
1452.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) 1882.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP)
189
146Swap Extension allows you to record charge for swap. A swapped-in page is 190Swap Extension allows you to record charge for swap. A swapped-in page is
147charged back to original page allocator if possible. 191charged back to original page allocator if possible.
148 192
@@ -150,13 +194,20 @@ When swap is accounted, following files are added.
150 - memory.memsw.usage_in_bytes. 194 - memory.memsw.usage_in_bytes.
151 - memory.memsw.limit_in_bytes. 195 - memory.memsw.limit_in_bytes.
152 196
153usage of mem+swap is limited by memsw.limit_in_bytes. 197memsw means memory+swap. Usage of memory+swap is limited by
198memsw.limit_in_bytes.
154 199
155* why 'mem+swap' rather than swap. 200Example: Assume a system with 4G of swap. A task which allocates 6G of memory
201(by mistake) under 2G memory limitation will use all swap.
202In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
203By using memsw limit, you can avoid system OOM which can be caused by swap
204shortage.
205
206* why 'memory+swap' rather than swap.
156The global LRU(kswapd) can swap out arbitrary pages. Swap-out means 207The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
157to move account from memory to swap...there is no change in usage of 208to move account from memory to swap...there is no change in usage of
158mem+swap. In other words, when we want to limit the usage of swap without 209memory+swap. In other words, when we want to limit the usage of swap without
159affecting global LRU, mem+swap limit is better than just limiting swap from 210affecting global LRU, memory+swap limit is better than just limiting swap from
160OS point of view. 211OS point of view.
161 212
162* What happens when a cgroup hits memory.memsw.limit_in_bytes 213* What happens when a cgroup hits memory.memsw.limit_in_bytes
@@ -168,12 +219,12 @@ it by cgroup.
168 219
1692.5 Reclaim 2202.5 Reclaim
170 221
171Each cgroup maintains a per cgroup LRU that consists of an active 222Each cgroup maintains a per cgroup LRU which has the same structure as
172and inactive list. When a cgroup goes over its limit, we first try 223global VM. When a cgroup goes over its limit, we first try
173to reclaim memory from the cgroup so as to make space for the new 224to reclaim memory from the cgroup so as to make space for the new
174pages that the cgroup has touched. If the reclaim is unsuccessful, 225pages that the cgroup has touched. If the reclaim is unsuccessful,
175an OOM routine is invoked to select and kill the bulkiest task in the 226an OOM routine is invoked to select and kill the bulkiest task in the
176cgroup. 227cgroup. (See 10. OOM Control below.)
177 228
178The reclaim algorithm has not been modified for cgroups, except that 229The reclaim algorithm has not been modified for cgroups, except that
179pages that are selected for reclaiming come from the per cgroup LRU 230pages that are selected for reclaiming come from the per cgroup LRU
@@ -182,13 +233,24 @@ list.
182NOTE: Reclaim does not work for the root cgroup, since we cannot set any 233NOTE: Reclaim does not work for the root cgroup, since we cannot set any
183limits on the root cgroup. 234limits on the root cgroup.
184 235
1852. Locking 236Note2: When panic_on_oom is set to "2", the whole system will panic.
237
238When oom event notifier is registered, event will be delivered.
239(See oom_control section)
240
2412.6 Locking
186 242
187The memory controller uses the following hierarchy 243 lock_page_cgroup()/unlock_page_cgroup() should not be called under
244 mapping->tree_lock.
188 245
1891. zone->lru_lock is used for selecting pages to be isolated 246 Other lock order is following:
1902. mem->per_zone->lru_lock protects the per cgroup LRU (per zone) 247 PG_locked.
1913. lock_page_cgroup() is used to protect page->page_cgroup 248 mm->page_table_lock
249 zone->lru_lock
250 lock_page_cgroup.
251 In many cases, just lock_page_cgroup() is called.
252 per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
253 zone->lru_lock, it has no lock of its own.
192 254
1933. User Interface 2553. User Interface
194 256
@@ -197,6 +259,7 @@ The memory controller uses the following hierarchy
197a. Enable CONFIG_CGROUPS 259a. Enable CONFIG_CGROUPS
198b. Enable CONFIG_RESOURCE_COUNTERS 260b. Enable CONFIG_RESOURCE_COUNTERS
199c. Enable CONFIG_CGROUP_MEM_RES_CTLR 261c. Enable CONFIG_CGROUP_MEM_RES_CTLR
262d. Enable CONFIG_CGROUP_MEM_RES_CTLR_SWAP (to use swap extension)
200 263
2011. Prepare the cgroups 2641. Prepare the cgroups
202# mkdir -p /cgroups 265# mkdir -p /cgroups
@@ -204,31 +267,28 @@ c. Enable CONFIG_CGROUP_MEM_RES_CTLR
204 267
2052. Make the new group and move bash into it 2682. Make the new group and move bash into it
206# mkdir /cgroups/0 269# mkdir /cgroups/0
207# echo $$ > /cgroups/0/tasks 270# echo $$ > /cgroups/0/tasks
208 271
209Since now we're in the 0 cgroup, 272Since now we're in the 0 cgroup, we can alter the memory limit:
210We can alter the memory limit:
211# echo 4M > /cgroups/0/memory.limit_in_bytes 273# echo 4M > /cgroups/0/memory.limit_in_bytes
212 274
213NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, 275NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
214mega or gigabytes. 276mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.)
277
215NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). 278NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
216NOTE: We cannot set limits on the root cgroup any more. 279NOTE: We cannot set limits on the root cgroup any more.
217 280
218# cat /cgroups/0/memory.limit_in_bytes 281# cat /cgroups/0/memory.limit_in_bytes
2194194304 2824194304
220 283
221NOTE: The interface has now changed to display the usage in bytes
222instead of pages
223
224We can check the usage: 284We can check the usage:
225# cat /cgroups/0/memory.usage_in_bytes 285# cat /cgroups/0/memory.usage_in_bytes
2261216512 2861216512
227 287
228A successful write to this file does not guarantee a successful set of 288A successful write to this file does not guarantee a successful set of
229this limit to the value written into the file. This can be due to a 289this limit to the value written into the file. This can be due to a
230number of factors, such as rounding up to page boundaries or the total 290number of factors, such as rounding up to page boundaries or the total
231availability of memory on the system. The user is required to re-read 291availability of memory on the system. The user is required to re-read
232this file after a write to guarantee the value committed by the kernel. 292this file after a write to guarantee the value committed by the kernel.
233 293
234# echo 1 > memory.limit_in_bytes 294# echo 1 > memory.limit_in_bytes
@@ -243,15 +303,23 @@ caches, RSS and Active pages/Inactive pages are shown.
243 303
2444. Testing 3044. Testing
245 305
246Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11]. 306For testing features and implementation, see memcg_test.txt.
247Apart from that v6 has been tested with several applications and regular 307
248daily use. The controller has also been tested on the PPC64, x86_64 and 308Performance test is also important. To see pure memory controller's overhead,
249UML platforms. 309testing on tmpfs will give you good numbers of small overheads.
310Example: do kernel make on tmpfs.
311
312Page-fault scalability is also important. At measuring parallel
313page fault test, multi-process test may be better than multi-thread
314test because it has noise of shared objects/status.
315
316But the above two are testing extreme situations.
317Trying usual test under memory controller is always helpful.
250 318
2514.1 Troubleshooting 3194.1 Troubleshooting
252 320
253Sometimes a user might find that the application under a cgroup is 321Sometimes a user might find that the application under a cgroup is
254terminated. There are several causes for this: 322terminated by OOM killer. There are several causes for this:
255 323
2561. The cgroup limit is too low (just too low to do anything useful) 3241. The cgroup limit is too low (just too low to do anything useful)
2572. The user is using anonymous memory and swap is turned off or too low 3252. The user is using anonymous memory and swap is turned off or too low
@@ -259,21 +327,29 @@ terminated. There are several causes for this:
259A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of 327A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
260some of the pages cached in the cgroup (page cache pages). 328some of the pages cached in the cgroup (page cache pages).
261 329
330To know what happens, disable OOM_Kill by 10. OOM Control(see below) and
331seeing what happens will be helpful.
332
2624.2 Task migration 3334.2 Task migration
263 334
264When a task migrates from one cgroup to another, it's charge is not 335When a task migrates from one cgroup to another, its charge is not
265carried forward. The pages allocated from the original cgroup still 336carried forward by default. The pages allocated from the original cgroup still
266remain charged to it, the charge is dropped when the page is freed or 337remain charged to it, the charge is dropped when the page is freed or
267reclaimed. 338reclaimed.
268 339
340You can move charges of a task along with task migration.
341See 8. "Move charges at task migration"
342
2694.3 Removing a cgroup 3434.3 Removing a cgroup
270 344
271A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a 345A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
272cgroup might have some charge associated with it, even though all 346cgroup might have some charge associated with it, even though all
273tasks have migrated away from it. 347tasks have migrated away from it. (because we charge against pages, not
274Such charges are freed(at default) or moved to its parent. When moved, 348against tasks.)
275both of RSS and CACHES are moved to parent. 349
276If both of them are busy, rmdir() returns -EBUSY. See 5.1 Also. 350Such charges are freed or moved to their parent. At moving, both of RSS
351and CACHES are moved to parent.
352rmdir() may return -EBUSY if freeing/moving fails. See 5.1 also.
277 353
278Charges recorded in swap information is not updated at removal of cgroup. 354Charges recorded in swap information is not updated at removal of cgroup.
279Recorded information is discarded and a cgroup which uses swap (swapcache) 355Recorded information is discarded and a cgroup which uses swap (swapcache)
@@ -289,10 +365,10 @@ will be charged as a new owner of it.
289 365
290 # echo 0 > memory.force_empty 366 # echo 0 > memory.force_empty
291 367
292 Almost all pages tracked by this memcg will be unmapped and freed. Some of 368 Almost all pages tracked by this memory cgroup will be unmapped and freed.
293 pages cannot be freed because it's locked or in-use. Such pages are moved 369 Some pages cannot be freed because they are locked or in-use. Such pages are
294 to parent and this cgroup will be empty. But this may return -EBUSY in 370 moved to parent and this cgroup will be empty. This may return -EBUSY if
295 some too busy case. 371 VM is too busy to free/move all pages immediately.
296 372
297 Typical use case of this interface is that calling this before rmdir(). 373 Typical use case of this interface is that calling this before rmdir().
298 Because rmdir() moves all pages to parent, some out-of-use page caches can be 374 Because rmdir() moves all pages to parent, some out-of-use page caches can be
@@ -302,19 +378,41 @@ will be charged as a new owner of it.
302 378
303memory.stat file includes following statistics 379memory.stat file includes following statistics
304 380
381# per-memory cgroup local status
305cache - # of bytes of page cache memory. 382cache - # of bytes of page cache memory.
306rss - # of bytes of anonymous and swap cache memory. 383rss - # of bytes of anonymous and swap cache memory.
384mapped_file - # of bytes of mapped file (includes tmpfs/shmem)
307pgpgin - # of pages paged in (equivalent to # of charging events). 385pgpgin - # of pages paged in (equivalent to # of charging events).
308pgpgout - # of pages paged out (equivalent to # of uncharging events). 386pgpgout - # of pages paged out (equivalent to # of uncharging events).
309active_anon - # of bytes of anonymous and swap cache memory on active 387swap - # of bytes of swap usage
310 lru list.
311inactive_anon - # of bytes of anonymous memory and swap cache memory on 388inactive_anon - # of bytes of anonymous memory and swap cache memory on
312 inactive lru list. 389 LRU list.
313active_file - # of bytes of file-backed memory on active lru list. 390active_anon - # of bytes of anonymous and swap cache memory on active
314inactive_file - # of bytes of file-backed memory on inactive lru list. 391 inactive LRU list.
392inactive_file - # of bytes of file-backed memory on inactive LRU list.
393active_file - # of bytes of file-backed memory on active LRU list.
315unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). 394unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc).
316 395
317The following additional stats are dependent on CONFIG_DEBUG_VM. 396# status considering hierarchy (see memory.use_hierarchy settings)
397
398hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy
399 under which the memory cgroup is
400hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to
401 hierarchy under which memory cgroup is.
402
403total_cache - sum of all children's "cache"
404total_rss - sum of all children's "rss"
405total_mapped_file - sum of all children's "cache"
406total_pgpgin - sum of all children's "pgpgin"
407total_pgpgout - sum of all children's "pgpgout"
408total_swap - sum of all children's "swap"
409total_inactive_anon - sum of all children's "inactive_anon"
410total_active_anon - sum of all children's "active_anon"
411total_inactive_file - sum of all children's "inactive_file"
412total_active_file - sum of all children's "active_file"
413total_unevictable - sum of all children's "unevictable"
414
415# The following additional stats are dependent on CONFIG_DEBUG_VM.
318 416
319inactive_ratio - VM internal parameter. (see mm/page_alloc.c) 417inactive_ratio - VM internal parameter. (see mm/page_alloc.c)
320recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) 418recent_rotated_anon - VM internal parameter. (see mm/vmscan.c)
@@ -323,24 +421,37 @@ recent_scanned_anon - VM internal parameter. (see mm/vmscan.c)
323recent_scanned_file - VM internal parameter. (see mm/vmscan.c) 421recent_scanned_file - VM internal parameter. (see mm/vmscan.c)
324 422
325Memo: 423Memo:
326 recent_rotated means recent frequency of lru rotation. 424 recent_rotated means recent frequency of LRU rotation.
327 recent_scanned means recent # of scans to lru. 425 recent_scanned means recent # of scans to LRU.
328 showing for better debug please see the code for meanings. 426 showing for better debug please see the code for meanings.
329 427
330Note: 428Note:
331 Only anonymous and swap cache memory is listed as part of 'rss' stat. 429 Only anonymous and swap cache memory is listed as part of 'rss' stat.
332 This should not be confused with the true 'resident set size' or the 430 This should not be confused with the true 'resident set size' or the
333 amount of physical memory used by the cgroup. Per-cgroup rss 431 amount of physical memory used by the cgroup.
334 accounting is not done yet. 432 'rss + file_mapped" will give you resident set size of cgroup.
433 (Note: file and shmem may be shared among other cgroups. In that case,
434 file_mapped is accounted only when the memory cgroup is owner of page
435 cache.)
335 436
3365.3 swappiness 4375.3 swappiness
337 Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
338 438
339 Following cgroups' swapiness can't be changed. 439Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
340 - root cgroup (uses /proc/sys/vm/swappiness). 440
341 - a cgroup which uses hierarchy and it has child cgroup. 441Following cgroups' swappiness can't be changed.
342 - a cgroup which uses hierarchy and not the root of hierarchy. 442- root cgroup (uses /proc/sys/vm/swappiness).
443- a cgroup which uses hierarchy and it has other cgroup(s) below it.
444- a cgroup which uses hierarchy and not the root of hierarchy.
445
4465.4 failcnt
343 447
448A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
449This failcnt(== failure count) shows the number of times that a usage counter
450hit its limit. When a memory cgroup hits a limit, failcnt increases and
451memory under it will be reclaimed.
452
453You can reset failcnt by writing 0 to failcnt file.
454# echo 0 > .../memory.failcnt
344 455
3456. Hierarchy support 4566. Hierarchy support
346 457
@@ -359,13 +470,13 @@ hierarchy
359 470
360In the diagram above, with hierarchical accounting enabled, all memory 471In the diagram above, with hierarchical accounting enabled, all memory
361usage of e, is accounted to its ancestors up until the root (i.e, c and root), 472usage of e, is accounted to its ancestors up until the root (i.e, c and root),
362that has memory.use_hierarchy enabled. If one of the ancestors goes over its 473that has memory.use_hierarchy enabled. If one of the ancestors goes over its
363limit, the reclaim algorithm reclaims from the tasks in the ancestor and the 474limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
364children of the ancestor. 475children of the ancestor.
365 476
3666.1 Enabling hierarchical accounting and reclaim 4776.1 Enabling hierarchical accounting and reclaim
367 478
368The memory controller by default disables the hierarchy feature. Support 479A memory cgroup by default disables the hierarchy feature. Support
369can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup 480can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup
370 481
371# echo 1 > memory.use_hierarchy 482# echo 1 > memory.use_hierarchy
@@ -375,9 +486,10 @@ The feature can be disabled by
375# echo 0 > memory.use_hierarchy 486# echo 0 > memory.use_hierarchy
376 487
377NOTE1: Enabling/disabling will fail if the cgroup already has other 488NOTE1: Enabling/disabling will fail if the cgroup already has other
378cgroups created below it. 489 cgroups created below it.
379 490
380NOTE2: This feature can be enabled/disabled per subtree. 491NOTE2: When panic_on_oom is set to "2", the whole system will panic in
492 case of an OOM event in any cgroup.
381 493
3827. Soft limits 4947. Soft limits
383 495
@@ -387,7 +499,7 @@ is to allow control groups to use as much of the memory as needed, provided
387a. There is no memory contention 499a. There is no memory contention
388b. They do not exceed their hard limit 500b. They do not exceed their hard limit
389 501
390When the system detects memory contention or low memory control groups 502When the system detects memory contention or low memory, control groups
391are pushed back to their soft limits. If the soft limit of each control 503are pushed back to their soft limits. If the soft limit of each control
392group is very high, they are pushed back as much as possible to make 504group is very high, they are pushed back as much as possible to make
393sure that one control group does not starve the others of memory. 505sure that one control group does not starve the others of memory.
@@ -401,7 +513,7 @@ it gets invoked from balance_pgdat (kswapd).
4017.1 Interface 5137.1 Interface
402 514
403Soft limits can be setup by using the following commands (in this example we 515Soft limits can be setup by using the following commands (in this example we
404assume a soft limit of 256 megabytes) 516assume a soft limit of 256 MiB)
405 517
406# echo 256M > memory.soft_limit_in_bytes 518# echo 256M > memory.soft_limit_in_bytes
407 519
@@ -414,7 +526,121 @@ NOTE1: Soft limits take effect over a long period of time, since they involve
414NOTE2: It is recommended to set the soft limit always below the hard limit, 526NOTE2: It is recommended to set the soft limit always below the hard limit,
415 otherwise the hard limit will take precedence. 527 otherwise the hard limit will take precedence.
416 528
4178. TODO 5298. Move charges at task migration
530
531Users can move charges associated with a task along with task migration, that
532is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
533This feature is not supported in !CONFIG_MMU environments because of lack of
534page tables.
535
5368.1 Interface
537
538This feature is disabled by default. It can be enabled(and disabled again) by
539writing to memory.move_charge_at_immigrate of the destination cgroup.
540
541If you want to enable it:
542
543# echo (some positive value) > memory.move_charge_at_immigrate
544
545Note: Each bits of move_charge_at_immigrate has its own meaning about what type
546 of charges should be moved. See 8.2 for details.
547Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread
548 group.
549Note: If we cannot find enough space for the task in the destination cgroup, we
550 try to make space by reclaiming memory. Task migration may fail if we
551 cannot make enough space.
552Note: It can take several seconds if you move charges much.
553
554And if you want disable it again:
555
556# echo 0 > memory.move_charge_at_immigrate
557
5588.2 Type of charges which can be move
559
560Each bits of move_charge_at_immigrate has its own meaning about what type of
561charges should be moved. But in any cases, it must be noted that an account of
562a page or a swap can be moved only when it is charged to the task's current(old)
563memory cgroup.
564
565 bit | what type of charges would be moved ?
566 -----+------------------------------------------------------------------------
567 0 | A charge of an anonymous page(or swap of it) used by the target task.
568 | Those pages and swaps must be used only by the target task. You must
569 | enable Swap Extension(see 2.4) to enable move of swap charges.
570 -----+------------------------------------------------------------------------
571 1 | A charge of file pages(normal file, tmpfs file(e.g. ipc shared memory)
572 | and swaps of tmpfs file) mmapped by the target task. Unlike the case of
573 | anonymous pages, file pages(and swaps) in the range mmapped by the task
574 | will be moved even if the task hasn't done page fault, i.e. they might
575 | not be the task's "RSS", but other task's "RSS" that maps the same file.
576 | And mapcount of the page is ignored(the page can be moved even if
577 | page_mapcount(page) > 1). You must enable Swap Extension(see 2.4) to
578 | enable move of swap charges.
579
5808.3 TODO
581
582- Implement madvise(2) to let users decide the vma to be moved or not to be
583 moved.
584- All of moving charge operations are done under cgroup_mutex. It's not good
585 behavior to hold the mutex too long, so we may need some trick.
586
5879. Memory thresholds
588
589Memory cgroup implements memory thresholds using cgroups notification
590API (see cgroups.txt). It allows to register multiple memory and memsw
591thresholds and gets notifications when it crosses.
592
593To register a threshold application need:
594- create an eventfd using eventfd(2);
595- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
596- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
597 cgroup.event_control.
598
599Application will be notified through eventfd when memory usage crosses
600threshold in any direction.
601
602It's applicable for root and non-root cgroup.
603
60410. OOM Control
605
606memory.oom_control file is for OOM notification and other controls.
607
608Memory cgroup implements OOM notifier using cgroup notification
609API (See cgroups.txt). It allows to register multiple OOM notification
610delivery and gets notification when OOM happens.
611
612To register a notifier, application need:
613 - create an eventfd using eventfd(2)
614 - open memory.oom_control file
615 - write string like "<event_fd> <fd of memory.oom_control>" to
616 cgroup.event_control
617
618Application will be notified through eventfd when OOM happens.
619OOM notification doesn't work for root cgroup.
620
621You can disable OOM-killer by writing "1" to memory.oom_control file, as:
622
623 #echo 1 > memory.oom_control
624
625This operation is only allowed to the top cgroup of sub-hierarchy.
626If OOM-killer is disabled, tasks under cgroup will hang/sleep
627in memory cgroup's OOM-waitqueue when they request accountable memory.
628
629For running them, you have to relax the memory cgroup's OOM status by
630 * enlarge limit or reduce usage.
631To reduce usage,
632 * kill some tasks.
633 * move some tasks to other group with account migration.
634 * remove some files (on tmpfs?)
635
636Then, stopped tasks will work again.
637
638At reading, current status of OOM is shown.
639 oom_kill_disable 0 or 1 (if 1, oom-killer is disabled)
640 under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may
641 be stopped.)
642
64311. TODO
418 644
4191. Add support for accounting huge pages (as a separate controller) 6451. Add support for accounting huge pages (as a separate controller)
4202. Make per-cgroup scanner reclaim not-shared pages first 6462. Make per-cgroup scanner reclaim not-shared pages first
diff --git a/Documentation/circular-buffers.txt b/Documentation/circular-buffers.txt
new file mode 100644
index 000000000000..8117e5bf6065
--- /dev/null
+++ b/Documentation/circular-buffers.txt
@@ -0,0 +1,234 @@
1 ================
2 CIRCULAR BUFFERS
3 ================
4
5By: David Howells <dhowells@redhat.com>
6 Paul E. McKenney <paulmck@linux.vnet.ibm.com>
7
8
9Linux provides a number of features that can be used to implement circular
10buffering. There are two sets of such features:
11
12 (1) Convenience functions for determining information about power-of-2 sized
13 buffers.
14
15 (2) Memory barriers for when the producer and the consumer of objects in the
16 buffer don't want to share a lock.
17
18To use these facilities, as discussed below, there needs to be just one
19producer and just one consumer. It is possible to handle multiple producers by
20serialising them, and to handle multiple consumers by serialising them.
21
22
23Contents:
24
25 (*) What is a circular buffer?
26
27 (*) Measuring power-of-2 buffers.
28
29 (*) Using memory barriers with circular buffers.
30 - The producer.
31 - The consumer.
32
33
34==========================
35WHAT IS A CIRCULAR BUFFER?
36==========================
37
38First of all, what is a circular buffer? A circular buffer is a buffer of
39fixed, finite size into which there are two indices:
40
41 (1) A 'head' index - the point at which the producer inserts items into the
42 buffer.
43
44 (2) A 'tail' index - the point at which the consumer finds the next item in
45 the buffer.
46
47Typically when the tail pointer is equal to the head pointer, the buffer is
48empty; and the buffer is full when the head pointer is one less than the tail
49pointer.
50
51The head index is incremented when items are added, and the tail index when
52items are removed. The tail index should never jump the head index, and both
53indices should be wrapped to 0 when they reach the end of the buffer, thus
54allowing an infinite amount of data to flow through the buffer.
55
56Typically, items will all be of the same unit size, but this isn't strictly
57required to use the techniques below. The indices can be increased by more
58than 1 if multiple items or variable-sized items are to be included in the
59buffer, provided that neither index overtakes the other. The implementer must
60be careful, however, as a region more than one unit in size may wrap the end of
61the buffer and be broken into two segments.
62
63
64============================
65MEASURING POWER-OF-2 BUFFERS
66============================
67
68Calculation of the occupancy or the remaining capacity of an arbitrarily sized
69circular buffer would normally be a slow operation, requiring the use of a
70modulus (divide) instruction. However, if the buffer is of a power-of-2 size,
71then a much quicker bitwise-AND instruction can be used instead.
72
73Linux provides a set of macros for handling power-of-2 circular buffers. These
74can be made use of by:
75
76 #include <linux/circ_buf.h>
77
78The macros are:
79
80 (*) Measure the remaining capacity of a buffer:
81
82 CIRC_SPACE(head_index, tail_index, buffer_size);
83
84 This returns the amount of space left in the buffer[1] into which items
85 can be inserted.
86
87
88 (*) Measure the maximum consecutive immediate space in a buffer:
89
90 CIRC_SPACE_TO_END(head_index, tail_index, buffer_size);
91
92 This returns the amount of consecutive space left in the buffer[1] into
93 which items can be immediately inserted without having to wrap back to the
94 beginning of the buffer.
95
96
97 (*) Measure the occupancy of a buffer:
98
99 CIRC_CNT(head_index, tail_index, buffer_size);
100
101 This returns the number of items currently occupying a buffer[2].
102
103
104 (*) Measure the non-wrapping occupancy of a buffer:
105
106 CIRC_CNT_TO_END(head_index, tail_index, buffer_size);
107
108 This returns the number of consecutive items[2] that can be extracted from
109 the buffer without having to wrap back to the beginning of the buffer.
110
111
112Each of these macros will nominally return a value between 0 and buffer_size-1,
113however:
114
115 [1] CIRC_SPACE*() are intended to be used in the producer. To the producer
116 they will return a lower bound as the producer controls the head index,
117 but the consumer may still be depleting the buffer on another CPU and
118 moving the tail index.
119
120 To the consumer it will show an upper bound as the producer may be busy
121 depleting the space.
122
123 [2] CIRC_CNT*() are intended to be used in the consumer. To the consumer they
124 will return a lower bound as the consumer controls the tail index, but the
125 producer may still be filling the buffer on another CPU and moving the
126 head index.
127
128 To the producer it will show an upper bound as the consumer may be busy
129 emptying the buffer.
130
131 [3] To a third party, the order in which the writes to the indices by the
132 producer and consumer become visible cannot be guaranteed as they are
133 independent and may be made on different CPUs - so the result in such a
134 situation will merely be a guess, and may even be negative.
135
136
137===========================================
138USING MEMORY BARRIERS WITH CIRCULAR BUFFERS
139===========================================
140
141By using memory barriers in conjunction with circular buffers, you can avoid
142the need to:
143
144 (1) use a single lock to govern access to both ends of the buffer, thus
145 allowing the buffer to be filled and emptied at the same time; and
146
147 (2) use atomic counter operations.
148
149There are two sides to this: the producer that fills the buffer, and the
150consumer that empties it. Only one thing should be filling a buffer at any one
151time, and only one thing should be emptying a buffer at any one time, but the
152two sides can operate simultaneously.
153
154
155THE PRODUCER
156------------
157
158The producer will look something like this:
159
160 spin_lock(&producer_lock);
161
162 unsigned long head = buffer->head;
163 unsigned long tail = ACCESS_ONCE(buffer->tail);
164
165 if (CIRC_SPACE(head, tail, buffer->size) >= 1) {
166 /* insert one item into the buffer */
167 struct item *item = buffer[head];
168
169 produce_item(item);
170
171 smp_wmb(); /* commit the item before incrementing the head */
172
173 buffer->head = (head + 1) & (buffer->size - 1);
174
175 /* wake_up() will make sure that the head is committed before
176 * waking anyone up */
177 wake_up(consumer);
178 }
179
180 spin_unlock(&producer_lock);
181
182This will instruct the CPU that the contents of the new item must be written
183before the head index makes it available to the consumer and then instructs the
184CPU that the revised head index must be written before the consumer is woken.
185
186Note that wake_up() doesn't have to be the exact mechanism used, but whatever
187is used must guarantee a (write) memory barrier between the update of the head
188index and the change of state of the consumer, if a change of state occurs.
189
190
191THE CONSUMER
192------------
193
194The consumer will look something like this:
195
196 spin_lock(&consumer_lock);
197
198 unsigned long head = ACCESS_ONCE(buffer->head);
199 unsigned long tail = buffer->tail;
200
201 if (CIRC_CNT(head, tail, buffer->size) >= 1) {
202 /* read index before reading contents at that index */
203 smp_read_barrier_depends();
204
205 /* extract one item from the buffer */
206 struct item *item = buffer[tail];
207
208 consume_item(item);
209
210 smp_mb(); /* finish reading descriptor before incrementing tail */
211
212 buffer->tail = (tail + 1) & (buffer->size - 1);
213 }
214
215 spin_unlock(&consumer_lock);
216
217This will instruct the CPU to make sure the index is up to date before reading
218the new item, and then it shall make sure the CPU has finished reading the item
219before it writes the new tail pointer, which will erase the item.
220
221
222Note the use of ACCESS_ONCE() in both algorithms to read the opposition index.
223This prevents the compiler from discarding and reloading its cached value -
224which some compilers will do across smp_read_barrier_depends(). This isn't
225strictly needed if you can be sure that the opposition index will _only_ be
226used the once.
227
228
229===============
230FURTHER READING
231===============
232
233See also Documentation/memory-barriers.txt for a description of Linux's memory
234barrier facilities.
diff --git a/Documentation/connector/cn_test.c b/Documentation/connector/cn_test.c
index b07add3467f1..7764594778d4 100644
--- a/Documentation/connector/cn_test.c
+++ b/Documentation/connector/cn_test.c
@@ -25,6 +25,7 @@
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/moduleparam.h> 26#include <linux/moduleparam.h>
27#include <linux/skbuff.h> 27#include <linux/skbuff.h>
28#include <linux/slab.h>
28#include <linux/timer.h> 29#include <linux/timer.h>
29 30
30#include <linux/connector.h> 31#include <linux/connector.h>
diff --git a/Documentation/connector/connector.txt b/Documentation/connector/connector.txt
index 78c9466a9aa8..e5c5f5e6ab70 100644
--- a/Documentation/connector/connector.txt
+++ b/Documentation/connector/connector.txt
@@ -88,7 +88,7 @@ int cn_netlink_send(struct cn_msg *msg, u32 __groups, int gfp_mask);
88 int gfp_mask - GFP mask. 88 int gfp_mask - GFP mask.
89 89
90 Note: When registering new callback user, connector core assigns 90 Note: When registering new callback user, connector core assigns
91 netlink group to the user which is equal to it's id.idx. 91 netlink group to the user which is equal to its id.idx.
92 92
93/*****************************************/ 93/*****************************************/
94Protocol description. 94Protocol description.
diff --git a/Documentation/console/console.txt b/Documentation/console/console.txt
index 877a1b26cc3d..926cf1b5e63e 100644
--- a/Documentation/console/console.txt
+++ b/Documentation/console/console.txt
@@ -74,7 +74,7 @@ driver takes over the consoles vacated by the driver. Binding, on the other
74hand, will bind the driver to the consoles that are currently occupied by a 74hand, will bind the driver to the consoles that are currently occupied by a
75system driver. 75system driver.
76 76
77NOTE1: Binding and binding must be selected in Kconfig. It's under: 77NOTE1: Binding and unbinding must be selected in Kconfig. It's under:
78 78
79Device Drivers -> Character devices -> Support for binding and unbinding 79Device Drivers -> Character devices -> Support for binding and unbinding
80console drivers 80console drivers
diff --git a/Documentation/cpu-freq/pcc-cpufreq.txt b/Documentation/cpu-freq/pcc-cpufreq.txt
new file mode 100644
index 000000000000..9e3c3b33514c
--- /dev/null
+++ b/Documentation/cpu-freq/pcc-cpufreq.txt
@@ -0,0 +1,207 @@
1/*
2 * pcc-cpufreq.txt - PCC interface documentation
3 *
4 * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
5 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
6 * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2 of the License.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
17 * INFRINGEMENT. See the GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26
27 Processor Clocking Control Driver
28 ---------------------------------
29
30Contents:
31---------
321. Introduction
331.1 PCC interface
341.1.1 Get Average Frequency
351.1.2 Set Desired Frequency
361.2 Platforms affected
372. Driver and /sys details
382.1 scaling_available_frequencies
392.2 cpuinfo_transition_latency
402.3 cpuinfo_cur_freq
412.4 related_cpus
423. Caveats
43
441. Introduction:
45----------------
46Processor Clocking Control (PCC) is an interface between the platform
47firmware and OSPM. It is a mechanism for coordinating processor
48performance (ie: frequency) between the platform firmware and the OS.
49
50The PCC driver (pcc-cpufreq) allows OSPM to take advantage of the PCC
51interface.
52
53OS utilizes the PCC interface to inform platform firmware what frequency the
54OS wants for a logical processor. The platform firmware attempts to achieve
55the requested frequency. If the request for the target frequency could not be
56satisfied by platform firmware, then it usually means that power budget
57conditions are in place, and "power capping" is taking place.
58
591.1 PCC interface:
60------------------
61The complete PCC specification is available here:
62http://www.acpica.org/download/Processor-Clocking-Control-v1p0.pdf
63
64PCC relies on a shared memory region that provides a channel for communication
65between the OS and platform firmware. PCC also implements a "doorbell" that
66is used by the OS to inform the platform firmware that a command has been
67sent.
68
69The ACPI PCCH() method is used to discover the location of the PCC shared
70memory region. The shared memory region header contains the "command" and
71"status" interface. PCCH() also contains details on how to access the platform
72doorbell.
73
74The following commands are supported by the PCC interface:
75* Get Average Frequency
76* Set Desired Frequency
77
78The ACPI PCCP() method is implemented for each logical processor and is
79used to discover the offsets for the input and output buffers in the shared
80memory region.
81
82When PCC mode is enabled, the platform will not expose processor performance
83or throttle states (_PSS, _TSS and related ACPI objects) to OSPM. Therefore,
84the native P-state driver (such as acpi-cpufreq for Intel, powernow-k8 for
85AMD) will not load.
86
87However, OSPM remains in control of policy. The governor (eg: "ondemand")
88computes the required performance for each processor based on server workload.
89The PCC driver fills in the command interface, and the input buffer and
90communicates the request to the platform firmware. The platform firmware is
91responsible for delivering the requested performance.
92
93Each PCC command is "global" in scope and can affect all the logical CPUs in
94the system. Therefore, PCC is capable of performing "group" updates. With PCC
95the OS is capable of getting/setting the frequency of all the logical CPUs in
96the system with a single call to the BIOS.
97
981.1.1 Get Average Frequency:
99----------------------------
100This command is used by the OSPM to query the running frequency of the
101processor since the last time this command was completed. The output buffer
102indicates the average unhalted frequency of the logical processor expressed as
103a percentage of the nominal (ie: maximum) CPU frequency. The output buffer
104also signifies if the CPU frequency is limited by a power budget condition.
105
1061.1.2 Set Desired Frequency:
107----------------------------
108This command is used by the OSPM to communicate to the platform firmware the
109desired frequency for a logical processor. The output buffer is currently
110ignored by OSPM. The next invocation of "Get Average Frequency" will inform
111OSPM if the desired frequency was achieved or not.
112
1131.2 Platforms affected:
114-----------------------
115The PCC driver will load on any system where the platform firmware:
116* supports the PCC interface, and the associated PCCH() and PCCP() methods
117* assumes responsibility for managing the hardware clocking controls in order
118to deliver the requested processor performance
119
120Currently, certain HP ProLiant platforms implement the PCC interface. On those
121platforms PCC is the "default" choice.
122
123However, it is possible to disable this interface via a BIOS setting. In
124such an instance, as is also the case on platforms where the PCC interface
125is not implemented, the PCC driver will fail to load silently.
126
1272. Driver and /sys details:
128---------------------------
129When the driver loads, it merely prints the lowest and the highest CPU
130frequencies supported by the platform firmware.
131
132The PCC driver loads with a message such as:
133pcc-cpufreq: (v1.00.00) driver loaded with frequency limits: 1600 MHz, 2933
134MHz
135
136This means that the OPSM can request the CPU to run at any frequency in
137between the limits (1600 MHz, and 2933 MHz) specified in the message.
138
139Internally, there is no need for the driver to convert the "target" frequency
140to a corresponding P-state.
141
142The VERSION number for the driver will be of the format v.xy.ab.
143eg: 1.00.02
144 ----- --
145 | |
146 | -- this will increase with bug fixes/enhancements to the driver
147 |-- this is the version of the PCC specification the driver adheres to
148
149
150The following is a brief discussion on some of the fields exported via the
151/sys filesystem and how their values are affected by the PCC driver:
152
1532.1 scaling_available_frequencies:
154----------------------------------
155scaling_available_frequencies is not created in /sys. No intermediate
156frequencies need to be listed because the BIOS will try to achieve any
157frequency, within limits, requested by the governor. A frequency does not have
158to be strictly associated with a P-state.
159
1602.2 cpuinfo_transition_latency:
161-------------------------------
162The cpuinfo_transition_latency field is 0. The PCC specification does
163not include a field to expose this value currently.
164
1652.3 cpuinfo_cur_freq:
166---------------------
167A) Often cpuinfo_cur_freq will show a value different than what is declared
168in the scaling_available_frequencies or scaling_cur_freq, or scaling_max_freq.
169This is due to "turbo boost" available on recent Intel processors. If certain
170conditions are met the BIOS can achieve a slightly higher speed than requested
171by OSPM. An example:
172
173scaling_cur_freq : 2933000
174cpuinfo_cur_freq : 3196000
175
176B) There is a round-off error associated with the cpuinfo_cur_freq value.
177Since the driver obtains the current frequency as a "percentage" (%) of the
178nominal frequency from the BIOS, sometimes, the values displayed by
179scaling_cur_freq and cpuinfo_cur_freq may not match. An example:
180
181scaling_cur_freq : 1600000
182cpuinfo_cur_freq : 1583000
183
184In this example, the nominal frequency is 2933 MHz. The driver obtains the
185current frequency, cpuinfo_cur_freq, as 54% of the nominal frequency:
186
187 54% of 2933 MHz = 1583 MHz
188
189Nominal frequency is the maximum frequency of the processor, and it usually
190corresponds to the frequency of the P0 P-state.
191
1922.4 related_cpus:
193-----------------
194The related_cpus field is identical to affected_cpus.
195
196affected_cpus : 4
197related_cpus : 4
198
199Currently, the PCC driver does not evaluate _PSD. The platforms that support
200PCC do not implement SW_ALL. So OSPM doesn't need to perform any coordination
201to ensure that the same frequency is requested of all dependent CPUs.
202
2033. Caveats:
204-----------
205The "cpufreq_stats" module in its present form cannot be loaded and
206expected to work with the PCC driver. Since the "cpufreq_stats" module
207provides information wrt each P-state, it is not applicable to the PCC driver.
diff --git a/Documentation/credentials.txt b/Documentation/credentials.txt
index df03169782ea..a2db35287003 100644
--- a/Documentation/credentials.txt
+++ b/Documentation/credentials.txt
@@ -408,9 +408,6 @@ This should be used inside the RCU read lock, as in the following example:
408 ... 408 ...
409 } 409 }
410 410
411A function need not get RCU read lock to use __task_cred() if it is holding a
412spinlock at the time as this implicitly holds the RCU read lock.
413
414Should it be necessary to hold another task's credentials for a long period of 411Should it be necessary to hold another task's credentials for a long period of
415time, and possibly to sleep whilst doing so, then the caller should get a 412time, and possibly to sleep whilst doing so, then the caller should get a
416reference on them using: 413reference on them using:
@@ -426,17 +423,16 @@ credentials, hiding the RCU magic from the caller:
426 uid_t task_uid(task) Task's real UID 423 uid_t task_uid(task) Task's real UID
427 uid_t task_euid(task) Task's effective UID 424 uid_t task_euid(task) Task's effective UID
428 425
429If the caller is holding a spinlock or the RCU read lock at the time anyway, 426If the caller is holding the RCU read lock at the time anyway, then:
430then:
431 427
432 __task_cred(task)->uid 428 __task_cred(task)->uid
433 __task_cred(task)->euid 429 __task_cred(task)->euid
434 430
435should be used instead. Similarly, if multiple aspects of a task's credentials 431should be used instead. Similarly, if multiple aspects of a task's credentials
436need to be accessed, RCU read lock or a spinlock should be used, __task_cred() 432need to be accessed, RCU read lock should be used, __task_cred() called, the
437called, the result stored in a temporary pointer and then the credential 433result stored in a temporary pointer and then the credential aspects called
438aspects called from that before dropping the lock. This prevents the 434from that before dropping the lock. This prevents the potentially expensive
439potentially expensive RCU magic from being invoked multiple times. 435RCU magic from being invoked multiple times.
440 436
441Should some other single aspect of another task's credentials need to be 437Should some other single aspect of another task's credentials need to be
442accessed, then this can be used: 438accessed, then this can be used:
diff --git a/Documentation/development-process/2.Process b/Documentation/development-process/2.Process
index d750321acd5a..97726eba6102 100644
--- a/Documentation/development-process/2.Process
+++ b/Documentation/development-process/2.Process
@@ -151,7 +151,7 @@ The stages that a patch goes through are, generally:
151 well. 151 well.
152 152
153 - Wider review. When the patch is getting close to ready for mainline 153 - Wider review. When the patch is getting close to ready for mainline
154 inclusion, it will be accepted by a relevant subsystem maintainer - 154 inclusion, it should be accepted by a relevant subsystem maintainer -
155 though this acceptance is not a guarantee that the patch will make it 155 though this acceptance is not a guarantee that the patch will make it
156 all the way to the mainline. The patch will show up in the maintainer's 156 all the way to the mainline. The patch will show up in the maintainer's
157 subsystem tree and into the staging trees (described below). When the 157 subsystem tree and into the staging trees (described below). When the
@@ -159,6 +159,15 @@ The stages that a patch goes through are, generally:
159 the discovery of any problems resulting from the integration of this 159 the discovery of any problems resulting from the integration of this
160 patch with work being done by others. 160 patch with work being done by others.
161 161
162- Please note that most maintainers also have day jobs, so merging
163 your patch may not be their highest priority. If your patch is
164 getting feedback about changes that are needed, you should either
165 make those changes or justify why they should not be made. If your
166 patch has no review complaints but is not being merged by its
167 appropriate subsystem or driver maintainer, you should be persistent
168 in updating the patch to the current kernel so that it applies cleanly
169 and keep sending it for review and merging.
170
162 - Merging into the mainline. Eventually, a successful patch will be 171 - Merging into the mainline. Eventually, a successful patch will be
163 merged into the mainline repository managed by Linus Torvalds. More 172 merged into the mainline repository managed by Linus Torvalds. More
164 comments and/or problems may surface at this time; it is important that 173 comments and/or problems may surface at this time; it is important that
@@ -258,12 +267,8 @@ an appropriate subsystem tree or be sent directly to Linus. In a typical
258development cycle, approximately 10% of the patches going into the mainline 267development cycle, approximately 10% of the patches going into the mainline
259get there via -mm. 268get there via -mm.
260 269
261The current -mm patch can always be found from the front page of 270The current -mm patch is available in the "mmotm" (-mm of the moment)
262 271directory at:
263 http://kernel.org/
264
265Those who want to see the current state of -mm can get the "-mm of the
266moment" tree, found at:
267 272
268 http://userweb.kernel.org/~akpm/mmotm/ 273 http://userweb.kernel.org/~akpm/mmotm/
269 274
@@ -298,6 +303,12 @@ volatility of linux-next tends to make it a difficult development target.
298See http://lwn.net/Articles/289013/ for more information on this topic, and 303See http://lwn.net/Articles/289013/ for more information on this topic, and
299stay tuned; much is still in flux where linux-next is involved. 304stay tuned; much is still in flux where linux-next is involved.
300 305
306Besides the mmotm and linux-next trees, the kernel source tree now contains
307the drivers/staging/ directory and many sub-directories for drivers or
308filesystems that are on their way to being added to the kernel tree
309proper, but they remain in drivers/staging/ while they still need more
310work.
311
301 312
3022.5: TOOLS 3132.5: TOOLS
303 314
@@ -319,9 +330,9 @@ developers; even if they do not use it for their own work, they'll need git
319to keep up with what other developers (and the mainline) are doing. 330to keep up with what other developers (and the mainline) are doing.
320 331
321Git is now packaged by almost all Linux distributions. There is a home 332Git is now packaged by almost all Linux distributions. There is a home
322page at 333page at:
323 334
324 http://git.or.cz/ 335 http://git-scm.com/
325 336
326That page has pointers to documentation and tutorials. One should be 337That page has pointers to documentation and tutorials. One should be
327aware, in particular, of the Kernel Hacker's Guide to git, which has 338aware, in particular, of the Kernel Hacker's Guide to git, which has
diff --git a/Documentation/development-process/7.AdvancedTopics b/Documentation/development-process/7.AdvancedTopics
index a2cf74093aa1..837179447e17 100644
--- a/Documentation/development-process/7.AdvancedTopics
+++ b/Documentation/development-process/7.AdvancedTopics
@@ -25,7 +25,7 @@ long document in its own right. Instead, the focus here will be on how git
25fits into the kernel development process in particular. Developers who 25fits into the kernel development process in particular. Developers who
26wish to come up to speed with git will find more information at: 26wish to come up to speed with git will find more information at:
27 27
28 http://git.or.cz/ 28 http://git-scm.com/
29 29
30 http://www.kernel.org/pub/software/scm/git/docs/user-manual.html 30 http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
31 31
diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.txt
index e3a77b215135..0d5bc46dc167 100644
--- a/Documentation/device-mapper/snapshot.txt
+++ b/Documentation/device-mapper/snapshot.txt
@@ -122,3 +122,47 @@ volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16
122brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real 122brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
123brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow 123brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow
124brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base 124brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base
125
126
127How to determine when a merging is complete
128===========================================
129The snapshot-merge and snapshot status lines end with:
130 <sectors_allocated>/<total_sectors> <metadata_sectors>
131
132Both <sectors_allocated> and <total_sectors> include both data and metadata.
133During merging, the number of sectors allocated gets smaller and
134smaller. Merging has finished when the number of sectors holding data
135is zero, in other words <sectors_allocated> == <metadata_sectors>.
136
137Here is a practical example (using a hybrid of lvm and dmsetup commands):
138
139# lvs
140 LV VG Attr LSize Origin Snap% Move Log Copy% Convert
141 base volumeGroup owi-a- 4.00g
142 snap volumeGroup swi-a- 1.00g base 18.97
143
144# dmsetup status volumeGroup-snap
1450 8388608 snapshot 397896/2097152 1560
146 ^^^^ metadata sectors
147
148# lvconvert --merge -b volumeGroup/snap
149 Merging of volume snap started.
150
151# lvs volumeGroup/snap
152 LV VG Attr LSize Origin Snap% Move Log Copy% Convert
153 base volumeGroup Owi-a- 4.00g 17.23
154
155# dmsetup status volumeGroup-base
1560 8388608 snapshot-merge 281688/2097152 1104
157
158# dmsetup status volumeGroup-base
1590 8388608 snapshot-merge 180480/2097152 712
160
161# dmsetup status volumeGroup-base
1620 8388608 snapshot-merge 16/2097152 16
163
164Merging has finished.
165
166# lvs
167 LV VG Attr LSize Origin Snap% Move Log Copy% Convert
168 base volumeGroup owi-a- 4.00g
diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index 53d64d382343..1d83d124056c 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -443,6 +443,8 @@ Your cooperation is appreciated.
443 231 = /dev/snapshot System memory snapshot device 443 231 = /dev/snapshot System memory snapshot device
444 232 = /dev/kvm Kernel-based virtual machine (hardware virtualization extensions) 444 232 = /dev/kvm Kernel-based virtual machine (hardware virtualization extensions)
445 233 = /dev/kmview View-OS A process with a view 445 233 = /dev/kmview View-OS A process with a view
446 234 = /dev/btrfs-control Btrfs control device
447 235 = /dev/autofs Autofs control device
446 240-254 Reserved for local use 448 240-254 Reserved for local use
447 255 Reserved for MISC_DYNAMIC_MINOR 449 255 Reserved for MISC_DYNAMIC_MINOR
448 450
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 3ad6acead949..d9bcffd59433 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -69,7 +69,6 @@ av_permissions.h
69bbootsect 69bbootsect
70bin2c 70bin2c
71binkernel.spec 71binkernel.spec
72binoffset
73bootsect 72bootsect
74bounds.h 73bounds.h
75bsetup 74bsetup
diff --git a/Documentation/driver-model/platform.txt b/Documentation/driver-model/platform.txt
index 2e2c2ea90ceb..41f41632ee55 100644
--- a/Documentation/driver-model/platform.txt
+++ b/Documentation/driver-model/platform.txt
@@ -192,7 +192,7 @@ command line. This will execute all matching early_param() callbacks.
192User specified early platform devices will be registered at this point. 192User specified early platform devices will be registered at this point.
193For the early serial console case the user can specify port on the 193For the early serial console case the user can specify port on the
194kernel command line as "earlyprintk=serial.0" where "earlyprintk" is 194kernel command line as "earlyprintk=serial.0" where "earlyprintk" is
195the class string, "serial" is the name of the platfrom driver and 195the class string, "serial" is the name of the platform driver and
1960 is the platform device id. If the id is -1 then the dot and the 1960 is the platform device id. If the id is -1 then the dot and the
197id can be omitted. 197id can be omitted.
198 198
diff --git a/Documentation/dvb/ci.txt b/Documentation/dvb/ci.txt
index 2ecd834585e6..4a0c2b56e690 100644
--- a/Documentation/dvb/ci.txt
+++ b/Documentation/dvb/ci.txt
@@ -41,7 +41,7 @@ This application requires the following to function properly as of now.
41 41
42* Cards that fall in this category 42* Cards that fall in this category
43~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 43~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
44At present the cards that fall in this category are the Twinhan and it's 44At present the cards that fall in this category are the Twinhan and its
45clones, these cards are available as VVMER, Tomato, Hercules, Orange and 45clones, these cards are available as VVMER, Tomato, Hercules, Orange and
46so on. 46so on.
47 47
diff --git a/Documentation/dvb/contributors.txt b/Documentation/dvb/contributors.txt
index 4865addebe1c..47c30098dab6 100644
--- a/Documentation/dvb/contributors.txt
+++ b/Documentation/dvb/contributors.txt
@@ -1,7 +1,7 @@
1Thanks go to the following people for patches and contributions: 1Thanks go to the following people for patches and contributions:
2 2
3Michael Hunold <m.hunold@gmx.de> 3Michael Hunold <m.hunold@gmx.de>
4 for the initial saa7146 driver and it's recent overhaul 4 for the initial saa7146 driver and its recent overhaul
5 5
6Christian Theiss 6Christian Theiss
7 for his work on the initial Linux DVB driver 7 for his work on the initial Linux DVB driver
diff --git a/Documentation/dvb/get_dvb_firmware b/Documentation/dvb/get_dvb_firmware
index 14b7b5a3bcb9..239cbdbf4d12 100644
--- a/Documentation/dvb/get_dvb_firmware
+++ b/Documentation/dvb/get_dvb_firmware
@@ -26,7 +26,7 @@ use IO::Handle;
26 "dec3000s", "vp7041", "dibusb", "nxt2002", "nxt2004", 26 "dec3000s", "vp7041", "dibusb", "nxt2002", "nxt2004",
27 "or51211", "or51132_qam", "or51132_vsb", "bluebird", 27 "or51211", "or51132_qam", "or51132_vsb", "bluebird",
28 "opera1", "cx231xx", "cx18", "cx23885", "pvrusb2", "mpc718", 28 "opera1", "cx231xx", "cx18", "cx23885", "pvrusb2", "mpc718",
29 "af9015"); 29 "af9015", "ngene");
30 30
31# Check args 31# Check args
32syntax() if (scalar(@ARGV) != 1); 32syntax() if (scalar(@ARGV) != 1);
@@ -39,7 +39,7 @@ for ($i=0; $i < scalar(@components); $i++) {
39 die $@ if $@; 39 die $@ if $@;
40 print STDERR <<EOF; 40 print STDERR <<EOF;
41Firmware(s) $outfile extracted successfully. 41Firmware(s) $outfile extracted successfully.
42Now copy it(they) to either /usr/lib/hotplug/firmware or /lib/firmware 42Now copy it(them) to either /usr/lib/hotplug/firmware or /lib/firmware
43(depending on configuration of firmware hotplug). 43(depending on configuration of firmware hotplug).
44EOF 44EOF
45 exit(0); 45 exit(0);
@@ -549,6 +549,24 @@ sub af9015 {
549 close INFILE; 549 close INFILE;
550} 550}
551 551
552sub ngene {
553 my $url = "http://www.digitaldevices.de/download/";
554 my $file1 = "ngene_15.fw";
555 my $hash1 = "d798d5a757121174f0dbc5f2833c0c85";
556 my $file2 = "ngene_17.fw";
557 my $hash2 = "26b687136e127b8ac24b81e0eeafc20b";
558
559 checkstandard();
560
561 wgetfile($file1, $url . $file1);
562 verify($file1, $hash1);
563
564 wgetfile($file2, $url . $file2);
565 verify($file2, $hash2);
566
567 "$file1, $file2";
568}
569
552# --------------------------------------------------------------- 570# ---------------------------------------------------------------
553# Utilities 571# Utilities
554 572
@@ -667,6 +685,7 @@ sub delzero{
667sub syntax() { 685sub syntax() {
668 print STDERR "syntax: get_dvb_firmware <component>\n"; 686 print STDERR "syntax: get_dvb_firmware <component>\n";
669 print STDERR "Supported components:\n"; 687 print STDERR "Supported components:\n";
688 @components = sort @components;
670 for($i=0; $i < scalar(@components); $i++) { 689 for($i=0; $i < scalar(@components); $i++) {
671 print STDERR "\t" . $components[$i] . "\n"; 690 print STDERR "\t" . $components[$i] . "\n";
672 } 691 }
diff --git a/Documentation/eisa.txt b/Documentation/eisa.txt
index 60e361ba08c0..f297fc1202ae 100644
--- a/Documentation/eisa.txt
+++ b/Documentation/eisa.txt
@@ -171,7 +171,7 @@ device.
171virtual_root.force_probe : 171virtual_root.force_probe :
172 172
173Force the probing code to probe EISA slots even when it cannot find an 173Force the probing code to probe EISA slots even when it cannot find an
174EISA compliant mainboard (nothing appears on slot 0). Defaultd to 0 174EISA compliant mainboard (nothing appears on slot 0). Defaults to 0
175(don't force), and set to 1 (force probing) when either 175(don't force), and set to 1 (force probing) when either
176CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set. 176CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set.
177 177
diff --git a/Documentation/email-clients.txt b/Documentation/email-clients.txt
index a618efab7b15..945ff3fda433 100644
--- a/Documentation/email-clients.txt
+++ b/Documentation/email-clients.txt
@@ -216,26 +216,14 @@ Works. Use "Insert file..." or external editor.
216~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 216~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
217Gmail (Web GUI) 217Gmail (Web GUI)
218 218
219If you just have to use Gmail to send patches, it CAN be made to work. It 219Does not work for sending patches.
220requires a bit of external help, though. 220
221 221Gmail web client converts tabs to spaces automatically.
222The first problem is that Gmail converts tabs to spaces. This will 222
223totally break your patches. To prevent this, you have to use a different 223At the same time it wraps lines every 78 chars with CRLF style line breaks
224editor. There is a firefox extension called "ViewSourceWith" 224although tab2space problem can be solved with external editor.
225(https://addons.mozilla.org/en-US/firefox/addon/394) which allows you to 225
226edit any text box in the editor of your choice. Configure it to launch 226Another problem is that Gmail will base64-encode any message that has a
227your favorite editor. When you want to send a patch, use this technique. 227non-ASCII character. That includes things like European names.
228Once you have crafted your messsage + patch, save and exit the editor,
229which should reload the Gmail edit box. GMAIL WILL PRESERVE THE TABS.
230Hoorah. Apparently you can cut-n-paste literal tabs, but Gmail will
231convert those to spaces upon sending!
232
233The second problem is that Gmail converts tabs to spaces on replies. If
234you reply to a patch, don't expect to be able to apply it as a patch.
235
236The last problem is that Gmail will base64-encode any message that has a
237non-ASCII character. That includes things like European names. Be aware.
238
239Gmail is not convenient for lkml patches, but CAN be made to work.
240 228
241 ### 229 ###
diff --git a/Documentation/fault-injection/provoke-crashes.txt b/Documentation/fault-injection/provoke-crashes.txt
new file mode 100644
index 000000000000..7a9d3d81525b
--- /dev/null
+++ b/Documentation/fault-injection/provoke-crashes.txt
@@ -0,0 +1,38 @@
1The lkdtm module provides an interface to crash or injure the kernel at
2predefined crashpoints to evaluate the reliability of crash dumps obtained
3using different dumping solutions. The module uses KPROBEs to instrument
4crashing points, but can also crash the kernel directly without KRPOBE
5support.
6
7
8You can provide the way either through module arguments when inserting
9the module, or through a debugfs interface.
10
11Usage: insmod lkdtm.ko [recur_count={>0}] cpoint_name=<> cpoint_type=<>
12 [cpoint_count={>0}]
13
14 recur_count : Recursion level for the stack overflow test. Default is 10.
15
16 cpoint_name : Crash point where the kernel is to be crashed. It can be
17 one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY,
18 FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD,
19 IDE_CORE_CP, DIRECT
20
21 cpoint_type : Indicates the action to be taken on hitting the crash point.
22 It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW,
23 CORRUPT_STACK, UNALIGNED_LOAD_STORE_WRITE, OVERWRITE_ALLOCATION,
24 WRITE_AFTER_FREE,
25
26 cpoint_count : Indicates the number of times the crash point is to be hit
27 to trigger an action. The default is 10.
28
29You can also induce failures by mounting debugfs and writing the type to
30<mountpoint>/provoke-crash/<crashpoint>. E.g.,
31
32 mount -t debugfs debugfs /mnt
33 echo EXCEPTION > /mnt/provoke-crash/INT_HARDWARE_ENTRY
34
35
36A special file is `DIRECT' which will induce the crash directly without
37KPROBE instrumentation. This mode is the only one available when the module
38is built on a kernel without KPROBEs support.
diff --git a/Documentation/fb/imacfb.txt b/Documentation/fb/efifb.txt
index 316ec9bb7deb..a59916c29b33 100644
--- a/Documentation/fb/imacfb.txt
+++ b/Documentation/fb/efifb.txt
@@ -1,9 +1,9 @@
1 1
2What is imacfb? 2What is efifb?
3=============== 3===============
4 4
5This is a generic EFI platform driver for Intel based Apple computers. 5This is a generic EFI platform driver for Intel based Apple computers.
6Imacfb is only for EFI booted Intel Macs. 6efifb is only for EFI booted Intel Macs.
7 7
8Supported Hardware 8Supported Hardware
9================== 9==================
@@ -16,16 +16,16 @@ MacMini
16How to use it? 16How to use it?
17============== 17==============
18 18
19Imacfb does not have any kind of autodetection of your machine. 19efifb does not have any kind of autodetection of your machine.
20You have to add the following kernel parameters in your elilo.conf: 20You have to add the following kernel parameters in your elilo.conf:
21 Macbook : 21 Macbook :
22 video=imacfb:macbook 22 video=efifb:macbook
23 MacMini : 23 MacMini :
24 video=imacfb:mini 24 video=efifb:mini
25 Macbook Pro 15", iMac 17" : 25 Macbook Pro 15", iMac 17" :
26 video=imacfb:i17 26 video=efifb:i17
27 Macbook Pro 17", iMac 20" : 27 Macbook Pro 17", iMac 20" :
28 video=imacfb:i20 28 video=efifb:i20
29 29
30-- 30--
31Edgar Hucek <gimli@dark-green.com> 31Edgar Hucek <gimli@dark-green.com>
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 88a216d18092..672be0109d02 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,21 +6,6 @@ be removed from this file.
6 6
7--------------------------- 7---------------------------
8 8
9What: USER_SCHED
10When: 2.6.34
11
12Why: USER_SCHED was implemented as a proof of concept for group scheduling.
13 The effect of USER_SCHED can already be achieved from userspace with
14 the help of libcgroup. The removal of USER_SCHED will also simplify
15 the scheduler code with the removal of one major ifdef. There are also
16 issues USER_SCHED has with USER_NS. A decision was taken not to fix
17 those and instead remove USER_SCHED. Also new group scheduling
18 features will not be implemented for USER_SCHED.
19
20Who: Dhaval Giani <dhaval@linux.vnet.ibm.com>
21
22---------------------------
23
24What: PRISM54 9What: PRISM54
25When: 2.6.34 10When: 2.6.34
26 11
@@ -64,6 +49,17 @@ Who: Robin Getz <rgetz@blackfin.uclinux.org> & Matt Mackall <mpm@selenic.com>
64 49
65--------------------------- 50---------------------------
66 51
52What: Deprecated snapshot ioctls
53When: 2.6.36
54
55Why: The ioctls in kernel/power/user.c were marked as deprecated long time
56 ago. Now they notify users about that so that they need to replace
57 their userspace. After some more time, remove them completely.
58
59Who: Jiri Slaby <jirislaby@gmail.com>
60
61---------------------------
62
67What: The ieee80211_regdom module parameter 63What: The ieee80211_regdom module parameter
68When: March 2010 / desktop catchup 64When: March 2010 / desktop catchup
69 65
@@ -88,27 +84,6 @@ Who: Luis R. Rodriguez <lrodriguez@atheros.com>
88 84
89--------------------------- 85---------------------------
90 86
91What: CONFIG_WIRELESS_OLD_REGULATORY - old static regulatory information
92When: March 2010 / desktop catchup
93
94Why: The old regulatory infrastructure has been replaced with a new one
95 which does not require statically defined regulatory domains. We do
96 not want to keep static regulatory domains in the kernel due to the
97 the dynamic nature of regulatory law and localization. We kept around
98 the old static definitions for the regulatory domains of:
99
100 * US
101 * JP
102 * EU
103
104 and used by default the US when CONFIG_WIRELESS_OLD_REGULATORY was
105 set. We will remove this option once the standard Linux desktop catches
106 up with the new userspace APIs we have implemented.
107
108Who: Luis R. Rodriguez <lrodriguez@atheros.com>
109
110---------------------------
111
112What: dev->power.power_state 87What: dev->power.power_state
113When: July 2007 88When: July 2007
114Why: Broken design for runtime control over driver power states, confusing 89Why: Broken design for runtime control over driver power states, confusing
@@ -142,19 +117,25 @@ Who: Mauro Carvalho Chehab <mchehab@infradead.org>
142--------------------------- 117---------------------------
143 118
144What: PCMCIA control ioctl (needed for pcmcia-cs [cardmgr, cardctl]) 119What: PCMCIA control ioctl (needed for pcmcia-cs [cardmgr, cardctl])
145When: November 2005 120When: 2.6.35/2.6.36
146Files: drivers/pcmcia/: pcmcia_ioctl.c 121Files: drivers/pcmcia/: pcmcia_ioctl.c
147Why: With the 16-bit PCMCIA subsystem now behaving (almost) like a 122Why: With the 16-bit PCMCIA subsystem now behaving (almost) like a
148 normal hotpluggable bus, and with it using the default kernel 123 normal hotpluggable bus, and with it using the default kernel
149 infrastructure (hotplug, driver core, sysfs) keeping the PCMCIA 124 infrastructure (hotplug, driver core, sysfs) keeping the PCMCIA
150 control ioctl needed by cardmgr and cardctl from pcmcia-cs is 125 control ioctl needed by cardmgr and cardctl from pcmcia-cs is
151 unnecessary, and makes further cleanups and integration of the 126 unnecessary and potentially harmful (it does not provide for
127 proper locking), and makes further cleanups and integration of the
152 PCMCIA subsystem into the Linux kernel device driver model more 128 PCMCIA subsystem into the Linux kernel device driver model more
153 difficult. The features provided by cardmgr and cardctl are either 129 difficult. The features provided by cardmgr and cardctl are either
154 handled by the kernel itself now or are available in the new 130 handled by the kernel itself now or are available in the new
155 pcmciautils package available at 131 pcmciautils package available at
156 http://kernel.org/pub/linux/utils/kernel/pcmcia/ 132 http://kernel.org/pub/linux/utils/kernel/pcmcia/
157Who: Dominik Brodowski <linux@brodo.de> 133
134 For all architectures except ARM, the associated config symbol
135 has been removed from kernel 2.6.34; for ARM, it will be likely
136 be removed from kernel 2.6.35. The actual code will then likely
137 be removed from kernel 2.6.36.
138Who: Dominik Brodowski <linux@dominikbrodowski.net>
158 139
159--------------------------- 140---------------------------
160 141
@@ -260,16 +241,6 @@ Who: Thomas Gleixner <tglx@linutronix.de>
260 241
261--------------------------- 242---------------------------
262 243
263What (Why):
264 - xt_recent: the old ipt_recent proc dir
265 (superseded by /proc/net/xt_recent)
266
267When: January 2009 or Linux 2.7.0, whichever comes first
268Why: Superseded by newer revisions or modules
269Who: Jan Engelhardt <jengelh@computergmbh.de>
270
271---------------------------
272
273What: GPIO autorequest on gpio_direction_{input,output}() in gpiolib 244What: GPIO autorequest on gpio_direction_{input,output}() in gpiolib
274When: February 2010 245When: February 2010
275Why: All callers should use explicit gpio_request()/gpio_free(). 246Why: All callers should use explicit gpio_request()/gpio_free().
@@ -468,12 +439,6 @@ Who: Alok N Kataria <akataria@vmware.com>
468 439
469---------------------------- 440----------------------------
470 441
471What: adt7473 hardware monitoring driver
472When: February 2010
473Why: Obsoleted by the adt7475 driver.
474Who: Jean Delvare <khali@linux-fr.org>
475
476---------------------------
477What: Support for lcd_switch and display_get in asus-laptop driver 442What: Support for lcd_switch and display_get in asus-laptop driver
478When: March 2010 443When: March 2010
479Why: These two features use non-standard interfaces. There are the 444Why: These two features use non-standard interfaces. There are the
@@ -545,6 +510,142 @@ Who: Hans de Goede <hdegoede@redhat.com>
545 510
546---------------------------- 511----------------------------
547 512
513What: sysfs-class-rfkill state file
514When: Feb 2014
515Files: net/rfkill/core.c
516Why: Documented as obsolete since Feb 2010. This file is limited to 3
517 states while the rfkill drivers can have 4 states.
518Who: anybody or Florian Mickler <florian@mickler.org>
519
520----------------------------
521
522What: sysfs-class-rfkill claim file
523When: Feb 2012
524Files: net/rfkill/core.c
525Why: It is not possible to claim an rfkill driver since 2007. This is
526 Documented as obsolete since Feb 2010.
527Who: anybody or Florian Mickler <florian@mickler.org>
528
529----------------------------
530
531What: capifs
532When: February 2011
533Files: drivers/isdn/capi/capifs.*
534Why: udev fully replaces this special file system that only contains CAPI
535 NCCI TTY device nodes. User space (pppdcapiplugin) works without
536 noticing the difference.
537Who: Jan Kiszka <jan.kiszka@web.de>
538
539----------------------------
540
541What: KVM memory aliases support
542When: July 2010
543Why: Memory aliasing support is used for speeding up guest vga access
544 through the vga windows.
545
546 Modern userspace no longer uses this feature, so it's just bitrotted
547 code and can be removed with no impact.
548Who: Avi Kivity <avi@redhat.com>
549
550----------------------------
551
552What: xtime, wall_to_monotonic
553When: 2.6.36+
554Files: kernel/time/timekeeping.c include/linux/time.h
555Why: Cleaning up timekeeping internal values. Please use
556 existing timekeeping accessor functions to access
557 the equivalent functionality.
558Who: John Stultz <johnstul@us.ibm.com>
559
560----------------------------
561
562What: KVM kernel-allocated memory slots
563When: July 2010
564Why: Since 2.6.25, kvm supports user-allocated memory slots, which are
565 much more flexible than kernel-allocated slots. All current userspace
566 supports the newer interface and this code can be removed with no
567 impact.
568Who: Avi Kivity <avi@redhat.com>
569
570----------------------------
571
572What: KVM paravirt mmu host support
573When: January 2011
574Why: The paravirt mmu host support is slower than non-paravirt mmu, both
575 on newer and older hardware. It is already not exposed to the guest,
576 and kept only for live migration purposes.
577Who: Avi Kivity <avi@redhat.com>
578
579----------------------------
580
581What: "acpi=ht" boot option
582When: 2.6.35
583Why: Useful in 2003, implementation is a hack.
584 Generally invoked by accident today.
585 Seen as doing more harm than good.
586Who: Len Brown <len.brown@intel.com>
587
588----------------------------
589
590What: iwlwifi 50XX module parameters
591When: 2.6.40
592Why: The "..50" modules parameters were used to configure 5000 series and
593 up devices; different set of module parameters also available for 4965
594 with same functionalities. Consolidate both set into single place
595 in drivers/net/wireless/iwlwifi/iwl-agn.c
596
597Who: Wey-Yi Guy <wey-yi.w.guy@intel.com>
598
599----------------------------
600
601What: iwl4965 alias support
602When: 2.6.40
603Why: Internal alias support has been present in module-init-tools for some
604 time, the MODULE_ALIAS("iwl4965") boilerplate aliases can be removed
605 with no impact.
606
607Who: Wey-Yi Guy <wey-yi.w.guy@intel.com>
608
609---------------------------
610
611What: xt_NOTRACK
612Files: net/netfilter/xt_NOTRACK.c
613When: April 2011
614Why: Superseded by xt_CT
615Who: Netfilter developer team <netfilter-devel@vger.kernel.org>
616
617---------------------------
618
619What: video4linux /dev/vtx teletext API support
620When: 2.6.35
621Files: drivers/media/video/saa5246a.c drivers/media/video/saa5249.c
622 include/linux/videotext.h
623Why: The vtx device nodes have been superseded by vbi device nodes
624 for many years. No applications exist that use the vtx support.
625 Of the two i2c drivers that actually support this API the saa5249
626 has been impossible to use for a year now and no known hardware
627 that supports this device exists. The saa5246a is theoretically
628 supported by the old mxb boards, but it never actually worked.
629
630 In summary: there is no hardware that can use this API and there
631 are no applications actually implementing this API.
632
633 The vtx support still reserves minors 192-223 and we would really
634 like to reuse those for upcoming new functionality. In the unlikely
635 event that new hardware appears that wants to use the functionality
636 provided by the vtx API, then that functionality should be build
637 around the sliced VBI API instead.
638Who: Hans Verkuil <hverkuil@xs4all.nl>
639
640----------------------------
641
642What: IRQF_DISABLED
643When: 2.6.36
644Why: The flag is a NOOP as we run interrupt handlers with interrupts disabled
645Who: Thomas Gleixner <tglx@linutronix.de>
646
647----------------------------
648
548What: old ieee1394 subsystem (CONFIG_IEEE1394) 649What: old ieee1394 subsystem (CONFIG_IEEE1394)
549When: 2.6.37 650When: 2.6.37
550Files: drivers/ieee1394/ except init_ohci1394_dma.c 651Files: drivers/ieee1394/ except init_ohci1394_dma.c
@@ -552,3 +653,6 @@ Why: superseded by drivers/firewire/ (CONFIG_FIREWIRE) which offers more
552 features, better performance, and better security, all with smaller 653 features, better performance, and better security, all with smaller
553 and more modern code base 654 and more modern code base
554Who: Stefan Richter <stefanr@s5r6.in-berlin.de> 655Who: Stefan Richter <stefanr@s5r6.in-berlin.de>
656
657----------------------------
658
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 875d49696b6e..4303614b5add 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -16,6 +16,8 @@ befs.txt
16 - information about the BeOS filesystem for Linux. 16 - information about the BeOS filesystem for Linux.
17bfs.txt 17bfs.txt
18 - info for the SCO UnixWare Boot Filesystem (BFS). 18 - info for the SCO UnixWare Boot Filesystem (BFS).
19ceph.txt
20 - info for the Ceph Distributed File System
19cifs.txt 21cifs.txt
20 - description of the CIFS filesystem. 22 - description of the CIFS filesystem.
21coda.txt 23coda.txt
@@ -32,6 +34,8 @@ dlmfs.txt
32 - info on the userspace interface to the OCFS2 DLM. 34 - info on the userspace interface to the OCFS2 DLM.
33dnotify.txt 35dnotify.txt
34 - info about directory notification in Linux. 36 - info about directory notification in Linux.
37dnotify_test.c
38 - example program for dnotify
35ecryptfs.txt 39ecryptfs.txt
36 - docs on eCryptfs: stacked cryptographic filesystem for Linux. 40 - docs on eCryptfs: stacked cryptographic filesystem for Linux.
37exofs.txt 41exofs.txt
@@ -62,6 +66,8 @@ jfs.txt
62 - info and mount options for the JFS filesystem. 66 - info and mount options for the JFS filesystem.
63locks.txt 67locks.txt
64 - info on file locking implementations, flock() vs. fcntl(), etc. 68 - info on file locking implementations, flock() vs. fcntl(), etc.
69logfs.txt
70 - info on the LogFS flash filesystem.
65mandatory-locking.txt 71mandatory-locking.txt
66 - info on the Linux implementation of Sys V mandatory file locking. 72 - info on the Linux implementation of Sys V mandatory file locking.
67ncpfs.txt 73ncpfs.txt
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt
index 57e0b80a5274..c0236e753bc8 100644
--- a/Documentation/filesystems/9p.txt
+++ b/Documentation/filesystems/9p.txt
@@ -37,6 +37,15 @@ For Plan 9 From User Space applications (http://swtch.com/plan9)
37 37
38 mount -t 9p `namespace`/acme /mnt/9 -o trans=unix,uname=$USER 38 mount -t 9p `namespace`/acme /mnt/9 -o trans=unix,uname=$USER
39 39
40For server running on QEMU host with virtio transport:
41
42 mount -t 9p -o trans=virtio <mount_tag> /mnt/9
43
44where mount_tag is the tag associated by the server to each of the exported
45mount points. Each 9P export is seen by the client as a virtio device with an
46associated "mount_tag" property. Available mount tags can be
47seen by reading /sys/bus/virtio/drivers/9pnet_virtio/virtio<n>/mount_tag files.
48
40OPTIONS 49OPTIONS
41======= 50=======
42 51
@@ -47,7 +56,7 @@ OPTIONS
47 fd - used passed file descriptors for connection 56 fd - used passed file descriptors for connection
48 (see rfdno and wfdno) 57 (see rfdno and wfdno)
49 virtio - connect to the next virtio channel available 58 virtio - connect to the next virtio channel available
50 (from lguest or KVM with trans_virtio module) 59 (from QEMU with trans_virtio module)
51 rdma - connect to a specified RDMA channel 60 rdma - connect to a specified RDMA channel
52 61
53 uname=name user name to attempt mount as on the remote server. The 62 uname=name user name to attempt mount as on the remote server. The
@@ -85,7 +94,12 @@ OPTIONS
85 94
86 port=n port to connect to on the remote server 95 port=n port to connect to on the remote server
87 96
88 noextend force legacy mode (no 9p2000.u semantics) 97 noextend force legacy mode (no 9p2000.u or 9p2000.L semantics)
98
99 version=name Select 9P protocol version. Valid options are:
100 9p2000 - Legacy mode (same as noextend)
101 9p2000.u - Use 9P2000.u protocol
102 9p2000.L - Use 9P2000.L protocol
89 103
90 dfltuid attempt to mount as a particular uid 104 dfltuid attempt to mount as a particular uid
91 105
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 18b9d0ca0630..61c98f03baa1 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -178,7 +178,7 @@ prototypes:
178locking rules: 178locking rules:
179 All except set_page_dirty may block 179 All except set_page_dirty may block
180 180
181 BKL PageLocked(page) i_sem 181 BKL PageLocked(page) i_mutex
182writepage: no yes, unlocks (see below) 182writepage: no yes, unlocks (see below)
183readpage: no yes, unlocks 183readpage: no yes, unlocks
184sync_page: no maybe 184sync_page: no maybe
@@ -429,8 +429,9 @@ check_flags: no
429implementations. If your fs is not using generic_file_llseek, you 429implementations. If your fs is not using generic_file_llseek, you
430need to acquire and release the appropriate locks in your ->llseek(). 430need to acquire and release the appropriate locks in your ->llseek().
431For many filesystems, it is probably safe to acquire the inode 431For many filesystems, it is probably safe to acquire the inode
432semaphore. Note some filesystems (i.e. remote ones) provide no 432mutex or just to use i_size_read() instead.
433protection for i_size so you will need to use the BKL. 433Note: this does not protect the file->f_pos against concurrent modifications
434since this is something the userspace has to take care about.
434 435
435Note: ext2_release() was *the* source of contention on fs-intensive 436Note: ext2_release() was *the* source of contention on fs-intensive
436loads and dropping BKL on ->release() helps to get rid of that (we still 437loads and dropping BKL on ->release() helps to get rid of that (we still
@@ -460,13 +461,6 @@ in sys_read() and friends.
460 461
461--------------------------- dquot_operations ------------------------------- 462--------------------------- dquot_operations -------------------------------
462prototypes: 463prototypes:
463 int (*initialize) (struct inode *, int);
464 int (*drop) (struct inode *);
465 int (*alloc_space) (struct inode *, qsize_t, int);
466 int (*alloc_inode) (const struct inode *, unsigned long);
467 int (*free_space) (struct inode *, qsize_t);
468 int (*free_inode) (const struct inode *, unsigned long);
469 int (*transfer) (struct inode *, struct iattr *);
470 int (*write_dquot) (struct dquot *); 464 int (*write_dquot) (struct dquot *);
471 int (*acquire_dquot) (struct dquot *); 465 int (*acquire_dquot) (struct dquot *);
472 int (*release_dquot) (struct dquot *); 466 int (*release_dquot) (struct dquot *);
@@ -479,13 +473,6 @@ a proper locking wrt the filesystem and call the generic quota operations.
479What filesystem should expect from the generic quota functions: 473What filesystem should expect from the generic quota functions:
480 474
481 FS recursion Held locks when called 475 FS recursion Held locks when called
482initialize: yes maybe dqonoff_sem
483drop: yes -
484alloc_space: ->mark_dirty() -
485alloc_inode: ->mark_dirty() -
486free_space: ->mark_dirty() -
487free_inode: ->mark_dirty() -
488transfer: yes -
489write_dquot: yes dqonoff_sem or dqptr_sem 476write_dquot: yes dqonoff_sem or dqptr_sem
490acquire_dquot: yes dqonoff_sem or dqptr_sem 477acquire_dquot: yes dqonoff_sem or dqptr_sem
491release_dquot: yes dqonoff_sem or dqptr_sem 478release_dquot: yes dqonoff_sem or dqptr_sem
@@ -495,10 +482,6 @@ write_info: yes dqonoff_sem
495FS recursion means calling ->quota_read() and ->quota_write() from superblock 482FS recursion means calling ->quota_read() and ->quota_write() from superblock
496operations. 483operations.
497 484
498->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called
499only directly by the filesystem and do not call any fs functions only
500the ->mark_dirty() operation.
501
502More details about quota locking can be found in fs/dquot.c. 485More details about quota locking can be found in fs/dquot.c.
503 486
504--------------------------- vm_operations_struct ----------------------------- 487--------------------------- vm_operations_struct -----------------------------
diff --git a/Documentation/filesystems/Makefile b/Documentation/filesystems/Makefile
new file mode 100644
index 000000000000..a5dd114da14f
--- /dev/null
+++ b/Documentation/filesystems/Makefile
@@ -0,0 +1,8 @@
1# kbuild trick to avoid linker error. Can be omitted if a module is built.
2obj- := dummy.o
3
4# List of programs to build
5hostprogs-y := dnotify_test
6
7# Tell kbuild to always build the programs
8always := $(hostprogs-y)
diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt
index 8f78ded4b648..51986bf08a4d 100644
--- a/Documentation/filesystems/autofs4-mount-control.txt
+++ b/Documentation/filesystems/autofs4-mount-control.txt
@@ -146,7 +146,7 @@ found to be inadequate, in this case. The Generic Netlink system was
146used for this as raw Netlink would lead to a significant increase in 146used for this as raw Netlink would lead to a significant increase in
147complexity. There's no question that the Generic Netlink system is an 147complexity. There's no question that the Generic Netlink system is an
148elegant solution for common case ioctl functions but it's not a complete 148elegant solution for common case ioctl functions but it's not a complete
149replacement probably because it's primary purpose in life is to be a 149replacement probably because its primary purpose in life is to be a
150message bus implementation rather than specifically an ioctl replacement. 150message bus implementation rather than specifically an ioctl replacement.
151While it would be possible to work around this there is one concern 151While it would be possible to work around this there is one concern
152that lead to the decision to not use it. This is that the autofs 152that lead to the decision to not use it. This is that the autofs
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
new file mode 100644
index 000000000000..763d8ebbbebd
--- /dev/null
+++ b/Documentation/filesystems/ceph.txt
@@ -0,0 +1,140 @@
1Ceph Distributed File System
2============================
3
4Ceph is a distributed network file system designed to provide good
5performance, reliability, and scalability.
6
7Basic features include:
8
9 * POSIX semantics
10 * Seamless scaling from 1 to many thousands of nodes
11 * High availability and reliability. No single point of failure.
12 * N-way replication of data across storage nodes
13 * Fast recovery from node failures
14 * Automatic rebalancing of data on node addition/removal
15 * Easy deployment: most FS components are userspace daemons
16
17Also,
18 * Flexible snapshots (on any directory)
19 * Recursive accounting (nested files, directories, bytes)
20
21In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
22on symmetric access by all clients to shared block devices, Ceph
23separates data and metadata management into independent server
24clusters, similar to Lustre. Unlike Lustre, however, metadata and
25storage nodes run entirely as user space daemons. Storage nodes
26utilize btrfs to store data objects, leveraging its advanced features
27(checksumming, metadata replication, etc.). File data is striped
28across storage nodes in large chunks to distribute workload and
29facilitate high throughputs. When storage nodes fail, data is
30re-replicated in a distributed fashion by the storage nodes themselves
31(with some minimal coordination from a cluster monitor), making the
32system extremely efficient and scalable.
33
34Metadata servers effectively form a large, consistent, distributed
35in-memory cache above the file namespace that is extremely scalable,
36dynamically redistributes metadata in response to workload changes,
37and can tolerate arbitrary (well, non-Byzantine) node failures. The
38metadata server takes a somewhat unconventional approach to metadata
39storage to significantly improve performance for common workloads. In
40particular, inodes with only a single link are embedded in
41directories, allowing entire directories of dentries and inodes to be
42loaded into its cache with a single I/O operation. The contents of
43extremely large directories can be fragmented and managed by
44independent metadata servers, allowing scalable concurrent access.
45
46The system offers automatic data rebalancing/migration when scaling
47from a small cluster of just a few nodes to many hundreds, without
48requiring an administrator carve the data set into static volumes or
49go through the tedious process of migrating data between servers.
50When the file system approaches full, new nodes can be easily added
51and things will "just work."
52
53Ceph includes flexible snapshot mechanism that allows a user to create
54a snapshot on any subdirectory (and its nested contents) in the
55system. Snapshot creation and deletion are as simple as 'mkdir
56.snap/foo' and 'rmdir .snap/foo'.
57
58Ceph also provides some recursive accounting on directories for nested
59files and bytes. That is, a 'getfattr -d foo' on any directory in the
60system will reveal the total number of nested regular files and
61subdirectories, and a summation of all nested file sizes. This makes
62the identification of large disk space consumers relatively quick, as
63no 'du' or similar recursive scan of the file system is required.
64
65
66Mount Syntax
67============
68
69The basic mount syntax is:
70
71 # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt
72
73You only need to specify a single monitor, as the client will get the
74full list when it connects. (However, if the monitor you specify
75happens to be down, the mount won't succeed.) The port can be left
76off if the monitor is using the default. So if the monitor is at
771.2.3.4,
78
79 # mount -t ceph 1.2.3.4:/ /mnt/ceph
80
81is sufficient. If /sbin/mount.ceph is installed, a hostname can be
82used instead of an IP address.
83
84
85
86Mount Options
87=============
88
89 ip=A.B.C.D[:N]
90 Specify the IP and/or port the client should bind to locally.
91 There is normally not much reason to do this. If the IP is not
92 specified, the client's IP address is determined by looking at the
93 address its connection to the monitor originates from.
94
95 wsize=X
96 Specify the maximum write size in bytes. By default there is no
97 maximum. Ceph will normally size writes based on the file stripe
98 size.
99
100 rsize=X
101 Specify the maximum readahead.
102
103 mount_timeout=X
104 Specify the timeout value for mount (in seconds), in the case
105 of a non-responsive Ceph file system. The default is 30
106 seconds.
107
108 rbytes
109 When stat() is called on a directory, set st_size to 'rbytes',
110 the summation of file sizes over all files nested beneath that
111 directory. This is the default.
112
113 norbytes
114 When stat() is called on a directory, set st_size to the
115 number of entries in that directory.
116
117 nocrc
118 Disable CRC32C calculation for data writes. If set, the storage node
119 must rely on TCP's error correction to detect data corruption
120 in the data payload.
121
122 noasyncreaddir
123 Disable client's use its local cache to satisfy readdir
124 requests. (This does not change correctness; the client uses
125 cached metadata only when a lease or capability ensures it is
126 valid.)
127
128
129More Information
130================
131
132For more information on Ceph, see the home page at
133 http://ceph.newdream.net/
134
135The Linux kernel client source tree is available at
136 git://ceph.newdream.net/git/ceph-client.git
137 git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
138
139and the source for the full system is at
140 git://ceph.newdream.net/git/ceph.git
diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt
index 4c0c575a4012..79334ed5daa7 100644
--- a/Documentation/filesystems/dentry-locking.txt
+++ b/Documentation/filesystems/dentry-locking.txt
@@ -62,7 +62,8 @@ changes are :
622. Insertion of a dentry into the hash table is done using 622. Insertion of a dentry into the hash table is done using
63 hlist_add_head_rcu() which take care of ordering the writes - the 63 hlist_add_head_rcu() which take care of ordering the writes - the
64 writes to the dentry must be visible before the dentry is 64 writes to the dentry must be visible before the dentry is
65 inserted. This works in conjunction with hlist_for_each_rcu() while 65 inserted. This works in conjunction with hlist_for_each_rcu(),
66 which has since been replaced by hlist_for_each_entry_rcu(), while
66 walking the hash chain. The only requirement is that all 67 walking the hash chain. The only requirement is that all
67 initialization to the dentry must be done before 68 initialization to the dentry must be done before
68 hlist_add_head_rcu() since we don't have dcache_lock protection 69 hlist_add_head_rcu() since we don't have dcache_lock protection
diff --git a/Documentation/filesystems/dlmfs.txt b/Documentation/filesystems/dlmfs.txt
index c50bbb2d52b4..1b528b2ad809 100644
--- a/Documentation/filesystems/dlmfs.txt
+++ b/Documentation/filesystems/dlmfs.txt
@@ -47,7 +47,7 @@ You'll want to start heartbeating on a volume which all the nodes in
47your lockspace can access. The easiest way to do this is via 47your lockspace can access. The easiest way to do this is via
48ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires 48ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires
49that an OCFS2 file system be in place so that it can automatically 49that an OCFS2 file system be in place so that it can automatically
50find it's heartbeat area, though it will eventually support heartbeat 50find its heartbeat area, though it will eventually support heartbeat
51against raw disks. 51against raw disks.
52 52
53Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed 53Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed
diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.txt
index 9f5d338ddbb8..6baf88f46859 100644
--- a/Documentation/filesystems/dnotify.txt
+++ b/Documentation/filesystems/dnotify.txt
@@ -62,38 +62,9 @@ disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL.
62 62
63Example 63Example
64------- 64-------
65See Documentation/filesystems/dnotify_test.c for an example.
65 66
66 #define _GNU_SOURCE /* needed to get the defines */ 67NOTE
67 #include <fcntl.h> /* in glibc 2.2 this has the needed 68----
68 values defined */ 69Beginning with Linux 2.6.13, dnotify has been replaced by inotify.
69 #include <signal.h> 70See Documentation/filesystems/inotify.txt for more information on it.
70 #include <stdio.h>
71 #include <unistd.h>
72
73 static volatile int event_fd;
74
75 static void handler(int sig, siginfo_t *si, void *data)
76 {
77 event_fd = si->si_fd;
78 }
79
80 int main(void)
81 {
82 struct sigaction act;
83 int fd;
84
85 act.sa_sigaction = handler;
86 sigemptyset(&act.sa_mask);
87 act.sa_flags = SA_SIGINFO;
88 sigaction(SIGRTMIN + 1, &act, NULL);
89
90 fd = open(".", O_RDONLY);
91 fcntl(fd, F_SETSIG, SIGRTMIN + 1);
92 fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
93 /* we will now be notified if any of the files
94 in "." is modified or new files are created */
95 while (1) {
96 pause();
97 printf("Got event on fd=%d\n", event_fd);
98 }
99 }
diff --git a/Documentation/filesystems/dnotify_test.c b/Documentation/filesystems/dnotify_test.c
new file mode 100644
index 000000000000..8b37b4a1e18d
--- /dev/null
+++ b/Documentation/filesystems/dnotify_test.c
@@ -0,0 +1,34 @@
1#define _GNU_SOURCE /* needed to get the defines */
2#include <fcntl.h> /* in glibc 2.2 this has the needed
3 values defined */
4#include <signal.h>
5#include <stdio.h>
6#include <unistd.h>
7
8static volatile int event_fd;
9
10static void handler(int sig, siginfo_t *si, void *data)
11{
12 event_fd = si->si_fd;
13}
14
15int main(void)
16{
17 struct sigaction act;
18 int fd;
19
20 act.sa_sigaction = handler;
21 sigemptyset(&act.sa_mask);
22 act.sa_flags = SA_SIGINFO;
23 sigaction(SIGRTMIN + 1, &act, NULL);
24
25 fd = open(".", O_RDONLY);
26 fcntl(fd, F_SETSIG, SIGRTMIN + 1);
27 fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
28 /* we will now be notified if any of the files
29 in "." is modified or new files are created */
30 while (1) {
31 pause();
32 printf("Got event on fd=%d\n", event_fd);
33 }
34}
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 867c5b50cb42..272f80d5f966 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -59,8 +59,19 @@ commit=nrsec (*) Ext3 can be told to sync all its data and metadata
59 Setting it to very large values will improve 59 Setting it to very large values will improve
60 performance. 60 performance.
61 61
62barrier=1 This enables/disables barriers. barrier=0 disables 62barrier=<0(*)|1> This enables/disables the use of write barriers in
63 it, barrier=1 enables it. 63barrier the jbd code. barrier=0 disables, barrier=1 enables.
64nobarrier (*) This also requires an IO stack which can support
65 barriers, and if jbd gets an error on a barrier
66 write, it will disable again with a warning.
67 Write barriers enforce proper on-disk ordering
68 of journal commits, making volatile disk write caches
69 safe to use, at some performance penalty. If
70 your disks are battery-backed in one way or another,
71 disabling barriers may safely improve performance.
72 The mount options "barrier" and "nobarrier" can
73 also be used to enable or disable barriers, for
74 consistency with other ext3 mount options.
64 75
65orlov (*) This enables the new Orlov block allocator. It is 76orlov (*) This enables the new Orlov block allocator. It is
66 enabled by default. 77 enabled by default.
diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt
index 606233cd4618..1b805a0efbb0 100644
--- a/Documentation/filesystems/fiemap.txt
+++ b/Documentation/filesystems/fiemap.txt
@@ -38,7 +38,7 @@ flags, it will return EBADR and the contents of fm_flags will contain
38the set of flags which caused the error. If the kernel is compatible 38the set of flags which caused the error. If the kernel is compatible
39with all flags passed, the contents of fm_flags will be unmodified. 39with all flags passed, the contents of fm_flags will be unmodified.
40It is up to userspace to determine whether rejection of a particular 40It is up to userspace to determine whether rejection of a particular
41flag is fatal to it's operation. This scheme is intended to allow the 41flag is fatal to its operation. This scheme is intended to allow the
42fiemap interface to grow in the future but without losing 42fiemap interface to grow in the future but without losing
43compatibility with old software. 43compatibility with old software.
44 44
@@ -56,7 +56,7 @@ If this flag is set, the kernel will sync the file before mapping extents.
56 56
57* FIEMAP_FLAG_XATTR 57* FIEMAP_FLAG_XATTR
58If this flag is set, the extents returned will describe the inodes 58If this flag is set, the extents returned will describe the inodes
59extended attribute lookup tree, instead of it's data tree. 59extended attribute lookup tree, instead of its data tree.
60 60
61 61
62Extent Mapping 62Extent Mapping
@@ -89,7 +89,7 @@ struct fiemap_extent {
89}; 89};
90 90
91All offsets and lengths are in bytes and mirror those on disk. It is valid 91All offsets and lengths are in bytes and mirror those on disk. It is valid
92for an extents logical offset to start before the request or it's logical 92for an extents logical offset to start before the request or its logical
93length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is 93length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is
94returned, fe_logical, fe_physical, and fe_length will be aligned to the 94returned, fe_logical, fe_physical, and fe_length will be aligned to the
95block size of the file system. With the exception of extents flagged as 95block size of the file system. With the exception of extents flagged as
@@ -125,7 +125,7 @@ been allocated for the file yet.
125 125
126* FIEMAP_EXTENT_DELALLOC 126* FIEMAP_EXTENT_DELALLOC
127 - This will also set FIEMAP_EXTENT_UNKNOWN. 127 - This will also set FIEMAP_EXTENT_UNKNOWN.
128Delayed allocation - while there is data for this extent, it's 128Delayed allocation - while there is data for this extent, its
129physical location has not been allocated yet. 129physical location has not been allocated yet.
130 130
131* FIEMAP_EXTENT_ENCODED 131* FIEMAP_EXTENT_ENCODED
@@ -159,7 +159,7 @@ Data is located within a meta data block.
159Data is packed into a block with data from other files. 159Data is packed into a block with data from other files.
160 160
161* FIEMAP_EXTENT_UNWRITTEN 161* FIEMAP_EXTENT_UNWRITTEN
162Unwritten extent - the extent is allocated but it's data has not been 162Unwritten extent - the extent is allocated but its data has not been
163initialized. This indicates the extent's data will be all zero if read 163initialized. This indicates the extent's data will be all zero if read
164through the filesystem but the contents are undefined if read directly from 164through the filesystem but the contents are undefined if read directly from
165the device. 165the device.
@@ -176,7 +176,7 @@ VFS -> File System Implementation
176 176
177File systems wishing to support fiemap must implement a ->fiemap callback on 177File systems wishing to support fiemap must implement a ->fiemap callback on
178their inode_operations structure. The fs ->fiemap call is responsible for 178their inode_operations structure. The fs ->fiemap call is responsible for
179defining it's set of supported fiemap flags, and calling a helper function on 179defining its set of supported fiemap flags, and calling a helper function on
180each discovered extent: 180each discovered extent:
181 181
182struct inode_operations { 182struct inode_operations {
diff --git a/Documentation/filesystems/fuse.txt b/Documentation/filesystems/fuse.txt
index 397a41adb4c3..13af4a49e7db 100644
--- a/Documentation/filesystems/fuse.txt
+++ b/Documentation/filesystems/fuse.txt
@@ -91,7 +91,7 @@ Mount options
91'default_permissions' 91'default_permissions'
92 92
93 By default FUSE doesn't check file access permissions, the 93 By default FUSE doesn't check file access permissions, the
94 filesystem is free to implement it's access policy or leave it to 94 filesystem is free to implement its access policy or leave it to
95 the underlying file access mechanism (e.g. in case of network 95 the underlying file access mechanism (e.g. in case of network
96 filesystems). This option enables permission checking, restricting 96 filesystems). This option enables permission checking, restricting
97 access based on file mode. It is usually useful together with the 97 access based on file mode. It is usually useful together with the
@@ -171,7 +171,7 @@ or may honor them by sending a reply to the _original_ request, with
171the error set to EINTR. 171the error set to EINTR.
172 172
173It is also possible that there's a race between processing the 173It is also possible that there's a race between processing the
174original request and it's INTERRUPT request. There are two possibilities: 174original request and its INTERRUPT request. There are two possibilities:
175 175
176 1) The INTERRUPT request is processed before the original request is 176 1) The INTERRUPT request is processed before the original request is
177 processed 177 processed
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
index 5e3ab8f3beff..0b59c0200912 100644
--- a/Documentation/filesystems/gfs2.txt
+++ b/Documentation/filesystems/gfs2.txt
@@ -1,7 +1,7 @@
1Global File System 1Global File System
2------------------ 2------------------
3 3
4http://sources.redhat.com/cluster/ 4http://sources.redhat.com/cluster/wiki/
5 5
6GFS is a cluster file system. It allows a cluster of computers to 6GFS is a cluster file system. It allows a cluster of computers to
7simultaneously use a block device that is shared between them (with FC, 7simultaneously use a block device that is shared between them (with FC,
@@ -36,11 +36,11 @@ GFS2 is not on-disk compatible with previous versions of GFS, but it
36is pretty close. 36is pretty close.
37 37
38The following man pages can be found at the URL above: 38The following man pages can be found at the URL above:
39 fsck.gfs2 to repair a filesystem 39 fsck.gfs2 to repair a filesystem
40 gfs2_grow to expand a filesystem online 40 gfs2_grow to expand a filesystem online
41 gfs2_jadd to add journals to a filesystem online 41 gfs2_jadd to add journals to a filesystem online
42 gfs2_tool to manipulate, examine and tune a filesystem 42 gfs2_tool to manipulate, examine and tune a filesystem
43 gfs2_quota to examine and change quota values in a filesystem 43 gfs2_quota to examine and change quota values in a filesystem
44 gfs2_convert to convert a gfs filesystem to gfs2 in-place 44 gfs2_convert to convert a gfs filesystem to gfs2 in-place
45 mount.gfs2 to help mount(8) mount a filesystem 45 mount.gfs2 to help mount(8) mount a filesystem
46 mkfs.gfs2 to make a filesystem 46 mkfs.gfs2 to make a filesystem
diff --git a/Documentation/filesystems/hpfs.txt b/Documentation/filesystems/hpfs.txt
index fa45c3baed98..74630bd504fb 100644
--- a/Documentation/filesystems/hpfs.txt
+++ b/Documentation/filesystems/hpfs.txt
@@ -103,7 +103,7 @@ to analyze or change OS2SYS.INI.
103Codepages 103Codepages
104 104
105HPFS can contain several uppercasing tables for several codepages and each 105HPFS can contain several uppercasing tables for several codepages and each
106file has a pointer to codepage it's name is in. However OS/2 was created in 106file has a pointer to codepage its name is in. However OS/2 was created in
107America where people don't care much about codepages and so multiple codepages 107America where people don't care much about codepages and so multiple codepages
108support is quite buggy. I have Czech OS/2 working in codepage 852 on my disk. 108support is quite buggy. I have Czech OS/2 working in codepage 852 on my disk.
109Once I booted English OS/2 working in cp 850 and I created a file on my 852 109Once I booted English OS/2 working in cp 850 and I created a file on my 852
diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt
new file mode 100644
index 000000000000..bca42c22a143
--- /dev/null
+++ b/Documentation/filesystems/logfs.txt
@@ -0,0 +1,241 @@
1
2The LogFS Flash Filesystem
3==========================
4
5Specification
6=============
7
8Superblocks
9-----------
10
11Two superblocks exist at the beginning and end of the filesystem.
12Each superblock is 256 Bytes large, with another 3840 Bytes reserved
13for future purposes, making a total of 4096 Bytes.
14
15Superblock locations may differ for MTD and block devices. On MTD the
16first non-bad block contains a superblock in the first 4096 Bytes and
17the last non-bad block contains a superblock in the last 4096 Bytes.
18On block devices, the first 4096 Bytes of the device contain the first
19superblock and the last aligned 4096 Byte-block contains the second
20superblock.
21
22For the most part, the superblocks can be considered read-only. They
23are written only to correct errors detected within the superblocks,
24move the journal and change the filesystem parameters through tunefs.
25As a result, the superblock does not contain any fields that require
26constant updates, like the amount of free space, etc.
27
28Segments
29--------
30
31The space in the device is split up into equal-sized segments.
32Segments are the primary write unit of LogFS. Within each segments,
33writes happen from front (low addresses) to back (high addresses. If
34only a partial segment has been written, the segment number, the
35current position within and optionally a write buffer are stored in
36the journal.
37
38Segments are erased as a whole. Therefore Garbage Collection may be
39required to completely free a segment before doing so.
40
41Journal
42--------
43
44The journal contains all global information about the filesystem that
45is subject to frequent change. At mount time, it has to be scanned
46for the most recent commit entry, which contains a list of pointers to
47all currently valid entries.
48
49Object Store
50------------
51
52All space except for the superblocks and journal is part of the object
53store. Each segment contains a segment header and a number of
54objects, each consisting of the object header and the payload.
55Objects are either inodes, directory entries (dentries), file data
56blocks or indirect blocks.
57
58Levels
59------
60
61Garbage collection (GC) may fail if all data is written
62indiscriminately. One requirement of GC is that data is separated
63roughly according to the distance between the tree root and the data.
64Effectively that means all file data is on level 0, indirect blocks
65are on levels 1, 2, 3 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks,
66respectively. Inode file data is on level 6 for the inodes and 7-11
67for indirect blocks.
68
69Each segment contains objects of a single level only. As a result,
70each level requires its own separate segment to be open for writing.
71
72Inode File
73----------
74
75All inodes are stored in a special file, the inode file. Single
76exception is the inode file's inode (master inode) which for obvious
77reasons is stored in the journal instead. Instead of data blocks, the
78leaf nodes of the inode files are inodes.
79
80Aliases
81-------
82
83Writes in LogFS are done by means of a wandering tree. A naïve
84implementation would require that for each write or a block, all
85parent blocks are written as well, since the block pointers have
86changed. Such an implementation would not be very efficient.
87
88In LogFS, the block pointer changes are cached in the journal by means
89of alias entries. Each alias consists of its logical address - inode
90number, block index, level and child number (index into block) - and
91the changed data. Any 8-byte word can be changes in this manner.
92
93Currently aliases are used for block pointers, file size, file used
94bytes and the height of an inodes indirect tree.
95
96Segment Aliases
97---------------
98
99Related to regular aliases, these are used to handle bad blocks.
100Initially, bad blocks are handled by moving the affected segment
101content to a spare segment and noting this move in the journal with a
102segment alias, a simple (to, from) tupel. GC will later empty this
103segment and the alias can be removed again. This is used on MTD only.
104
105Vim
106---
107
108By cleverly predicting the life time of data, it is possible to
109separate long-living data from short-living data and thereby reduce
110the GC overhead later. Each type of distinc life expectency (vim) can
111have a separate segment open for writing. Each (level, vim) tupel can
112be open just once. If an open segment with unknown vim is encountered
113at mount time, it is closed and ignored henceforth.
114
115Indirect Tree
116-------------
117
118Inodes in LogFS are similar to FFS-style filesystems with direct and
119indirect block pointers. One difference is that LogFS uses a single
120indirect pointer that can be either a 1x, 2x, etc. indirect pointer.
121A height field in the inode defines the height of the indirect tree
122and thereby the indirection of the pointer.
123
124Another difference is the addressing of indirect blocks. In LogFS,
125the first 16 pointers in the first indirect block are left empty,
126corresponding to the 16 direct pointers in the inode. In ext2 (maybe
127others as well) the first pointer in the first indirect block
128corresponds to logical block 12, skipping the 12 direct pointers.
129So where ext2 is using arithmetic to better utilize space, LogFS keeps
130arithmetic simple and uses compression to save space.
131
132Compression
133-----------
134
135Both file data and metadata can be compressed. Compression for file
136data can be enabled with chattr +c and disabled with chattr -c. Doing
137so has no effect on existing data, but new data will be stored
138accordingly. New inodes will inherit the compression flag of the
139parent directory.
140
141Metadata is always compressed. However, the space accounting ignores
142this and charges for the uncompressed size. Failing to do so could
143result in GC failures when, after moving some data, indirect blocks
144compress worse than previously. Even on a 100% full medium, GC may
145not consume any extra space, so the compression gains are lost space
146to the user.
147
148However, they are not lost space to the filesystem internals. By
149cheating the user for those bytes, the filesystem gained some slack
150space and GC will run less often and faster.
151
152Garbage Collection and Wear Leveling
153------------------------------------
154
155Garbage collection is invoked whenever the number of free segments
156falls below a threshold. The best (known) candidate is picked based
157on the least amount of valid data contained in the segment. All
158remaining valid data is copied elsewhere, thereby invalidating it.
159
160The GC code also checks for aliases and writes then back if their
161number gets too large.
162
163Wear leveling is done by occasionally picking a suboptimal segment for
164garbage collection. If a stale segments erase count is significantly
165lower than the active segments' erase counts, it will be picked. Wear
166leveling is rate limited, so it will never monopolize the device for
167more than one segment worth at a time.
168
169Values for "occasionally", "significantly lower" are compile time
170constants.
171
172Hashed directories
173------------------
174
175To satisfy efficient lookup(), directory entries are hashed and
176located based on the hash. In order to both support large directories
177and not be overly inefficient for small directories, several hash
178tables of increasing size are used. For each table, the hash value
179modulo the table size gives the table index.
180
181Tables sizes are chosen to limit the number of indirect blocks with a
182fully populated table to 0, 1, 2 or 3 respectively. So the first
183table contains 16 entries, the second 512-16, etc.
184
185The last table is special in several ways. First its size depends on
186the effective 32bit limit on telldir/seekdir cookies. Since logfs
187uses the upper half of the address space for indirect blocks, the size
188is limited to 2^31. Secondly the table contains hash buckets with 16
189entries each.
190
191Using single-entry buckets would result in birthday "attacks". At
192just 2^16 used entries, hash collisions would be likely (P >= 0.5).
193My math skills are insufficient to do the combinatorics for the 17x
194collisions necessary to overflow a bucket, but testing showed that in
19510,000 runs the lowest directory fill before a bucket overflow was
196188,057,130 entries with an average of 315,149,915 entries. So for
197directory sizes of up to a million, bucket overflows should be
198virtually impossible under normal circumstances.
199
200With carefully chosen filenames, it is obviously possible to cause an
201overflow with just 21 entries (4 higher tables + 16 entries + 1). So
202there may be a security concern if a malicious user has write access
203to a directory.
204
205Open For Discussion
206===================
207
208Device Address Space
209--------------------
210
211A device address space is used for caching. Both block devices and
212MTD provide functions to either read a single page or write a segment.
213Partial segments may be written for data integrity, but where possible
214complete segments are written for performance on simple block device
215flash media.
216
217Meta Inodes
218-----------
219
220Inodes are stored in the inode file, which is just a regular file for
221most purposes. At umount time, however, the inode file needs to
222remain open until all dirty inodes are written. So
223generic_shutdown_super() may not close this inode, but shouldn't
224complain about remaining inodes due to the inode file either. Same
225goes for mapping inode of the device address space.
226
227Currently logfs uses a hack that essentially copies part of fs/inode.c
228code over. A general solution would be preferred.
229
230Indirect block mapping
231----------------------
232
233With compression, the block device (or mapping inode) cannot be used
234to cache indirect blocks. Some other place is required. Currently
235logfs uses the top half of each inode's address space. The low 8TB
236(on 32bit) are filled with file data, the high 8TB are used for
237indirect blocks.
238
239One problem is that 16TB files created on 64bit systems actually have
240data in the top 8TB. But files >16TB would cause problems anyway, so
241only the limit has changed.
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt
index 1bd0d0c05171..04884914a1c8 100644
--- a/Documentation/filesystems/nfs/nfs41-server.txt
+++ b/Documentation/filesystems/nfs/nfs41-server.txt
@@ -17,8 +17,7 @@ kernels must turn 4.1 on or off *before* turning support for version 4
17on or off; rpc.nfsd does this correctly.) 17on or off; rpc.nfsd does this correctly.)
18 18
19The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based 19The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based
20on the latest NFSv4.1 Internet Draft: 20on RFC 5661.
21http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29
22 21
23From the many new features in NFSv4.1 the current implementation 22From the many new features in NFSv4.1 the current implementation
24focuses on the mandatory-to-implement NFSv4.1 Sessions, providing 23focuses on the mandatory-to-implement NFSv4.1 Sessions, providing
@@ -44,7 +43,7 @@ interoperability problems with future clients. Known issues:
44 trunking, but this is a mandatory feature, and its use is 43 trunking, but this is a mandatory feature, and its use is
45 recommended to clients in a number of places. (E.g. to ensure 44 recommended to clients in a number of places. (E.g. to ensure
46 timely renewal in case an existing connection's retry timeouts 45 timely renewal in case an existing connection's retry timeouts
47 have gotten too long; see section 8.3 of the draft.) 46 have gotten too long; see section 8.3 of the RFC.)
48 Therefore, lack of this feature may cause future clients to 47 Therefore, lack of this feature may cause future clients to
49 fail. 48 fail.
50 - Incomplete backchannel support: incomplete backchannel gss 49 - Incomplete backchannel support: incomplete backchannel gss
@@ -138,7 +137,7 @@ NS*| OPENATTR | OPT | | Section 18.17 |
138 | READ | REQ | | Section 18.22 | 137 | READ | REQ | | Section 18.22 |
139 | READDIR | REQ | | Section 18.23 | 138 | READDIR | REQ | | Section 18.23 |
140 | READLINK | OPT | | Section 18.24 | 139 | READLINK | OPT | | Section 18.24 |
141NS | RECLAIM_COMPLETE | REQ | | Section 18.51 | 140 | RECLAIM_COMPLETE | REQ | | Section 18.51 |
142 | RELEASE_LOCKOWNER | MNI | | N/A | 141 | RELEASE_LOCKOWNER | MNI | | N/A |
143 | REMOVE | REQ | | Section 18.25 | 142 | REMOVE | REQ | | Section 18.25 |
144 | RENAME | REQ | | Section 18.26 | 143 | RENAME | REQ | | Section 18.26 |
diff --git a/Documentation/filesystems/nfs/rpc-cache.txt b/Documentation/filesystems/nfs/rpc-cache.txt
index 8a382bea6808..ebcaaee21616 100644
--- a/Documentation/filesystems/nfs/rpc-cache.txt
+++ b/Documentation/filesystems/nfs/rpc-cache.txt
@@ -185,7 +185,7 @@ failed lookup meant a definite 'no'.
185request/response format 185request/response format
186----------------------- 186-----------------------
187 187
188While each cache is free to use it's own format for requests 188While each cache is free to use its own format for requests
189and responses over channel, the following is recommended as 189and responses over channel, the following is recommended as
190appropriate and support routines are available to help: 190appropriate and support routines are available to help:
191Each request or response record should be printable ASCII 191Each request or response record should be printable ASCII
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index 839efd8a8a8c..d3e7673995eb 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -50,8 +50,8 @@ NILFS2 supports the following mount options:
50(*) == default 50(*) == default
51 51
52nobarrier Disables barriers. 52nobarrier Disables barriers.
53errors=continue(*) Keep going on a filesystem error. 53errors=continue Keep going on a filesystem error.
54errors=remount-ro Remount the filesystem read-only on an error. 54errors=remount-ro(*) Remount the filesystem read-only on an error.
55errors=panic Panic and halt the machine if an error occurs. 55errors=panic Panic and halt the machine if an error occurs.
56cp=n Specify the checkpoint-number of the snapshot to be 56cp=n Specify the checkpoint-number of the snapshot to be
57 mounted. Checkpoints and snapshots are listed by lscp 57 mounted. Checkpoints and snapshots are listed by lscp
@@ -74,6 +74,9 @@ norecovery Disable recovery of the filesystem on mount.
74 This disables every write access on the device for 74 This disables every write access on the device for
75 read-only mounts or snapshots. This option will fail 75 read-only mounts or snapshots. This option will fail
76 for r/w mounts on an unclean volume. 76 for r/w mounts on an unclean volume.
77discard Issue discard/TRIM commands to the underlying block
78 device when blocks are freed. This is useful for SSD
79 devices and sparse/thinly-provisioned LUNs.
77 80
78NILFS2 usage 81NILFS2 usage
79============ 82============
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
index c58b9f5ba002..1f7ae144f6d8 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -80,3 +80,10 @@ user_xattr (*) Enables Extended User Attributes.
80nouser_xattr Disables Extended User Attributes. 80nouser_xattr Disables Extended User Attributes.
81acl Enables POSIX Access Control Lists support. 81acl Enables POSIX Access Control Lists support.
82noacl (*) Disables POSIX Access Control Lists support. 82noacl (*) Disables POSIX Access Control Lists support.
83resv_level=2 (*) Set how agressive allocation reservations will be.
84 Valid values are between 0 (reservations off) to 8
85 (maximum space for reservations).
86dir_resv_level= (*) By default, directory reservations will scale with file
87 reservations - users should rarely need to change this
88 value. If allocation reservations are turned off, this
89 option will have no effect.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 0d07513a67a6..9fb6cbe70bde 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -164,6 +164,7 @@ read the file /proc/PID/status:
164 VmExe: 68 kB 164 VmExe: 68 kB
165 VmLib: 1412 kB 165 VmLib: 1412 kB
166 VmPTE: 20 kb 166 VmPTE: 20 kb
167 VmSwap: 0 kB
167 Threads: 1 168 Threads: 1
168 SigQ: 0/28578 169 SigQ: 0/28578
169 SigPnd: 0000000000000000 170 SigPnd: 0000000000000000
@@ -188,7 +189,13 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file
188contains details information about the process itself. Its fields are 189contains details information about the process itself. Its fields are
189explained in Table 1-4. 190explained in Table 1-4.
190 191
191Table 1-2: Contents of the statm files (as of 2.6.30-rc7) 192(for SMP CONFIG users)
193For making accounting scalable, RSS related information are handled in
194asynchronous manner and the vaule may not be very precise. To see a precise
195snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table.
196It's slow but very precise.
197
198Table 1-2: Contents of the status files (as of 2.6.30-rc7)
192.............................................................................. 199..............................................................................
193 Field Content 200 Field Content
194 Name filename of the executable 201 Name filename of the executable
@@ -213,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
213 VmExe size of text segment 220 VmExe size of text segment
214 VmLib size of shared library code 221 VmLib size of shared library code
215 VmPTE size of page table entries 222 VmPTE size of page table entries
223 VmSwap size of swap usage (the number of referred swapents)
216 Threads number of threads 224 Threads number of threads
217 SigQ number of signals queued/max. number for queue 225 SigQ number of signals queued/max. number for queue
218 SigPnd bitmap of pending signals for the thread 226 SigPnd bitmap of pending signals for the thread
@@ -297,7 +305,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
297 cgtime guest time of the task children in jiffies 305 cgtime guest time of the task children in jiffies
298.............................................................................. 306..............................................................................
299 307
300The /proc/PID/map file containing the currently mapped memory regions and 308The /proc/PID/maps file containing the currently mapped memory regions and
301their access permissions. 309their access permissions.
302 310
303The format is: 311The format is:
@@ -308,7 +316,7 @@ address perms offset dev inode pathname
30808049000-0804a000 rw-p 00001000 03:00 8312 /opt/test 31608049000-0804a000 rw-p 00001000 03:00 8312 /opt/test
3090804a000-0806b000 rw-p 00000000 00:00 0 [heap] 3170804a000-0806b000 rw-p 00000000 00:00 0 [heap]
310a7cb1000-a7cb2000 ---p 00000000 00:00 0 318a7cb1000-a7cb2000 ---p 00000000 00:00 0
311a7cb2000-a7eb2000 rw-p 00000000 00:00 0 [threadstack:001ff4b4] 319a7cb2000-a7eb2000 rw-p 00000000 00:00 0
312a7eb2000-a7eb3000 ---p 00000000 00:00 0 320a7eb2000-a7eb3000 ---p 00000000 00:00 0
313a7eb3000-a7ed5000 rw-p 00000000 00:00 0 321a7eb3000-a7ed5000 rw-p 00000000 00:00 0
314a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 322a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6
@@ -344,7 +352,6 @@ is not associated with a file:
344 [stack] = the stack of the main process 352 [stack] = the stack of the main process
345 [vdso] = the "virtual dynamic shared object", 353 [vdso] = the "virtual dynamic shared object",
346 the kernel system call handler 354 the kernel system call handler
347 [threadstack:xxxxxxxx] = the stack of the thread, xxxxxxxx is the stack size
348 355
349 or if empty, the mapping is anonymous. 356 or if empty, the mapping is anonymous.
350 357
@@ -430,6 +437,7 @@ Table 1-5: Kernel info in /proc
430 modules List of loaded modules 437 modules List of loaded modules
431 mounts Mounted filesystems 438 mounts Mounted filesystems
432 net Networking info (see text) 439 net Networking info (see text)
440 pagetypeinfo Additional page allocator information (see text) (2.5)
433 partitions Table of partitions known to the system 441 partitions Table of partitions known to the system
434 pci Deprecated info of PCI bus (new way -> /proc/bus/pci/, 442 pci Deprecated info of PCI bus (new way -> /proc/bus/pci/,
435 decoupled by lspci (2.4) 443 decoupled by lspci (2.4)
@@ -557,6 +565,10 @@ The default_smp_affinity mask applies to all non-active IRQs, which are the
557IRQs which have not yet been allocated/activated, and hence which lack a 565IRQs which have not yet been allocated/activated, and hence which lack a
558/proc/irq/[0-9]* directory. 566/proc/irq/[0-9]* directory.
559 567
568The node file on an SMP system shows the node to which the device using the IRQ
569reports itself as being attached. This hardware locality information does not
570include information about any possible driver locality preference.
571
560prof_cpu_mask specifies which CPUs are to be profiled by the system wide 572prof_cpu_mask specifies which CPUs are to be profiled by the system wide
561profiler. Default value is ffffffff (all cpus). 573profiler. Default value is ffffffff (all cpus).
562 574
@@ -584,7 +596,7 @@ Node 0, zone DMA 0 4 5 4 4 3 ...
584Node 0, zone Normal 1 0 0 1 101 8 ... 596Node 0, zone Normal 1 0 0 1 101 8 ...
585Node 0, zone HighMem 2 0 0 1 1 0 ... 597Node 0, zone HighMem 2 0 0 1 1 0 ...
586 598
587Memory fragmentation is a problem under some workloads, and buddyinfo is a 599External fragmentation is a problem under some workloads, and buddyinfo is a
588useful tool for helping diagnose these problems. Buddyinfo will give you a 600useful tool for helping diagnose these problems. Buddyinfo will give you a
589clue as to how big an area you can safely allocate, or why a previous 601clue as to how big an area you can safely allocate, or why a previous
590allocation failed. 602allocation failed.
@@ -594,6 +606,48 @@ available. In this case, there are 0 chunks of 2^0*PAGE_SIZE available in
594ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE 606ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE
595available in ZONE_NORMAL, etc... 607available in ZONE_NORMAL, etc...
596 608
609More information relevant to external fragmentation can be found in
610pagetypeinfo.
611
612> cat /proc/pagetypeinfo
613Page block order: 9
614Pages per block: 512
615
616Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
617Node 0, zone DMA, type Unmovable 0 0 0 1 1 1 1 1 1 1 0
618Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0
619Node 0, zone DMA, type Movable 1 1 2 1 2 1 1 0 1 0 2
620Node 0, zone DMA, type Reserve 0 0 0 0 0 0 0 0 0 1 0
621Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0
622Node 0, zone DMA32, type Unmovable 103 54 77 1 1 1 11 8 7 1 9
623Node 0, zone DMA32, type Reclaimable 0 0 2 1 0 0 0 0 1 0 0
624Node 0, zone DMA32, type Movable 169 152 113 91 77 54 39 13 6 1 452
625Node 0, zone DMA32, type Reserve 1 2 2 2 2 0 1 1 1 1 0
626Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0
627
628Number of blocks type Unmovable Reclaimable Movable Reserve Isolate
629Node 0, zone DMA 2 0 5 1 0
630Node 0, zone DMA32 41 6 967 2 0
631
632Fragmentation avoidance in the kernel works by grouping pages of different
633migrate types into the same contiguous regions of memory called page blocks.
634A page block is typically the size of the default hugepage size e.g. 2MB on
635X86-64. By keeping pages grouped based on their ability to move, the kernel
636can reclaim pages within a page block to satisfy a high-order allocation.
637
638The pagetypinfo begins with information on the size of a page block. It
639then gives the same type of information as buddyinfo except broken down
640by migrate-type and finishes with details on how many page blocks of each
641type exist.
642
643If min_free_kbytes has been tuned correctly (recommendations made by hugeadm
644from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can
645make an estimate of the likely number of huge pages that can be allocated
646at a given point in time. All the "Movable" blocks should be allocatable
647unless memory has been mlock()'d. Some of the Reclaimable blocks should
648also be allocatable although a lot of filesystem metadata may have to be
649reclaimed to achieve this.
650
597.............................................................................. 651..............................................................................
598 652
599meminfo: 653meminfo:
@@ -914,7 +968,7 @@ your system and how much traffic was routed over those devices:
914 ...] 1375103 17405 0 0 0 0 0 0 968 ...] 1375103 17405 0 0 0 0 0 0
915 ...] 1703981 5535 0 0 0 3 0 0 969 ...] 1703981 5535 0 0 0 3 0 0
916 970
917In addition, each Channel Bond interface has it's own directory. For 971In addition, each Channel Bond interface has its own directory. For
918example, the bond0 device will have a directory called /proc/net/bond0/. 972example, the bond0 device will have a directory called /proc/net/bond0/.
919It will contain information that is specific to that bond, such as the 973It will contain information that is specific to that bond, such as the
920current slaves of the bond, the link status of the slaves, and how 974current slaves of the bond, the link status of the slaves, and how
@@ -1311,7 +1365,7 @@ been accounted as having caused 1MB of write.
1311In other words: The number of bytes which this process caused to not happen, 1365In other words: The number of bytes which this process caused to not happen,
1312by truncating pagecache. A task can cause "negative" IO too. If this task 1366by truncating pagecache. A task can cause "negative" IO too. If this task
1313truncates some dirty pagecache, some IO which another task has been accounted 1367truncates some dirty pagecache, some IO which another task has been accounted
1314for (in it's write_bytes) will not be happening. We _could_ just subtract that 1368for (in its write_bytes) will not be happening. We _could_ just subtract that
1315from the truncating task's write_bytes, but there is information loss in doing 1369from the truncating task's write_bytes, but there is information loss in doing
1316that. 1370that.
1317 1371
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt
index 23a181074f94..fc0e39af43c3 100644
--- a/Documentation/filesystems/sharedsubtree.txt
+++ b/Documentation/filesystems/sharedsubtree.txt
@@ -837,6 +837,9 @@ replicas continue to be exactly same.
837 individual lists does not affect propagation or the way propagation 837 individual lists does not affect propagation or the way propagation
838 tree is modified by operations. 838 tree is modified by operations.
839 839
840 All vfsmounts in a peer group have the same ->mnt_master. If it is
841 non-NULL, they form a contiguous (ordered) segment of slave list.
842
840 A example propagation tree looks as shown in the figure below. 843 A example propagation tree looks as shown in the figure below.
841 [ NOTE: Though it looks like a forest, if we consider all the shared 844 [ NOTE: Though it looks like a forest, if we consider all the shared
842 mounts as a conceptual entity called 'pnode', it becomes a tree] 845 mounts as a conceptual entity called 'pnode', it becomes a tree]
@@ -874,8 +877,19 @@ replicas continue to be exactly same.
874 877
875 NOTE: The propagation tree is orthogonal to the mount tree. 878 NOTE: The propagation tree is orthogonal to the mount tree.
876 879
8808B Locking:
881
882 ->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected
883 by namespace_sem (exclusive for modifications, shared for reading).
884
885 Normally we have ->mnt_flags modifications serialized by vfsmount_lock.
886 There are two exceptions: do_add_mount() and clone_mnt().
887 The former modifies a vfsmount that has not been visible in any shared
888 data structures yet.
889 The latter holds namespace_sem and the only references to vfsmount
890 are in lists that can't be traversed without namespace_sem.
877 891
8788B Algorithm: 8928C Algorithm:
879 893
880 The crux of the implementation resides in rbind/move operation. 894 The crux of the implementation resides in rbind/move operation.
881 895
diff --git a/Documentation/filesystems/smbfs.txt b/Documentation/filesystems/smbfs.txt
index f673ef0de0f7..194fb0decd2c 100644
--- a/Documentation/filesystems/smbfs.txt
+++ b/Documentation/filesystems/smbfs.txt
@@ -3,6 +3,6 @@ protocol used by Windows for Workgroups, Windows 95 and Windows NT.
3Smbfs was inspired by Samba, the program written by Andrew Tridgell 3Smbfs was inspired by Samba, the program written by Andrew Tridgell
4that turns any Unix host into a file server for DOS or Windows clients. 4that turns any Unix host into a file server for DOS or Windows clients.
5 5
6Smbfs is a SMB client, but uses parts of samba for it's operation. For 6Smbfs is a SMB client, but uses parts of samba for its operation. For
7more info on samba, including documentation, please go to 7more info on samba, including documentation, please go to
8http://www.samba.org/ and then on to your nearest mirror. 8http://www.samba.org/ and then on to your nearest mirror.
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt
index b324c033035a..203f7202cc9e 100644
--- a/Documentation/filesystems/squashfs.txt
+++ b/Documentation/filesystems/squashfs.txt
@@ -38,7 +38,8 @@ Hard link support: yes no
38Real inode numbers: yes no 38Real inode numbers: yes no
3932-bit uids/gids: yes no 3932-bit uids/gids: yes no
40File creation time: yes no 40File creation time: yes no
41Xattr and ACL support: no no 41Xattr support: yes no
42ACL support: no no
42 43
43Squashfs compresses data, inodes and directories. In addition, inode and 44Squashfs compresses data, inodes and directories. In addition, inode and
44directory data are highly compacted, and packed on byte boundaries. Each 45directory data are highly compacted, and packed on byte boundaries. Each
@@ -58,7 +59,7 @@ obtained from this site also.
583. SQUASHFS FILESYSTEM DESIGN 593. SQUASHFS FILESYSTEM DESIGN
59----------------------------- 60-----------------------------
60 61
61A squashfs filesystem consists of seven parts, packed together on a byte 62A squashfs filesystem consists of a maximum of eight parts, packed together on a byte
62alignment: 63alignment:
63 64
64 --------------- 65 ---------------
@@ -80,6 +81,9 @@ alignment:
80 |---------------| 81 |---------------|
81 | uid/gid | 82 | uid/gid |
82 | lookup table | 83 | lookup table |
84 |---------------|
85 | xattr |
86 | table |
83 --------------- 87 ---------------
84 88
85Compressed data blocks are written to the filesystem as files are read from 89Compressed data blocks are written to the filesystem as files are read from
@@ -192,6 +196,26 @@ This table is stored compressed into metadata blocks. A second index table is
192used to locate these. This second index table for speed of access (and because 196used to locate these. This second index table for speed of access (and because
193it is small) is read at mount time and cached in memory. 197it is small) is read at mount time and cached in memory.
194 198
1993.7 Xattr table
200---------------
201
202The xattr table contains extended attributes for each inode. The xattrs
203for each inode are stored in a list, each list entry containing a type,
204name and value field. The type field encodes the xattr prefix
205("user.", "trusted." etc) and it also encodes how the name/value fields
206should be interpreted. Currently the type indicates whether the value
207is stored inline (in which case the value field contains the xattr value),
208or if it is stored out of line (in which case the value field stores a
209reference to where the actual value is stored). This allows large values
210to be stored out of line improving scanning and lookup performance and it
211also allows values to be de-duplicated, the value being stored once, and
212all other occurences holding an out of line reference to that value.
213
214The xattr lists are packed into compressed 8K metadata blocks.
215To reduce overhead in inodes, rather than storing the on-disk
216location of the xattr list inside each inode, a 32-bit xattr id
217is stored. This xattr id is mapped into the location of the xattr
218list using a second xattr id lookup table.
195 219
1964. TODOS AND OUTSTANDING ISSUES 2204. TODOS AND OUTSTANDING ISSUES
197------------------------------- 221-------------------------------
@@ -199,9 +223,7 @@ it is small) is read at mount time and cached in memory.
1994.1 Todo list 2234.1 Todo list
200------------- 224-------------
201 225
202Implement Xattr and ACL support. The Squashfs 4.0 filesystem layout has hooks 226Implement ACL support.
203for these but the code has not been written. Once the code has been written
204the existing layout should not require modification.
205 227
2064.2 Squashfs internal cache 2284.2 Squashfs internal cache
207--------------------------- 229---------------------------
diff --git a/Documentation/filesystems/sysfs-tagging.txt b/Documentation/filesystems/sysfs-tagging.txt
new file mode 100644
index 000000000000..caaaf1266d8f
--- /dev/null
+++ b/Documentation/filesystems/sysfs-tagging.txt
@@ -0,0 +1,42 @@
1Sysfs tagging
2-------------
3
4(Taken almost verbatim from Eric Biederman's netns tagging patch
5commit msg)
6
7The problem. Network devices show up in sysfs and with the network
8namespace active multiple devices with the same name can show up in
9the same directory, ouch!
10
11To avoid that problem and allow existing applications in network
12namespaces to see the same interface that is currently presented in
13sysfs, sysfs now has tagging directory support.
14
15By using the network namespace pointers as tags to separate out the
16the sysfs directory entries we ensure that we don't have conflicts
17in the directories and applications only see a limited set of
18the network devices.
19
20Each sysfs directory entry may be tagged with zero or one
21namespaces. A sysfs_dirent is augmented with a void *s_ns. If a
22directory entry is tagged, then sysfs_dirent->s_flags will have a
23flag between KOBJ_NS_TYPE_NONE and KOBJ_NS_TYPES, and s_ns will
24point to the namespace to which it belongs.
25
26Each sysfs superblock's sysfs_super_info contains an array void
27*ns[KOBJ_NS_TYPES]. When a a task in a tagging namespace
28kobj_nstype first mounts sysfs, a new superblock is created. It
29will be differentiated from other sysfs mounts by having its
30s_fs_info->ns[kobj_nstype] set to the new namespace. Note that
31through bind mounting and mounts propagation, a task can easily view
32the contents of other namespaces' sysfs mounts. Therefore, when a
33namespace exits, it will call kobj_ns_exit() to invalidate any
34sysfs_dirent->s_ns pointers pointing to it.
35
36Users of this interface:
37- define a type in the kobj_ns_type enumeration.
38- call kobj_ns_type_register() with its kobj_ns_type_operations which has
39 - current_ns() which returns current's namespace
40 - netlink_ns() which returns a socket's namespace
41 - initial_ns() which returns the initial namesapce
42- call kobj_ns_exit() when an individual tag is no longer valid
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index 3015da0c6b2a..98ef55124158 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -82,21 +82,31 @@ tmpfs has a mount option to set the NUMA memory allocation policy for
82all files in that instance (if CONFIG_NUMA is enabled) - which can be 82all files in that instance (if CONFIG_NUMA is enabled) - which can be
83adjusted on the fly via 'mount -o remount ...' 83adjusted on the fly via 'mount -o remount ...'
84 84
85mpol=default prefers to allocate memory from the local node 85mpol=default use the process allocation policy
86 (see set_mempolicy(2))
86mpol=prefer:Node prefers to allocate memory from the given Node 87mpol=prefer:Node prefers to allocate memory from the given Node
87mpol=bind:NodeList allocates memory only from nodes in NodeList 88mpol=bind:NodeList allocates memory only from nodes in NodeList
88mpol=interleave prefers to allocate from each node in turn 89mpol=interleave prefers to allocate from each node in turn
89mpol=interleave:NodeList allocates from each node of NodeList in turn 90mpol=interleave:NodeList allocates from each node of NodeList in turn
91mpol=local prefers to allocate memory from the local node
90 92
91NodeList format is a comma-separated list of decimal numbers and ranges, 93NodeList format is a comma-separated list of decimal numbers and ranges,
92a range being two hyphen-separated decimal numbers, the smallest and 94a range being two hyphen-separated decimal numbers, the smallest and
93largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15 95largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15
94 96
97A memory policy with a valid NodeList will be saved, as specified, for
98use at file creation time. When a task allocates a file in the file
99system, the mount option memory policy will be applied with a NodeList,
100if any, modified by the calling task's cpuset constraints
101[See Documentation/cgroups/cpusets.txt] and any optional flags, listed
102below. If the resulting NodeLists is the empty set, the effective memory
103policy for the file will revert to "default" policy.
104
95NUMA memory allocation policies have optional flags that can be used in 105NUMA memory allocation policies have optional flags that can be used in
96conjunction with their modes. These optional flags can be specified 106conjunction with their modes. These optional flags can be specified
97when tmpfs is mounted by appending them to the mode before the NodeList. 107when tmpfs is mounted by appending them to the mode before the NodeList.
98See Documentation/vm/numa_memory_policy.txt for a list of all available 108See Documentation/vm/numa_memory_policy.txt for a list of all available
99memory allocation policy mode flags. 109memory allocation policy mode flags and their effect on memory policy.
100 110
101 =static is equivalent to MPOL_F_STATIC_NODES 111 =static is equivalent to MPOL_F_STATIC_NODES
102 =relative is equivalent to MPOL_F_RELATIVE_NODES 112 =relative is equivalent to MPOL_F_RELATIVE_NODES
@@ -134,3 +144,5 @@ Author:
134 Christoph Rohland <cr@sap.com>, 1.12.01 144 Christoph Rohland <cr@sap.com>, 1.12.01
135Updated: 145Updated:
136 Hugh Dickins, 4 June 2007 146 Hugh Dickins, 4 June 2007
147Updated:
148 KOSAKI Motohiro, 16 Mar 2010
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 3de2f32edd90..b66858538df5 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -72,7 +72,7 @@ structure (this is the kernel-side implementation of file
72descriptors). The freshly allocated file structure is initialized with 72descriptors). The freshly allocated file structure is initialized with
73a pointer to the dentry and a set of file operation member functions. 73a pointer to the dentry and a set of file operation member functions.
74These are taken from the inode data. The open() file method is then 74These are taken from the inode data. The open() file method is then
75called so the specific filesystem implementation can do it's work. You 75called so the specific filesystem implementation can do its work. You
76can see that this is another switch performed by the VFS. The file 76can see that this is another switch performed by the VFS. The file
77structure is placed into the file descriptor table for the process. 77structure is placed into the file descriptor table for the process.
78 78
diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt
new file mode 100644
index 000000000000..d8119e9d2d60
--- /dev/null
+++ b/Documentation/filesystems/xfs-delayed-logging-design.txt
@@ -0,0 +1,816 @@
1XFS Delayed Logging Design
2--------------------------
3
4Introduction to Re-logging in XFS
5---------------------------------
6
7XFS logging is a combination of logical and physical logging. Some objects,
8such as inodes and dquots, are logged in logical format where the details
9logged are made up of the changes to in-core structures rather than on-disk
10structures. Other objects - typically buffers - have their physical changes
11logged. The reason for these differences is to reduce the amount of log space
12required for objects that are frequently logged. Some parts of inodes are more
13frequently logged than others, and inodes are typically more frequently logged
14than any other object (except maybe the superblock buffer) so keeping the
15amount of metadata logged low is of prime importance.
16
17The reason that this is such a concern is that XFS allows multiple separate
18modifications to a single object to be carried in the log at any given time.
19This allows the log to avoid needing to flush each change to disk before
20recording a new change to the object. XFS does this via a method called
21"re-logging". Conceptually, this is quite simple - all it requires is that any
22new change to the object is recorded with a *new copy* of all the existing
23changes in the new transaction that is written to the log.
24
25That is, if we have a sequence of changes A through to F, and the object was
26written to disk after change D, we would see in the log the following series
27of transactions, their contents and the log sequence number (LSN) of the
28transaction:
29
30 Transaction Contents LSN
31 A A X
32 B A+B X+n
33 C A+B+C X+n+m
34 D A+B+C+D X+n+m+o
35 <object written to disk>
36 E E Y (> X+n+m+o)
37 F E+F YÙ+p
38
39In other words, each time an object is relogged, the new transaction contains
40the aggregation of all the previous changes currently held only in the log.
41
42This relogging technique also allows objects to be moved forward in the log so
43that an object being relogged does not prevent the tail of the log from ever
44moving forward. This can be seen in the table above by the changing
45(increasing) LSN of each subsquent transaction - the LSN is effectively a
46direct encoding of the location in the log of the transaction.
47
48This relogging is also used to implement long-running, multiple-commit
49transactions. These transaction are known as rolling transactions, and require
50a special log reservation known as a permanent transaction reservation. A
51typical example of a rolling transaction is the removal of extents from an
52inode which can only be done at a rate of two extents per transaction because
53of reservation size limitations. Hence a rolling extent removal transaction
54keeps relogging the inode and btree buffers as they get modified in each
55removal operation. This keeps them moving forward in the log as the operation
56progresses, ensuring that current operation never gets blocked by itself if the
57log wraps around.
58
59Hence it can be seen that the relogging operation is fundamental to the correct
60working of the XFS journalling subsystem. From the above description, most
61people should be able to see why the XFS metadata operations writes so much to
62the log - repeated operations to the same objects write the same changes to
63the log over and over again. Worse is the fact that objects tend to get
64dirtier as they get relogged, so each subsequent transaction is writing more
65metadata into the log.
66
67Another feature of the XFS transaction subsystem is that most transactions are
68asynchronous. That is, they don't commit to disk until either a log buffer is
69filled (a log buffer can hold multiple transactions) or a synchronous operation
70forces the log buffers holding the transactions to disk. This means that XFS is
71doing aggregation of transactions in memory - batching them, if you like - to
72minimise the impact of the log IO on transaction throughput.
73
74The limitation on asynchronous transaction throughput is the number and size of
75log buffers made available by the log manager. By default there are 8 log
76buffers available and the size of each is 32kB - the size can be increased up
77to 256kB by use of a mount option.
78
79Effectively, this gives us the maximum bound of outstanding metadata changes
80that can be made to the filesystem at any point in time - if all the log
81buffers are full and under IO, then no more transactions can be committed until
82the current batch completes. It is now common for a single current CPU core to
83be to able to issue enough transactions to keep the log buffers full and under
84IO permanently. Hence the XFS journalling subsystem can be considered to be IO
85bound.
86
87Delayed Logging: Concepts
88-------------------------
89
90The key thing to note about the asynchronous logging combined with the
91relogging technique XFS uses is that we can be relogging changed objects
92multiple times before they are committed to disk in the log buffers. If we
93return to the previous relogging example, it is entirely possible that
94transactions A through D are committed to disk in the same log buffer.
95
96That is, a single log buffer may contain multiple copies of the same object,
97but only one of those copies needs to be there - the last one "D", as it
98contains all the changes from the previous changes. In other words, we have one
99necessary copy in the log buffer, and three stale copies that are simply
100wasting space. When we are doing repeated operations on the same set of
101objects, these "stale objects" can be over 90% of the space used in the log
102buffers. It is clear that reducing the number of stale objects written to the
103log would greatly reduce the amount of metadata we write to the log, and this
104is the fundamental goal of delayed logging.
105
106From a conceptual point of view, XFS is already doing relogging in memory (where
107memory == log buffer), only it is doing it extremely inefficiently. It is using
108logical to physical formatting to do the relogging because there is no
109infrastructure to keep track of logical changes in memory prior to physically
110formatting the changes in a transaction to the log buffer. Hence we cannot avoid
111accumulating stale objects in the log buffers.
112
113Delayed logging is the name we've given to keeping and tracking transactional
114changes to objects in memory outside the log buffer infrastructure. Because of
115the relogging concept fundamental to the XFS journalling subsystem, this is
116actually relatively easy to do - all the changes to logged items are already
117tracked in the current infrastructure. The big problem is how to accumulate
118them and get them to the log in a consistent, recoverable manner.
119Describing the problems and how they have been solved is the focus of this
120document.
121
122One of the key changes that delayed logging makes to the operation of the
123journalling subsystem is that it disassociates the amount of outstanding
124metadata changes from the size and number of log buffers available. In other
125words, instead of there only being a maximum of 2MB of transaction changes not
126written to the log at any point in time, there may be a much greater amount
127being accumulated in memory. Hence the potential for loss of metadata on a
128crash is much greater than for the existing logging mechanism.
129
130It should be noted that this does not change the guarantee that log recovery
131will result in a consistent filesystem. What it does mean is that as far as the
132recovered filesystem is concerned, there may be many thousands of transactions
133that simply did not occur as a result of the crash. This makes it even more
134important that applications that care about their data use fsync() where they
135need to ensure application level data integrity is maintained.
136
137It should be noted that delayed logging is not an innovative new concept that
138warrants rigorous proofs to determine whether it is correct or not. The method
139of accumulating changes in memory for some period before writing them to the
140log is used effectively in many filesystems including ext3 and ext4. Hence
141no time is spent in this document trying to convince the reader that the
142concept is sound. Instead it is simply considered a "solved problem" and as
143such implementing it in XFS is purely an exercise in software engineering.
144
145The fundamental requirements for delayed logging in XFS are simple:
146
147 1. Reduce the amount of metadata written to the log by at least
148 an order of magnitude.
149 2. Supply sufficient statistics to validate Requirement #1.
150 3. Supply sufficient new tracing infrastructure to be able to debug
151 problems with the new code.
152 4. No on-disk format change (metadata or log format).
153 5. Enable and disable with a mount option.
154 6. No performance regressions for synchronous transaction workloads.
155
156Delayed Logging: Design
157-----------------------
158
159Storing Changes
160
161The problem with accumulating changes at a logical level (i.e. just using the
162existing log item dirty region tracking) is that when it comes to writing the
163changes to the log buffers, we need to ensure that the object we are formatting
164is not changing while we do this. This requires locking the object to prevent
165concurrent modification. Hence flushing the logical changes to the log would
166require us to lock every object, format them, and then unlock them again.
167
168This introduces lots of scope for deadlocks with transactions that are already
169running. For example, a transaction has object A locked and modified, but needs
170the delayed logging tracking lock to commit the transaction. However, the
171flushing thread has the delayed logging tracking lock already held, and is
172trying to get the lock on object A to flush it to the log buffer. This appears
173to be an unsolvable deadlock condition, and it was solving this problem that
174was the barrier to implementing delayed logging for so long.
175
176The solution is relatively simple - it just took a long time to recognise it.
177Put simply, the current logging code formats the changes to each item into an
178vector array that points to the changed regions in the item. The log write code
179simply copies the memory these vectors point to into the log buffer during
180transaction commit while the item is locked in the transaction. Instead of
181using the log buffer as the destination of the formatting code, we can use an
182allocated memory buffer big enough to fit the formatted vector.
183
184If we then copy the vector into the memory buffer and rewrite the vector to
185point to the memory buffer rather than the object itself, we now have a copy of
186the changes in a format that is compatible with the log buffer writing code.
187that does not require us to lock the item to access. This formatting and
188rewriting can all be done while the object is locked during transaction commit,
189resulting in a vector that is transactionally consistent and can be accessed
190without needing to lock the owning item.
191
192Hence we avoid the need to lock items when we need to flush outstanding
193asynchronous transactions to the log. The differences between the existing
194formatting method and the delayed logging formatting can be seen in the
195diagram below.
196
197Current format log vector:
198
199Object +---------------------------------------------+
200Vector 1 +----+
201Vector 2 +----+
202Vector 3 +----------+
203
204After formatting:
205
206Log Buffer +-V1-+-V2-+----V3----+
207
208Delayed logging vector:
209
210Object +---------------------------------------------+
211Vector 1 +----+
212Vector 2 +----+
213Vector 3 +----------+
214
215After formatting:
216
217Memory Buffer +-V1-+-V2-+----V3----+
218Vector 1 +----+
219Vector 2 +----+
220Vector 3 +----------+
221
222The memory buffer and associated vector need to be passed as a single object,
223but still need to be associated with the parent object so if the object is
224relogged we can replace the current memory buffer with a new memory buffer that
225contains the latest changes.
226
227The reason for keeping the vector around after we've formatted the memory
228buffer is to support splitting vectors across log buffer boundaries correctly.
229If we don't keep the vector around, we do not know where the region boundaries
230are in the item, so we'd need a new encapsulation method for regions in the log
231buffer writing (i.e. double encapsulation). This would be an on-disk format
232change and as such is not desirable. It also means we'd have to write the log
233region headers in the formatting stage, which is problematic as there is per
234region state that needs to be placed into the headers during the log write.
235
236Hence we need to keep the vector, but by attaching the memory buffer to it and
237rewriting the vector addresses to point at the memory buffer we end up with a
238self-describing object that can be passed to the log buffer write code to be
239handled in exactly the same manner as the existing log vectors are handled.
240Hence we avoid needing a new on-disk format to handle items that have been
241relogged in memory.
242
243
244Tracking Changes
245
246Now that we can record transactional changes in memory in a form that allows
247them to be used without limitations, we need to be able to track and accumulate
248them so that they can be written to the log at some later point in time. The
249log item is the natural place to store this vector and buffer, and also makes sense
250to be the object that is used to track committed objects as it will always
251exist once the object has been included in a transaction.
252
253The log item is already used to track the log items that have been written to
254the log but not yet written to disk. Such log items are considered "active"
255and as such are stored in the Active Item List (AIL) which is a LSN-ordered
256double linked list. Items are inserted into this list during log buffer IO
257completion, after which they are unpinned and can be written to disk. An object
258that is in the AIL can be relogged, which causes the object to be pinned again
259and then moved forward in the AIL when the log buffer IO completes for that
260transaction.
261
262Essentially, this shows that an item that is in the AIL can still be modified
263and relogged, so any tracking must be separate to the AIL infrastructure. As
264such, we cannot reuse the AIL list pointers for tracking committed items, nor
265can we store state in any field that is protected by the AIL lock. Hence the
266committed item tracking needs it's own locks, lists and state fields in the log
267item.
268
269Similar to the AIL, tracking of committed items is done through a new list
270called the Committed Item List (CIL). The list tracks log items that have been
271committed and have formatted memory buffers attached to them. It tracks objects
272in transaction commit order, so when an object is relogged it is removed from
273it's place in the list and re-inserted at the tail. This is entirely arbitrary
274and done to make it easy for debugging - the last items in the list are the
275ones that are most recently modified. Ordering of the CIL is not necessary for
276transactional integrity (as discussed in the next section) so the ordering is
277done for convenience/sanity of the developers.
278
279
280Delayed Logging: Checkpoints
281
282When we have a log synchronisation event, commonly known as a "log force",
283all the items in the CIL must be written into the log via the log buffers.
284We need to write these items in the order that they exist in the CIL, and they
285need to be written as an atomic transaction. The need for all the objects to be
286written as an atomic transaction comes from the requirements of relogging and
287log replay - all the changes in all the objects in a given transaction must
288either be completely replayed during log recovery, or not replayed at all. If
289a transaction is not replayed because it is not complete in the log, then
290no later transactions should be replayed, either.
291
292To fulfill this requirement, we need to write the entire CIL in a single log
293transaction. Fortunately, the XFS log code has no fixed limit on the size of a
294transaction, nor does the log replay code. The only fundamental limit is that
295the transaction cannot be larger than just under half the size of the log. The
296reason for this limit is that to find the head and tail of the log, there must
297be at least one complete transaction in the log at any given time. If a
298transaction is larger than half the log, then there is the possibility that a
299crash during the write of a such a transaction could partially overwrite the
300only complete previous transaction in the log. This will result in a recovery
301failure and an inconsistent filesystem and hence we must enforce the maximum
302size of a checkpoint to be slightly less than a half the log.
303
304Apart from this size requirement, a checkpoint transaction looks no different
305to any other transaction - it contains a transaction header, a series of
306formatted log items and a commit record at the tail. From a recovery
307perspective, the checkpoint transaction is also no different - just a lot
308bigger with a lot more items in it. The worst case effect of this is that we
309might need to tune the recovery transaction object hash size.
310
311Because the checkpoint is just another transaction and all the changes to log
312items are stored as log vectors, we can use the existing log buffer writing
313code to write the changes into the log. To do this efficiently, we need to
314minimise the time we hold the CIL locked while writing the checkpoint
315transaction. The current log write code enables us to do this easily with the
316way it separates the writing of the transaction contents (the log vectors) from
317the transaction commit record, but tracking this requires us to have a
318per-checkpoint context that travels through the log write process through to
319checkpoint completion.
320
321Hence a checkpoint has a context that tracks the state of the current
322checkpoint from initiation to checkpoint completion. A new context is initiated
323at the same time a checkpoint transaction is started. That is, when we remove
324all the current items from the CIL during a checkpoint operation, we move all
325those changes into the current checkpoint context. We then initialise a new
326context and attach that to the CIL for aggregation of new transactions.
327
328This allows us to unlock the CIL immediately after transfer of all the
329committed items and effectively allow new transactions to be issued while we
330are formatting the checkpoint into the log. It also allows concurrent
331checkpoints to be written into the log buffers in the case of log force heavy
332workloads, just like the existing transaction commit code does. This, however,
333requires that we strictly order the commit records in the log so that
334checkpoint sequence order is maintained during log replay.
335
336To ensure that we can be writing an item into a checkpoint transaction at
337the same time another transaction modifies the item and inserts the log item
338into the new CIL, then checkpoint transaction commit code cannot use log items
339to store the list of log vectors that need to be written into the transaction.
340Hence log vectors need to be able to be chained together to allow them to be
341detatched from the log items. That is, when the CIL is flushed the memory
342buffer and log vector attached to each log item needs to be attached to the
343checkpoint context so that the log item can be released. In diagrammatic form,
344the CIL would look like this before the flush:
345
346 CIL Head
347 |
348 V
349 Log Item <-> log vector 1 -> memory buffer
350 | -> vector array
351 V
352 Log Item <-> log vector 2 -> memory buffer
353 | -> vector array
354 V
355 ......
356 |
357 V
358 Log Item <-> log vector N-1 -> memory buffer
359 | -> vector array
360 V
361 Log Item <-> log vector N -> memory buffer
362 -> vector array
363
364And after the flush the CIL head is empty, and the checkpoint context log
365vector list would look like:
366
367 Checkpoint Context
368 |
369 V
370 log vector 1 -> memory buffer
371 | -> vector array
372 | -> Log Item
373 V
374 log vector 2 -> memory buffer
375 | -> vector array
376 | -> Log Item
377 V
378 ......
379 |
380 V
381 log vector N-1 -> memory buffer
382 | -> vector array
383 | -> Log Item
384 V
385 log vector N -> memory buffer
386 -> vector array
387 -> Log Item
388
389Once this transfer is done, the CIL can be unlocked and new transactions can
390start, while the checkpoint flush code works over the log vector chain to
391commit the checkpoint.
392
393Once the checkpoint is written into the log buffers, the checkpoint context is
394attached to the log buffer that the commit record was written to along with a
395completion callback. Log IO completion will call that callback, which can then
396run transaction committed processing for the log items (i.e. insert into AIL
397and unpin) in the log vector chain and then free the log vector chain and
398checkpoint context.
399
400Discussion Point: I am uncertain as to whether the log item is the most
401efficient way to track vectors, even though it seems like the natural way to do
402it. The fact that we walk the log items (in the CIL) just to chain the log
403vectors and break the link between the log item and the log vector means that
404we take a cache line hit for the log item list modification, then another for
405the log vector chaining. If we track by the log vectors, then we only need to
406break the link between the log item and the log vector, which means we should
407dirty only the log item cachelines. Normally I wouldn't be concerned about one
408vs two dirty cachelines except for the fact I've seen upwards of 80,000 log
409vectors in one checkpoint transaction. I'd guess this is a "measure and
410compare" situation that can be done after a working and reviewed implementation
411is in the dev tree....
412
413Delayed Logging: Checkpoint Sequencing
414
415One of the key aspects of the XFS transaction subsystem is that it tags
416committed transactions with the log sequence number of the transaction commit.
417This allows transactions to be issued asynchronously even though there may be
418future operations that cannot be completed until that transaction is fully
419committed to the log. In the rare case that a dependent operation occurs (e.g.
420re-using a freed metadata extent for a data extent), a special, optimised log
421force can be issued to force the dependent transaction to disk immediately.
422
423To do this, transactions need to record the LSN of the commit record of the
424transaction. This LSN comes directly from the log buffer the transaction is
425written into. While this works just fine for the existing transaction
426mechanism, it does not work for delayed logging because transactions are not
427written directly into the log buffers. Hence some other method of sequencing
428transactions is required.
429
430As discussed in the checkpoint section, delayed logging uses per-checkpoint
431contexts, and as such it is simple to assign a sequence number to each
432checkpoint. Because the switching of checkpoint contexts must be done
433atomically, it is simple to ensure that each new context has a monotonically
434increasing sequence number assigned to it without the need for an external
435atomic counter - we can just take the current context sequence number and add
436one to it for the new context.
437
438Then, instead of assigning a log buffer LSN to the transaction commit LSN
439during the commit, we can assign the current checkpoint sequence. This allows
440operations that track transactions that have not yet completed know what
441checkpoint sequence needs to be committed before they can continue. As a
442result, the code that forces the log to a specific LSN now needs to ensure that
443the log forces to a specific checkpoint.
444
445To ensure that we can do this, we need to track all the checkpoint contexts
446that are currently committing to the log. When we flush a checkpoint, the
447context gets added to a "committing" list which can be searched. When a
448checkpoint commit completes, it is removed from the committing list. Because
449the checkpoint context records the LSN of the commit record for the checkpoint,
450we can also wait on the log buffer that contains the commit record, thereby
451using the existing log force mechanisms to execute synchronous forces.
452
453It should be noted that the synchronous forces may need to be extended with
454mitigation algorithms similar to the current log buffer code to allow
455aggregation of multiple synchronous transactions if there are already
456synchronous transactions being flushed. Investigation of the performance of the
457current design is needed before making any decisions here.
458
459The main concern with log forces is to ensure that all the previous checkpoints
460are also committed to disk before the one we need to wait for. Therefore we
461need to check that all the prior contexts in the committing list are also
462complete before waiting on the one we need to complete. We do this
463synchronisation in the log force code so that we don't need to wait anywhere
464else for such serialisation - it only matters when we do a log force.
465
466The only remaining complexity is that a log force now also has to handle the
467case where the forcing sequence number is the same as the current context. That
468is, we need to flush the CIL and potentially wait for it to complete. This is a
469simple addition to the existing log forcing code to check the sequence numbers
470and push if required. Indeed, placing the current sequence checkpoint flush in
471the log force code enables the current mechanism for issuing synchronous
472transactions to remain untouched (i.e. commit an asynchronous transaction, then
473force the log at the LSN of that transaction) and so the higher level code
474behaves the same regardless of whether delayed logging is being used or not.
475
476Delayed Logging: Checkpoint Log Space Accounting
477
478The big issue for a checkpoint transaction is the log space reservation for the
479transaction. We don't know how big a checkpoint transaction is going to be
480ahead of time, nor how many log buffers it will take to write out, nor the
481number of split log vector regions are going to be used. We can track the
482amount of log space required as we add items to the commit item list, but we
483still need to reserve the space in the log for the checkpoint.
484
485A typical transaction reserves enough space in the log for the worst case space
486usage of the transaction. The reservation accounts for log record headers,
487transaction and region headers, headers for split regions, buffer tail padding,
488etc. as well as the actual space for all the changed metadata in the
489transaction. While some of this is fixed overhead, much of it is dependent on
490the size of the transaction and the number of regions being logged (the number
491of log vectors in the transaction).
492
493An example of the differences would be logging directory changes versus logging
494inode changes. If you modify lots of inode cores (e.g. chmod -R g+w *), then
495there are lots of transactions that only contain an inode core and an inode log
496format structure. That is, two vectors totaling roughly 150 bytes. If we modify
49710,000 inodes, we have about 1.5MB of metadata to write in 20,000 vectors. Each
498vector is 12 bytes, so the total to be logged is approximately 1.75MB. In
499comparison, if we are logging full directory buffers, they are typically 4KB
500each, so we in 1.5MB of directory buffers we'd have roughly 400 buffers and a
501buffer format structure for each buffer - roughly 800 vectors or 1.51MB total
502space. From this, it should be obvious that a static log space reservation is
503not particularly flexible and is difficult to select the "optimal value" for
504all workloads.
505
506Further, if we are going to use a static reservation, which bit of the entire
507reservation does it cover? We account for space used by the transaction
508reservation by tracking the space currently used by the object in the CIL and
509then calculating the increase or decrease in space used as the object is
510relogged. This allows for a checkpoint reservation to only have to account for
511log buffer metadata used such as log header records.
512
513However, even using a static reservation for just the log metadata is
514problematic. Typically log record headers use at least 16KB of log space per
5151MB of log space consumed (512 bytes per 32k) and the reservation needs to be
516large enough to handle arbitrary sized checkpoint transactions. This
517reservation needs to be made before the checkpoint is started, and we need to
518be able to reserve the space without sleeping. For a 8MB checkpoint, we need a
519reservation of around 150KB, which is a non-trivial amount of space.
520
521A static reservation needs to manipulate the log grant counters - we can take a
522permanent reservation on the space, but we still need to make sure we refresh
523the write reservation (the actual space available to the transaction) after
524every checkpoint transaction completion. Unfortunately, if this space is not
525available when required, then the regrant code will sleep waiting for it.
526
527The problem with this is that it can lead to deadlocks as we may need to commit
528checkpoints to be able to free up log space (refer back to the description of
529rolling transactions for an example of this). Hence we *must* always have
530space available in the log if we are to use static reservations, and that is
531very difficult and complex to arrange. It is possible to do, but there is a
532simpler way.
533
534The simpler way of doing this is tracking the entire log space used by the
535items in the CIL and using this to dynamically calculate the amount of log
536space required by the log metadata. If this log metadata space changes as a
537result of a transaction commit inserting a new memory buffer into the CIL, then
538the difference in space required is removed from the transaction that causes
539the change. Transactions at this level will *always* have enough space
540available in their reservation for this as they have already reserved the
541maximal amount of log metadata space they require, and such a delta reservation
542will always be less than or equal to the maximal amount in the reservation.
543
544Hence we can grow the checkpoint transaction reservation dynamically as items
545are added to the CIL and avoid the need for reserving and regranting log space
546up front. This avoids deadlocks and removes a blocking point from the
547checkpoint flush code.
548
549As mentioned early, transactions can't grow to more than half the size of the
550log. Hence as part of the reservation growing, we need to also check the size
551of the reservation against the maximum allowed transaction size. If we reach
552the maximum threshold, we need to push the CIL to the log. This is effectively
553a "background flush" and is done on demand. This is identical to
554a CIL push triggered by a log force, only that there is no waiting for the
555checkpoint commit to complete. This background push is checked and executed by
556transaction commit code.
557
558If the transaction subsystem goes idle while we still have items in the CIL,
559they will be flushed by the periodic log force issued by the xfssyncd. This log
560force will push the CIL to disk, and if the transaction subsystem stays idle,
561allow the idle log to be covered (effectively marked clean) in exactly the same
562manner that is done for the existing logging method. A discussion point is
563whether this log force needs to be done more frequently than the current rate
564which is once every 30s.
565
566
567Delayed Logging: Log Item Pinning
568
569Currently log items are pinned during transaction commit while the items are
570still locked. This happens just after the items are formatted, though it could
571be done any time before the items are unlocked. The result of this mechanism is
572that items get pinned once for every transaction that is committed to the log
573buffers. Hence items that are relogged in the log buffers will have a pin count
574for every outstanding transaction they were dirtied in. When each of these
575transactions is completed, they will unpin the item once. As a result, the item
576only becomes unpinned when all the transactions complete and there are no
577pending transactions. Thus the pinning and unpinning of a log item is symmetric
578as there is a 1:1 relationship with transaction commit and log item completion.
579
580For delayed logging, however, we have an assymetric transaction commit to
581completion relationship. Every time an object is relogged in the CIL it goes
582through the commit process without a corresponding completion being registered.
583That is, we now have a many-to-one relationship between transaction commit and
584log item completion. The result of this is that pinning and unpinning of the
585log items becomes unbalanced if we retain the "pin on transaction commit, unpin
586on transaction completion" model.
587
588To keep pin/unpin symmetry, the algorithm needs to change to a "pin on
589insertion into the CIL, unpin on checkpoint completion". In other words, the
590pinning and unpinning becomes symmetric around a checkpoint context. We have to
591pin the object the first time it is inserted into the CIL - if it is already in
592the CIL during a transaction commit, then we do not pin it again. Because there
593can be multiple outstanding checkpoint contexts, we can still see elevated pin
594counts, but as each checkpoint completes the pin count will retain the correct
595value according to it's context.
596
597Just to make matters more slightly more complex, this checkpoint level context
598for the pin count means that the pinning of an item must take place under the
599CIL commit/flush lock. If we pin the object outside this lock, we cannot
600guarantee which context the pin count is associated with. This is because of
601the fact pinning the item is dependent on whether the item is present in the
602current CIL or not. If we don't pin the CIL first before we check and pin the
603object, we have a race with CIL being flushed between the check and the pin
604(or not pinning, as the case may be). Hence we must hold the CIL flush/commit
605lock to guarantee that we pin the items correctly.
606
607Delayed Logging: Concurrent Scalability
608
609A fundamental requirement for the CIL is that accesses through transaction
610commits must scale to many concurrent commits. The current transaction commit
611code does not break down even when there are transactions coming from 2048
612processors at once. The current transaction code does not go any faster than if
613there was only one CPU using it, but it does not slow down either.
614
615As a result, the delayed logging transaction commit code needs to be designed
616for concurrency from the ground up. It is obvious that there are serialisation
617points in the design - the three important ones are:
618
619 1. Locking out new transaction commits while flushing the CIL
620 2. Adding items to the CIL and updating item space accounting
621 3. Checkpoint commit ordering
622
623Looking at the transaction commit and CIL flushing interactions, it is clear
624that we have a many-to-one interaction here. That is, the only restriction on
625the number of concurrent transactions that can be trying to commit at once is
626the amount of space available in the log for their reservations. The practical
627limit here is in the order of several hundred concurrent transactions for a
628128MB log, which means that it is generally one per CPU in a machine.
629
630The amount of time a transaction commit needs to hold out a flush is a
631relatively long period of time - the pinning of log items needs to be done
632while we are holding out a CIL flush, so at the moment that means it is held
633across the formatting of the objects into memory buffers (i.e. while memcpy()s
634are in progress). Ultimately a two pass algorithm where the formatting is done
635separately to the pinning of objects could be used to reduce the hold time of
636the transaction commit side.
637
638Because of the number of potential transaction commit side holders, the lock
639really needs to be a sleeping lock - if the CIL flush takes the lock, we do not
640want every other CPU in the machine spinning on the CIL lock. Given that
641flushing the CIL could involve walking a list of tens of thousands of log
642items, it will get held for a significant time and so spin contention is a
643significant concern. Preventing lots of CPUs spinning doing nothing is the
644main reason for choosing a sleeping lock even though nothing in either the
645transaction commit or CIL flush side sleeps with the lock held.
646
647It should also be noted that CIL flushing is also a relatively rare operation
648compared to transaction commit for asynchronous transaction workloads - only
649time will tell if using a read-write semaphore for exclusion will limit
650transaction commit concurrency due to cache line bouncing of the lock on the
651read side.
652
653The second serialisation point is on the transaction commit side where items
654are inserted into the CIL. Because transactions can enter this code
655concurrently, the CIL needs to be protected separately from the above
656commit/flush exclusion. It also needs to be an exclusive lock but it is only
657held for a very short time and so a spin lock is appropriate here. It is
658possible that this lock will become a contention point, but given the short
659hold time once per transaction I think that contention is unlikely.
660
661The final serialisation point is the checkpoint commit record ordering code
662that is run as part of the checkpoint commit and log force sequencing. The code
663path that triggers a CIL flush (i.e. whatever triggers the log force) will enter
664an ordering loop after writing all the log vectors into the log buffers but
665before writing the commit record. This loop walks the list of committing
666checkpoints and needs to block waiting for checkpoints to complete their commit
667record write. As a result it needs a lock and a wait variable. Log force
668sequencing also requires the same lock, list walk, and blocking mechanism to
669ensure completion of checkpoints.
670
671These two sequencing operations can use the mechanism even though the
672events they are waiting for are different. The checkpoint commit record
673sequencing needs to wait until checkpoint contexts contain a commit LSN
674(obtained through completion of a commit record write) while log force
675sequencing needs to wait until previous checkpoint contexts are removed from
676the committing list (i.e. they've completed). A simple wait variable and
677broadcast wakeups (thundering herds) has been used to implement these two
678serialisation queues. They use the same lock as the CIL, too. If we see too
679much contention on the CIL lock, or too many context switches as a result of
680the broadcast wakeups these operations can be put under a new spinlock and
681given separate wait lists to reduce lock contention and the number of processes
682woken by the wrong event.
683
684
685Lifecycle Changes
686
687The existing log item life cycle is as follows:
688
689 1. Transaction allocate
690 2. Transaction reserve
691 3. Lock item
692 4. Join item to transaction
693 If not already attached,
694 Allocate log item
695 Attach log item to owner item
696 Attach log item to transaction
697 5. Modify item
698 Record modifications in log item
699 6. Transaction commit
700 Pin item in memory
701 Format item into log buffer
702 Write commit LSN into transaction
703 Unlock item
704 Attach transaction to log buffer
705
706 <log buffer IO dispatched>
707 <log buffer IO completes>
708
709 7. Transaction completion
710 Mark log item committed
711 Insert log item into AIL
712 Write commit LSN into log item
713 Unpin log item
714 8. AIL traversal
715 Lock item
716 Mark log item clean
717 Flush item to disk
718
719 <item IO completion>
720
721 9. Log item removed from AIL
722 Moves log tail
723 Item unlocked
724
725Essentially, steps 1-6 operate independently from step 7, which is also
726independent of steps 8-9. An item can be locked in steps 1-6 or steps 8-9
727at the same time step 7 is occurring, but only steps 1-6 or 8-9 can occur
728at the same time. If the log item is in the AIL or between steps 6 and 7
729and steps 1-6 are re-entered, then the item is relogged. Only when steps 8-9
730are entered and completed is the object considered clean.
731
732With delayed logging, there are new steps inserted into the life cycle:
733
734 1. Transaction allocate
735 2. Transaction reserve
736 3. Lock item
737 4. Join item to transaction
738 If not already attached,
739 Allocate log item
740 Attach log item to owner item
741 Attach log item to transaction
742 5. Modify item
743 Record modifications in log item
744 6. Transaction commit
745 Pin item in memory if not pinned in CIL
746 Format item into log vector + buffer
747 Attach log vector and buffer to log item
748 Insert log item into CIL
749 Write CIL context sequence into transaction
750 Unlock item
751
752 <next log force>
753
754 7. CIL push
755 lock CIL flush
756 Chain log vectors and buffers together
757 Remove items from CIL
758 unlock CIL flush
759 write log vectors into log
760 sequence commit records
761 attach checkpoint context to log buffer
762
763 <log buffer IO dispatched>
764 <log buffer IO completes>
765
766 8. Checkpoint completion
767 Mark log item committed
768 Insert item into AIL
769 Write commit LSN into log item
770 Unpin log item
771 9. AIL traversal
772 Lock item
773 Mark log item clean
774 Flush item to disk
775 <item IO completion>
776 10. Log item removed from AIL
777 Moves log tail
778 Item unlocked
779
780From this, it can be seen that the only life cycle differences between the two
781logging methods are in the middle of the life cycle - they still have the same
782beginning and end and execution constraints. The only differences are in the
783commiting of the log items to the log itself and the completion processing.
784Hence delayed logging should not introduce any constraints on log item
785behaviour, allocation or freeing that don't already exist.
786
787As a result of this zero-impact "insertion" of delayed logging infrastructure
788and the design of the internal structures to avoid on disk format changes, we
789can basically switch between delayed logging and the existing mechanism with a
790mount option. Fundamentally, there is no reason why the log manager would not
791be able to swap methods automatically and transparently depending on load
792characteristics, but this should not be necessary if delayed logging works as
793designed.
794
795Roadmap:
796
7972.6.35 Inclusion in mainline as an experimental mount option
798 => approximately 2-3 months to merge window
799 => needs to be in xfs-dev tree in 4-6 weeks
800 => code is nearing readiness for review
801
8022.6.37 Remove experimental tag from mount option
803 => should be roughly 6 months after initial merge
804 => enough time to:
805 => gain confidence and fix problems reported by early
806 adopters (a.k.a. guinea pigs)
807 => address worst performance regressions and undesired
808 behaviours
809 => start tuning/optimising code for parallelism
810 => start tuning/optimising algorithms consuming
811 excessive CPU time
812
8132.6.39 Switch default mount option to use delayed logging
814 => should be roughly 12 months after initial merge
815 => enough time to shake out remaining problems before next round of
816 enterprise distro kernel rebases
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index 1866c27eec69..c2c6e9b39bbe 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -253,6 +253,70 @@ pin setup (e.g. controlling which pin the GPIO uses, pullup/pulldown).
253Also note that it's your responsibility to have stopped using a GPIO 253Also note that it's your responsibility to have stopped using a GPIO
254before you free it. 254before you free it.
255 255
256Considering in most cases GPIOs are actually configured right after they
257are claimed, three additional calls are defined:
258
259 /* request a single GPIO, with initial configuration specified by
260 * 'flags', identical to gpio_request() wrt other arguments and
261 * return value
262 */
263 int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
264
265 /* request multiple GPIOs in a single call
266 */
267 int gpio_request_array(struct gpio *array, size_t num);
268
269 /* release multiple GPIOs in a single call
270 */
271 void gpio_free_array(struct gpio *array, size_t num);
272
273where 'flags' is currently defined to specify the following properties:
274
275 * GPIOF_DIR_IN - to configure direction as input
276 * GPIOF_DIR_OUT - to configure direction as output
277
278 * GPIOF_INIT_LOW - as output, set initial level to LOW
279 * GPIOF_INIT_HIGH - as output, set initial level to HIGH
280
281since GPIOF_INIT_* are only valid when configured as output, so group valid
282combinations as:
283
284 * GPIOF_IN - configure as input
285 * GPIOF_OUT_INIT_LOW - configured as output, initial level LOW
286 * GPIOF_OUT_INIT_HIGH - configured as output, initial level HIGH
287
288In the future, these flags can be extended to support more properties such
289as open-drain status.
290
291Further more, to ease the claim/release of multiple GPIOs, 'struct gpio' is
292introduced to encapsulate all three fields as:
293
294 struct gpio {
295 unsigned gpio;
296 unsigned long flags;
297 const char *label;
298 };
299
300A typical example of usage:
301
302 static struct gpio leds_gpios[] = {
303 { 32, GPIOF_OUT_INIT_HIGH, "Power LED" }, /* default to ON */
304 { 33, GPIOF_OUT_INIT_LOW, "Green LED" }, /* default to OFF */
305 { 34, GPIOF_OUT_INIT_LOW, "Red LED" }, /* default to OFF */
306 { 35, GPIOF_OUT_INIT_LOW, "Blue LED" }, /* default to OFF */
307 { ... },
308 };
309
310 err = gpio_request_one(31, GPIOF_IN, "Reset Button");
311 if (err)
312 ...
313
314 err = gpio_request_array(leds_gpios, ARRAY_SIZE(leds_gpios));
315 if (err)
316 ...
317
318 gpio_free_array(leds_gpios, ARRAY_SIZE(leds_gpios));
319
256 320
257GPIOs mapped to IRQs 321GPIOs mapped to IRQs
258-------------------- 322--------------------
diff --git a/Documentation/hwmon/abituguru b/Documentation/hwmon/abituguru
index 87ffa0f5ec70..5eb3b9d5f0d5 100644
--- a/Documentation/hwmon/abituguru
+++ b/Documentation/hwmon/abituguru
@@ -30,7 +30,7 @@ Supported chips:
30 bank1_types=1,1,0,0,0,0,0,2,0,0,0,0,2,0,0,1 30 bank1_types=1,1,0,0,0,0,0,2,0,0,0,0,2,0,0,1
31 You may also need to specify the fan_sensors option for these boards 31 You may also need to specify the fan_sensors option for these boards
32 fan_sensors=5 32 fan_sensors=5
33 2) There is a seperate abituguru3 driver for these motherboards, 33 2) There is a separate abituguru3 driver for these motherboards,
34 the abituguru (without the 3 !) driver will not work on these 34 the abituguru (without the 3 !) driver will not work on these
35 motherboards (and visa versa)! 35 motherboards (and visa versa)!
36 36
diff --git a/Documentation/hwmon/adt7411 b/Documentation/hwmon/adt7411
new file mode 100644
index 000000000000..1632960f9745
--- /dev/null
+++ b/Documentation/hwmon/adt7411
@@ -0,0 +1,42 @@
1Kernel driver adt7411
2=====================
3
4Supported chips:
5 * Analog Devices ADT7411
6 Prefix: 'adt7411'
7 Addresses scanned: 0x48, 0x4a, 0x4b
8 Datasheet: Publicly available at the Analog Devices website
9
10Author: Wolfram Sang (based on adt7470 by Darrick J. Wong)
11
12Description
13-----------
14
15This driver implements support for the Analog Devices ADT7411 chip. There may
16be other chips that implement this interface.
17
18The ADT7411 can use an I2C/SMBus compatible 2-wire interface or an
19SPI-compatible 4-wire interface. It provides a 10-bit analog to digital
20converter which measures 1 temperature, vdd and 8 input voltages. It has an
21internal temperature sensor, but an external one can also be connected (one
22loses 2 inputs then). There are high- and low-limit registers for all inputs.
23
24Check the datasheet for details.
25
26sysfs-Interface
27---------------
28
29in0_input - vdd voltage input
30in[1-8]_input - analog 1-8 input
31temp1_input - temperature input
32
33Besides standard interfaces, this driver adds (0 = off, 1 = on):
34
35 adc_ref_vdd - Use vdd as reference instead of 2.25 V
36 fast_sampling - Sample at 22.5 kHz instead of 1.4 kHz, but drop filters
37 no_average - Turn off averaging over 16 samples
38
39Notes
40-----
41
42SPI, external temperature sensor and limit registers are not supported yet.
diff --git a/Documentation/hwmon/adt7473 b/Documentation/hwmon/adt7473
deleted file mode 100644
index 446612bd1fb9..000000000000
--- a/Documentation/hwmon/adt7473
+++ /dev/null
@@ -1,74 +0,0 @@
1Kernel driver adt7473
2======================
3
4Supported chips:
5 * Analog Devices ADT7473
6 Prefix: 'adt7473'
7 Addresses scanned: I2C 0x2C, 0x2D, 0x2E
8 Datasheet: Publicly available at the Analog Devices website
9
10Author: Darrick J. Wong
11
12This driver is depreacted, please use the adt7475 driver instead.
13
14Description
15-----------
16
17This driver implements support for the Analog Devices ADT7473 chip family.
18
19The ADT7473 uses the 2-wire interface compatible with the SMBUS 2.0
20specification. Using an analog to digital converter it measures three (3)
21temperatures and two (2) voltages. It has four (4) 16-bit counters for
22measuring fan speed. There are three (3) PWM outputs that can be used
23to control fan speed.
24
25A sophisticated control system for the PWM outputs is designed into the
26ADT7473 that allows fan speed to be adjusted automatically based on any of the
27three temperature sensors. Each PWM output is individually adjustable and
28programmable. Once configured, the ADT7473 will adjust the PWM outputs in
29response to the measured temperatures without further host intervention.
30This feature can also be disabled for manual control of the PWM's.
31
32Each of the measured inputs (voltage, temperature, fan speed) has
33corresponding high/low limit values. The ADT7473 will signal an ALARM if
34any measured value exceeds either limit.
35
36The ADT7473 samples all inputs continuously. The driver will not read
37the registers more often than once every other second. Further,
38configuration data is only read once per minute.
39
40Special Features
41----------------
42
43The ADT7473 have a 10-bit ADC and can therefore measure temperatures
44with 0.25 degC resolution. Temperature readings can be configured either
45for twos complement format or "Offset 64" format, wherein 63 is subtracted
46from the raw value to get the temperature value.
47
48The Analog Devices datasheet is very detailed and describes a procedure for
49determining an optimal configuration for the automatic PWM control.
50
51Configuration Notes
52-------------------
53
54Besides standard interfaces driver adds the following:
55
56* PWM Control
57
58* pwm#_auto_point1_pwm and temp#_auto_point1_temp and
59* pwm#_auto_point2_pwm and temp#_auto_point2_temp -
60
61point1: Set the pwm speed at a lower temperature bound.
62point2: Set the pwm speed at a higher temperature bound.
63
64The ADT7473 will scale the pwm between the lower and higher pwm speed when
65the temperature is between the two temperature boundaries. PWM values range
66from 0 (off) to 255 (full speed). Fan speed will be set to maximum when the
67temperature sensor associated with the PWM control exceeds temp#_max.
68
69Notes
70-----
71
72The NVIDIA binary driver presents an ADT7473 chip via an on-card i2c bus.
73Unfortunately, they fail to set the i2c adapter class, so this driver may
74fail to find the chip until the nvidia driver is patched.
diff --git a/Documentation/hwmon/asc7621 b/Documentation/hwmon/asc7621
new file mode 100644
index 000000000000..7287be7e1f21
--- /dev/null
+++ b/Documentation/hwmon/asc7621
@@ -0,0 +1,296 @@
1Kernel driver asc7621
2==================
3
4Supported chips:
5 Andigilog aSC7621 and aSC7621a
6 Prefix: 'asc7621'
7 Addresses scanned: I2C 0x2c, 0x2d, 0x2e
8 Datasheet: http://www.fairview5.com/linux/asc7621/asc7621.pdf
9
10Author:
11 George Joseph
12
13Description provided by Dave Pivin @ Andigilog:
14
15Andigilog has both the PECI and pre-PECI versions of the Heceta-6, as
16Intel calls them. Heceta-6e has high frequency PWM and Heceta-6p has
17added PECI and a 4th thermal zone. The Andigilog aSC7611 is the
18Heceta-6e part and aSC7621 is the Heceta-6p part. They are both in
19volume production, shipping to Intel and their subs.
20
21We have enhanced both parts relative to the governing Intel
22specification. First enhancement is temperature reading resolution. We
23have used registers below 20h for vendor-specific functions in addition
24to those in the Intel-specified vendor range.
25
26Our conversion process produces a result that is reported as two bytes.
27The fan speed control uses this finer value to produce a "step-less" fan
28PWM output. These two bytes are "read-locked" to guarantee that once a
29high or low byte is read, the other byte is locked-in until after the
30next read of any register. So to get an atomic reading, read high or low
31byte, then the very next read should be the opposite byte. Our data
32sheet says 10-bits of resolution, although you may find the lower bits
33are active, they are not necessarily reliable or useful externally. We
34chose not to mask them.
35
36We employ significant filtering that is user tunable as described in the
37data sheet. Our temperature reports and fan PWM outputs are very smooth
38when compared to the competition, in addition to the higher resolution
39temperature reports. The smoother PWM output does not require user
40intervention.
41
42We offer GPIO features on the former VID pins. These are open-drain
43outputs or inputs and may be used as general purpose I/O or as alarm
44outputs that are based on temperature limits. These are in 19h and 1Ah.
45
46We offer flexible mapping of temperature readings to thermal zones. Any
47temperature may be mapped to any zone, which has a default assignment
48that follows Intel's specs.
49
50Since there is a fan to zone assignment that allows for the "hotter" of
51a set of zones to control the PWM of an individual fan, but there is no
52indication to the user, we have added an indicator that shows which zone
53is currently controlling the PWM for a given fan. This is in register
5400h.
55
56Both remote diode temperature readings may be given an offset value such
57that the reported reading as well as the temperature used to determine
58PWM may be offset for system calibration purposes.
59
60PECI Extended configuration allows for having more than two domains per
61PECI address and also provides an enabling function for each PECI
62address. One could use our flexible zone assignment to have a zone
63assigned to up to 4 PECI addresses. This is not possible in the default
64Intel configuration. This would be useful in multi-CPU systems with
65individual fans on each that would benefit from individual fan control.
66This is in register 0Eh.
67
68The tachometer measurement system is flexible and able to adapt to many
69fan types. We can also support pulse-stretched PWM so that 3-wire fans
70may be used. These characteristics are in registers 04h to 07h.
71
72Finally, we have added a tach disable function that turns off the tach
73measurement system for individual tachs in order to save power. That is
74in register 75h.
75
76--
77aSC7621 Product Description
78
79The aSC7621 has a two wire digital interface compatible with SMBus 2.0.
80Using a 10-bit ADC, the aSC7621 measures the temperature of two remote diode
81connected transistors as well as its own die. Support for Platform
82Environmental Control Interface (PECI) is included.
83
84Using temperature information from these four zones, an automatic fan speed
85control algorithm is employed to minimize acoustic impact while achieving
86recommended CPU temperature under varying operational loads.
87
88To set fan speed, the aSC7621 has three independent pulse width modulation
89(PWM) outputs that are controlled by one, or a combination of three,
90temperature zones. Both high- and low-frequency PWM ranges are supported.
91
92The aSC7621 also includes a digital filter that can be invoked to smooth
93temperature readings for better control of fan speed and minimum acoustic
94impact.
95
96The aSC7621 has tachometer inputs to measure fan speed on up to four fans.
97Limit and status registers for all measured values are included to alert
98the system host that any measurements are outside of programmed limits
99via status registers.
100
101System voltages of VCCP, 2.5V, 3.3V, 5.0V, and 12V motherboard power are
102monitored efficiently with internal scaling resistors.
103
104Features
105- Supports PECI interface and monitors internal and remote thermal diodes
106- 2-wire, SMBus 2.0 compliant, serial interface
107- 10-bit ADC
108- Monitors VCCP, 2.5V, 3.3V, 5.0V, and 12V motherboard/processor supplies
109- Programmable autonomous fan control based on temperature readings
110- Noise filtering of temperature reading for fan speed control
111- 0.25C digital temperature sensor resolution
112- 3 PWM fan speed control outputs for 2-, 3- or 4-wire fans and up to 4 fan
113 tachometer inputs
114- Enhanced measured temperature to Temperature Zone assignment.
115- Provides high and low PWM frequency ranges
116- 3 GPIO pins for custom use
117- 24-Lead QSOP package
118
119Configuration Notes
120===================
121
122Except where noted below, the sysfs entries created by this driver follow
123the standards defined in "sysfs-interface".
124
125temp1_source
126 0 (default) peci_legacy = 0, Remote 1 Temperature
127 peci_legacy = 1, PECI Processor Temperature 0
128 1 Remote 1 Temperature
129 2 Remote 2 Temperature
130 3 Internal Temperature
131 4 PECI Processor Temperature 0
132 5 PECI Processor Temperature 1
133 6 PECI Processor Temperature 2
134 7 PECI Processor Temperature 3
135
136temp2_source
137 0 (default) Internal Temperature
138 1 Remote 1 Temperature
139 2 Remote 2 Temperature
140 3 Internal Temperature
141 4 PECI Processor Temperature 0
142 5 PECI Processor Temperature 1
143 6 PECI Processor Temperature 2
144 7 PECI Processor Temperature 3
145
146temp3_source
147 0 (default) Remote 2 Temperature
148 1 Remote 1 Temperature
149 2 Remote 2 Temperature
150 3 Internal Temperature
151 4 PECI Processor Temperature 0
152 5 PECI Processor Temperature 1
153 6 PECI Processor Temperature 2
154 7 PECI Processor Temperature 3
155
156temp4_source
157 0 (default) peci_legacy = 0, PECI Processor Temperature 0
158 peci_legacy = 1, Remote 1 Temperature
159 1 Remote 1 Temperature
160 2 Remote 2 Temperature
161 3 Internal Temperature
162 4 PECI Processor Temperature 0
163 5 PECI Processor Temperature 1
164 6 PECI Processor Temperature 2
165 7 PECI Processor Temperature 3
166
167temp[1-4]_smoothing_enable
168temp[1-4]_smoothing_time
169 Smooths spikes in temp readings caused by noise.
170 Valid values in milliseconds are:
171 35000
172 17600
173 11800
174 7000
175 4400
176 3000
177 1600
178 800
179
180temp[1-4]_crit
181 When the corresponding zone temperature reaches this value,
182 ALL pwm outputs will got to 100%.
183
184temp[5-8]_input
185temp[5-8]_enable
186 The aSC7621 can also read temperatures provided by the processor
187 via the PECI bus. Usually these are "core" temps and are relative
188 to the point where the automatic thermal control circuit starts
189 throttling. This means that these are usually negative numbers.
190
191pwm[1-3]_enable
192 0 Fan off.
193 1 Fan on manual control.
194 2 Fan on automatic control and will run at the minimum pwm
195 if the temperature for the zone is below the minimum.
196 3 Fan on automatic control but will be off if the temperature
197 for the zone is below the minimum.
198 4-254 Ignored.
199 255 Fan on full.
200
201pwm[1-3]_auto_channels
202 Bitmap as described in sysctl-interface with the following
203 exceptions...
204 Only the following combination of zones (and their corresponding masks)
205 are valid:
206 1
207 2
208 3
209 2,3
210 1,2,3
211 4
212 1,2,3,4
213
214 Special values:
215 0 Disabled.
216 16 Fan on manual control.
217 31 Fan on full.
218
219
220pwm[1-3]_invert
221 When set, inverts the meaning of pwm[1-3].
222 i.e. when pwm = 0, the fan will be on full and
223 when pwm = 255 the fan will be off.
224
225pwm[1-3]_freq
226 PWM frequency in Hz
227 Valid values in Hz are:
228
229 10
230 15
231 23
232 30 (default)
233 38
234 47
235 62
236 94
237 23000
238 24000
239 25000
240 26000
241 27000
242 28000
243 29000
244 30000
245
246 Setting any other value will be ignored.
247
248peci_enable
249 Enables or disables PECI
250
251peci_avg
252 Input filter average time.
253
254 0 0 Sec. (no Smoothing) (default)
255 1 0.25 Sec.
256 2 0.5 Sec.
257 3 1.0 Sec.
258 4 2.0 Sec.
259 5 4.0 Sec.
260 6 8.0 Sec.
261 7 0.0 Sec.
262
263peci_legacy
264
265 0 Standard Mode (default)
266 Remote Diode 1 reading is associated with
267 Temperature Zone 1, PECI is associated with
268 Zone 4
269
270 1 Legacy Mode
271 PECI is associated with Temperature Zone 1,
272 Remote Diode 1 is associated with Zone 4
273
274peci_diode
275 Diode filter
276
277 0 0.25 Sec.
278 1 1.1 Sec.
279 2 2.4 Sec. (default)
280 3 3.4 Sec.
281 4 5.0 Sec.
282 5 6.8 Sec.
283 6 10.2 Sec.
284 7 16.4 Sec.
285
286peci_4domain
287 Four domain enable
288
289 0 1 or 2 Domains for enabled processors (default)
290 1 3 or 4 Domains for enabled processors
291
292peci_domain
293 Domain
294
295 0 Processor contains a single domain (0) (default)
296 1 Processor contains two domains (0,1)
diff --git a/Documentation/hwmon/it87 b/Documentation/hwmon/it87
index f9ba96c0ac4a..8d08bf0d38ed 100644
--- a/Documentation/hwmon/it87
+++ b/Documentation/hwmon/it87
@@ -5,31 +5,23 @@ Supported chips:
5 * IT8705F 5 * IT8705F
6 Prefix: 'it87' 6 Prefix: 'it87'
7 Addresses scanned: from Super I/O config space (8 I/O ports) 7 Addresses scanned: from Super I/O config space (8 I/O ports)
8 Datasheet: Publicly available at the ITE website 8 Datasheet: Once publicly available at the ITE website, but no longer
9 http://www.ite.com.tw/product_info/file/pc/IT8705F_V.0.4.1.pdf
10 * IT8712F 9 * IT8712F
11 Prefix: 'it8712' 10 Prefix: 'it8712'
12 Addresses scanned: from Super I/O config space (8 I/O ports) 11 Addresses scanned: from Super I/O config space (8 I/O ports)
13 Datasheet: Publicly available at the ITE website 12 Datasheet: Once publicly available at the ITE website, but no longer
14 http://www.ite.com.tw/product_info/file/pc/IT8712F_V0.9.1.pdf
15 http://www.ite.com.tw/product_info/file/pc/Errata%20V0.1%20for%20IT8712F%20V0.9.1.pdf
16 http://www.ite.com.tw/product_info/file/pc/IT8712F_V0.9.3.pdf
17 * IT8716F/IT8726F 13 * IT8716F/IT8726F
18 Prefix: 'it8716' 14 Prefix: 'it8716'
19 Addresses scanned: from Super I/O config space (8 I/O ports) 15 Addresses scanned: from Super I/O config space (8 I/O ports)
20 Datasheet: Publicly available at the ITE website 16 Datasheet: Once publicly available at the ITE website, but no longer
21 http://www.ite.com.tw/product_info/file/pc/IT8716F_V0.3.ZIP
22 http://www.ite.com.tw/product_info/file/pc/IT8726F_V0.3.pdf
23 * IT8718F 17 * IT8718F
24 Prefix: 'it8718' 18 Prefix: 'it8718'
25 Addresses scanned: from Super I/O config space (8 I/O ports) 19 Addresses scanned: from Super I/O config space (8 I/O ports)
26 Datasheet: Publicly available at the ITE website 20 Datasheet: Once publicly available at the ITE website, but no longer
27 http://www.ite.com.tw/product_info/file/pc/IT8718F_V0.2.zip
28 http://www.ite.com.tw/product_info/file/pc/IT8718F_V0%203_(for%20C%20version).zip
29 * IT8720F 21 * IT8720F
30 Prefix: 'it8720' 22 Prefix: 'it8720'
31 Addresses scanned: from Super I/O config space (8 I/O ports) 23 Addresses scanned: from Super I/O config space (8 I/O ports)
32 Datasheet: Not yet publicly available. 24 Datasheet: Not publicly available
33 * SiS950 [clone of IT8705F] 25 * SiS950 [clone of IT8705F]
34 Prefix: 'it87' 26 Prefix: 'it87'
35 Addresses scanned: from Super I/O config space (8 I/O ports) 27 Addresses scanned: from Super I/O config space (8 I/O ports)
@@ -136,6 +128,10 @@ registers are read whenever any data is read (unless it is less than 1.5
136seconds since the last update). This means that you can easily miss 128seconds since the last update). This means that you can easily miss
137once-only alarms. 129once-only alarms.
138 130
131Out-of-limit readings can also result in beeping, if the chip is properly
132wired and configured. Beeping can be enabled or disabled per sensor type
133(temperatures, voltages and fans.)
134
139The IT87xx only updates its values each 1.5 seconds; reading it more often 135The IT87xx only updates its values each 1.5 seconds; reading it more often
140will do no harm, but will return 'old' values. 136will do no harm, but will return 'old' values.
141 137
@@ -150,11 +146,38 @@ Fan speed control
150----------------- 146-----------------
151 147
152The fan speed control features are limited to manual PWM mode. Automatic 148The fan speed control features are limited to manual PWM mode. Automatic
153"Smart Guardian" mode control handling is not implemented. However 149"Smart Guardian" mode control handling is only implemented for older chips
154if you want to go for "manual mode" just write 1 to pwmN_enable. 150(see below.) However if you want to go for "manual mode" just write 1 to
151pwmN_enable.
155 152
156If you are only able to control the fan speed with very small PWM values, 153If you are only able to control the fan speed with very small PWM values,
157try lowering the PWM base frequency (pwm1_freq). Depending on the fan, 154try lowering the PWM base frequency (pwm1_freq). Depending on the fan,
158it may give you a somewhat greater control range. The same frequency is 155it may give you a somewhat greater control range. The same frequency is
159used to drive all fan outputs, which is why pwm2_freq and pwm3_freq are 156used to drive all fan outputs, which is why pwm2_freq and pwm3_freq are
160read-only. 157read-only.
158
159
160Automatic fan speed control (old interface)
161-------------------------------------------
162
163The driver supports the old interface to automatic fan speed control
164which is implemented by IT8705F chips up to revision F and IT8712F
165chips up to revision G.
166
167This interface implements 4 temperature vs. PWM output trip points.
168The PWM output of trip point 4 is always the maximum value (fan running
169at full speed) while the PWM output of the other 3 trip points can be
170freely chosen. The temperature of all 4 trip points can be freely chosen.
171Additionally, trip point 1 has an hysteresis temperature attached, to
172prevent fast switching between fan on and off.
173
174The chip automatically computes the PWM output value based on the input
175temperature, based on this simple rule: if the temperature value is
176between trip point N and trip point N+1 then the PWM output value is
177the one of trip point N. The automatic control mode is less flexible
178than the manual control mode, but it reacts faster, is more robust and
179doesn't use CPU cycles.
180
181Trip points must be set properly before switching to automatic fan speed
182control mode. The driver will perform basic integrity checks before
183actually switching to automatic control mode.
diff --git a/Documentation/hwmon/lm85 b/Documentation/hwmon/lm85
index a13680871bc7..a76aefeeb68a 100644
--- a/Documentation/hwmon/lm85
+++ b/Documentation/hwmon/lm85
@@ -157,7 +157,7 @@ temperature configuration points:
157 157
158There are three PWM outputs. The LM85 datasheet suggests that the 158There are three PWM outputs. The LM85 datasheet suggests that the
159pwm3 output control both fan3 and fan4. Each PWM can be individually 159pwm3 output control both fan3 and fan4. Each PWM can be individually
160configured and assigned to a zone for it's control value. Each PWM can be 160configured and assigned to a zone for its control value. Each PWM can be
161configured individually according to the following options. 161configured individually according to the following options.
162 162
163* pwm#_auto_pwm_min - this specifies the PWM value for temp#_auto_temp_off 163* pwm#_auto_pwm_min - this specifies the PWM value for temp#_auto_temp_off
diff --git a/Documentation/hwmon/lm90 b/Documentation/hwmon/lm90
index 93d8e3d55150..6a03dd4bcc94 100644
--- a/Documentation/hwmon/lm90
+++ b/Documentation/hwmon/lm90
@@ -84,6 +84,10 @@ Supported chips:
84 Addresses scanned: I2C 0x4c 84 Addresses scanned: I2C 0x4c
85 Datasheet: Publicly available at the Maxim website 85 Datasheet: Publicly available at the Maxim website
86 http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3500 86 http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3500
87 * Winbond/Nuvoton W83L771AWG/ASG
88 Prefix: 'w83l771'
89 Addresses scanned: I2C 0x4c
90 Datasheet: Not publicly available, can be requested from Nuvoton
87 91
88 92
89Author: Jean Delvare <khali@linux-fr.org> 93Author: Jean Delvare <khali@linux-fr.org>
@@ -147,6 +151,12 @@ MAX6680 and MAX6681:
147 * Selectable address 151 * Selectable address
148 * Remote sensor type selection 152 * Remote sensor type selection
149 153
154W83L771AWG/ASG
155 * The AWG and ASG variants only differ in package format.
156 * Filter and alert configuration register at 0xBF
157 * Diode ideality factor configuration (remote sensor) at 0xE3
158 * Moving average (depending on conversion rate)
159
150All temperature values are given in degrees Celsius. Resolution 160All temperature values are given in degrees Celsius. Resolution
151is 1.0 degree for the local temperature, 0.125 degree for the remote 161is 1.0 degree for the local temperature, 0.125 degree for the remote
152temperature, except for the MAX6657, MAX6658 and MAX6659 which have a 162temperature, except for the MAX6657, MAX6658 and MAX6659 which have a
@@ -163,6 +173,18 @@ The lm90 driver will not update its values more frequently than every
163other second; reading them more often will do no harm, but will return 173other second; reading them more often will do no harm, but will return
164'old' values. 174'old' values.
165 175
176SMBus Alert Support
177-------------------
178
179This driver has basic support for SMBus alert. When an alert is received,
180the status register is read and the faulty temperature channel is logged.
181
182The Analog Devices chips (ADM1032 and ADT7461) do not implement the SMBus
183alert protocol properly so additional care is needed: the ALERT output is
184disabled when an alert is received, and is re-enabled only when the alarm
185is gone. Otherwise the chip would block alerts from other chips in the bus
186as long as the alarm is active.
187
166PEC Support 188PEC Support
167----------- 189-----------
168 190
diff --git a/Documentation/i2c/busses/i2c-i801 b/Documentation/i2c/busses/i2c-i801
index 81c0c59a60ea..e307914a3eda 100644
--- a/Documentation/i2c/busses/i2c-i801
+++ b/Documentation/i2c/busses/i2c-i801
@@ -15,7 +15,8 @@ Supported adapters:
15 * Intel 82801I (ICH9) 15 * Intel 82801I (ICH9)
16 * Intel EP80579 (Tolapai) 16 * Intel EP80579 (Tolapai)
17 * Intel 82801JI (ICH10) 17 * Intel 82801JI (ICH10)
18 * Intel PCH 18 * Intel 3400/5 Series (PCH)
19 * Intel Cougar Point (PCH)
19 Datasheets: Publicly available at the Intel website 20 Datasheets: Publicly available at the Intel website
20 21
21Authors: 22Authors:
@@ -26,7 +27,13 @@ Authors:
26Module Parameters 27Module Parameters
27----------------- 28-----------------
28 29
29None. 30* disable_features (bit vector)
31Disable selected features normally supported by the device. This makes it
32possible to work around possible driver or hardware bugs if the feature in
33question doesn't work as intended for whatever reason. Bit values:
34 1 disable SMBus PEC
35 2 disable the block buffer
36 8 disable the I2C block read functionality
30 37
31 38
32Description 39Description
diff --git a/Documentation/i2c/busses/i2c-parport b/Documentation/i2c/busses/i2c-parport
index dceaba1ad930..2461c7b53b2c 100644
--- a/Documentation/i2c/busses/i2c-parport
+++ b/Documentation/i2c/busses/i2c-parport
@@ -29,6 +29,9 @@ can be easily added when needed.
29Earlier kernels defaulted to type=0 (Philips). But now, if the type 29Earlier kernels defaulted to type=0 (Philips). But now, if the type
30parameter is missing, the driver will simply fail to initialize. 30parameter is missing, the driver will simply fail to initialize.
31 31
32SMBus alert support is available on adapters which have this line properly
33connected to the parallel port's interrupt pin.
34
32 35
33Building your own adapter 36Building your own adapter
34------------------------- 37-------------------------
diff --git a/Documentation/i2c/busses/i2c-parport-light b/Documentation/i2c/busses/i2c-parport-light
index 287436478520..bdc9cbb2e0f2 100644
--- a/Documentation/i2c/busses/i2c-parport-light
+++ b/Documentation/i2c/busses/i2c-parport-light
@@ -9,3 +9,14 @@ parport handling is not an option. The drawback is a reduced portability
9and the impossibility to daisy-chain other parallel port devices. 9and the impossibility to daisy-chain other parallel port devices.
10 10
11Please see i2c-parport for documentation. 11Please see i2c-parport for documentation.
12
13Module parameters:
14
15* type: type of adapter (see i2c-parport or modinfo)
16
17* base: base I/O address
18 Default is 0x378 which is fairly common for parallel ports, at least on PC.
19
20* irq: optional IRQ
21 This must be passed if you want SMBus alert support, assuming your adapter
22 actually supports this.
diff --git a/Documentation/i2c/smbus-protocol b/Documentation/i2c/smbus-protocol
index 9df47441f0e7..7c19d1a2bea0 100644
--- a/Documentation/i2c/smbus-protocol
+++ b/Documentation/i2c/smbus-protocol
@@ -185,6 +185,22 @@ the protocol. All ARP communications use slave address 0x61 and
185require PEC checksums. 185require PEC checksums.
186 186
187 187
188SMBus Alert
189===========
190
191SMBus Alert was introduced in Revision 1.0 of the specification.
192
193The SMBus alert protocol allows several SMBus slave devices to share a
194single interrupt pin on the SMBus master, while still allowing the master
195to know which slave triggered the interrupt.
196
197This is implemented the following way in the Linux kernel:
198* I2C bus drivers which support SMBus alert should call
199 i2c_setup_smbus_alert() to setup SMBus alert support.
200* I2C drivers for devices which can trigger SMBus alerts should implement
201 the optional alert() callback.
202
203
188I2C Block Transactions 204I2C Block Transactions
189====================== 205======================
190 206
diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients
index 0a74603eb671..5ebf5af1d716 100644
--- a/Documentation/i2c/writing-clients
+++ b/Documentation/i2c/writing-clients
@@ -74,6 +74,11 @@ structure at all. You should use this to keep device-specific data.
74 /* retrieve the value */ 74 /* retrieve the value */
75 void *i2c_get_clientdata(const struct i2c_client *client); 75 void *i2c_get_clientdata(const struct i2c_client *client);
76 76
77Note that starting with kernel 2.6.34, you don't have to set the `data' field
78to NULL in remove() or if probe() failed anymore. The i2c-core does this
79automatically on these occasions. Those are also the only times the core will
80touch this field.
81
77 82
78Accessing the client 83Accessing the client
79==================== 84====================
@@ -318,8 +323,9 @@ Plain I2C communication
318These routines read and write some bytes from/to a client. The client 323These routines read and write some bytes from/to a client. The client
319contains the i2c address, so you do not have to include it. The second 324contains the i2c address, so you do not have to include it. The second
320parameter contains the bytes to read/write, the third the number of bytes 325parameter contains the bytes to read/write, the third the number of bytes
321to read/write (must be less than the length of the buffer.) Returned is 326to read/write (must be less than the length of the buffer, also should be
322the actual number of bytes read/written. 327less than 64k since msg.len is u16.) Returned is the actual number of bytes
328read/written.
323 329
324 int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msg, 330 int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msg,
325 int num); 331 int num);
diff --git a/Documentation/init.txt b/Documentation/init.txt
new file mode 100644
index 000000000000..535ad5e82b98
--- /dev/null
+++ b/Documentation/init.txt
@@ -0,0 +1,49 @@
1Explaining the dreaded "No init found." boot hang message
2=========================================================
3
4OK, so you've got this pretty unintuitive message (currently located
5in init/main.c) and are wondering what the H*** went wrong.
6Some high-level reasons for failure (listed roughly in order of execution)
7to load the init binary are:
8A) Unable to mount root FS
9B) init binary doesn't exist on rootfs
10C) broken console device
11D) binary exists but dependencies not available
12E) binary cannot be loaded
13
14Detailed explanations:
150) Set "debug" kernel parameter (in bootloader config file or CONFIG_CMDLINE)
16 to get more detailed kernel messages.
17A) make sure you have the correct root FS type
18 (and root= kernel parameter points to the correct partition),
19 required drivers such as storage hardware (such as SCSI or USB!)
20 and filesystem (ext3, jffs2 etc.) are builtin (alternatively as modules,
21 to be pre-loaded by an initrd)
22C) Possibly a conflict in console= setup --> initial console unavailable.
23 E.g. some serial consoles are unreliable due to serial IRQ issues (e.g.
24 missing interrupt-based configuration).
25 Try using a different console= device or e.g. netconsole= .
26D) e.g. required library dependencies of the init binary such as
27 /lib/ld-linux.so.2 missing or broken. Use readelf -d <INIT>|grep NEEDED
28 to find out which libraries are required.
29E) make sure the binary's architecture matches your hardware.
30 E.g. i386 vs. x86_64 mismatch, or trying to load x86 on ARM hardware.
31 In case you tried loading a non-binary file here (shell script?),
32 you should make sure that the script specifies an interpreter in its shebang
33 header line (#!/...) that is fully working (including its library
34 dependencies). And before tackling scripts, better first test a simple
35 non-script binary such as /bin/sh and confirm its successful execution.
36 To find out more, add code to init/main.c to display kernel_execve()s
37 return values.
38
39Please extend this explanation whenever you find new failure causes
40(after all loading the init binary is a CRITICAL and hard transition step
41which needs to be made as painless as possible), then submit patch to LKML.
42Further TODOs:
43- Implement the various run_init_process() invocations via a struct array
44 which can then store the kernel_execve() result value and on failure
45 log it all by iterating over _all_ results (very important usability fix).
46- try to make the implementation itself more helpful in general,
47 e.g. by providing additional error messages at affected places.
48
49Andreas Mohr <andi at lisas period de>
diff --git a/Documentation/input/elantech.txt b/Documentation/input/elantech.txt
index a10c3b6ba7c4..56941ae1f5db 100644
--- a/Documentation/input/elantech.txt
+++ b/Documentation/input/elantech.txt
@@ -333,14 +333,14 @@ byte 0:
333byte 1: 333byte 1:
334 334
335 bit 7 6 5 4 3 2 1 0 335 bit 7 6 5 4 3 2 1 0
336 x15 x14 x13 x12 x11 x10 x9 x8 336 . . . . . x10 x9 x8
337 337
338byte 2: 338byte 2:
339 339
340 bit 7 6 5 4 3 2 1 0 340 bit 7 6 5 4 3 2 1 0
341 x7 x6 x5 x4 x4 x2 x1 x0 341 x7 x6 x5 x4 x4 x2 x1 x0
342 342
343 x15..x0 = absolute x value (horizontal) 343 x10..x0 = absolute x value (horizontal)
344 344
345byte 3: 345byte 3:
346 346
@@ -350,14 +350,14 @@ byte 3:
350byte 4: 350byte 4:
351 351
352 bit 7 6 5 4 3 2 1 0 352 bit 7 6 5 4 3 2 1 0
353 y15 y14 y13 y12 y11 y10 y8 y8 353 . . . . . . y9 y8
354 354
355byte 5: 355byte 5:
356 356
357 bit 7 6 5 4 3 2 1 0 357 bit 7 6 5 4 3 2 1 0
358 y7 y6 y5 y4 y3 y2 y1 y0 358 y7 y6 y5 y4 y3 y2 y1 y0
359 359
360 y15..y0 = absolute y value (vertical) 360 y9..y0 = absolute y value (vertical)
361 361
362 362
3634.2.2 Two finger touch 3634.2.2 Two finger touch
diff --git a/Documentation/input/joystick.txt b/Documentation/input/joystick.txt
index 154d767b2acb..8007b7ca87bf 100644
--- a/Documentation/input/joystick.txt
+++ b/Documentation/input/joystick.txt
@@ -402,7 +402,7 @@ for the port of the SoundFusion is supported by the cs461x.c module.
402~~~~~~~~~~~~~~~~~~~~~~~~ 402~~~~~~~~~~~~~~~~~~~~~~~~
403 The Live! has a special PCI gameport, which, although it doesn't provide 403 The Live! has a special PCI gameport, which, although it doesn't provide
404any "Enhanced" stuff like 4DWave and friends, is quite a bit faster than 404any "Enhanced" stuff like 4DWave and friends, is quite a bit faster than
405it's ISA counterparts. It also requires special support, hence the 405its ISA counterparts. It also requires special support, hence the
406emu10k1-gp.c module for it instead of the normal ns558.c one. 406emu10k1-gp.c module for it instead of the normal ns558.c one.
407 407
4083.15 SoundBlaster 64 and 128 - ES1370 and ES1371, ESS Solo1 and S3 SonicVibes 4083.15 SoundBlaster 64 and 128 - ES1370 and ES1371, ESS Solo1 and S3 SonicVibes
diff --git a/Documentation/input/multi-touch-protocol.txt b/Documentation/input/multi-touch-protocol.txt
index 8490480ce432..c0fc1c75fd88 100644
--- a/Documentation/input/multi-touch-protocol.txt
+++ b/Documentation/input/multi-touch-protocol.txt
@@ -68,6 +68,22 @@ like:
68 SYN_MT_REPORT 68 SYN_MT_REPORT
69 SYN_REPORT 69 SYN_REPORT
70 70
71Here is the sequence after lifting one of the fingers:
72
73 ABS_MT_POSITION_X
74 ABS_MT_POSITION_Y
75 SYN_MT_REPORT
76 SYN_REPORT
77
78And here is the sequence after lifting the remaining finger:
79
80 SYN_MT_REPORT
81 SYN_REPORT
82
83If the driver reports one of BTN_TOUCH or ABS_PRESSURE in addition to the
84ABS_MT events, the last SYN_MT_REPORT event may be omitted. Otherwise, the
85last SYN_REPORT will be dropped by the input core, resulting in no
86zero-finger event reaching userland.
71 87
72Event Semantics 88Event Semantics
73--------------- 89---------------
@@ -217,11 +233,6 @@ where examples can be found.
217difference between the contact position and the approaching tool position 233difference between the contact position and the approaching tool position
218could be used to derive tilt. 234could be used to derive tilt.
219[2] The list can of course be extended. 235[2] The list can of course be extended.
220[3] The multi-touch X driver is currently in the prototyping stage. At the 236[3] Multitouch X driver project: http://bitmath.org/code/multitouch/.
221time of writing (April 2009), the MT protocol is not yet merged, and the
222prototype implements finger matching, basic mouse support and two-finger
223scrolling. The project aims at improving the quality of current multi-touch
224functionality available in the Synaptics X driver, and in addition
225implement more advanced gestures.
226[4] See the section on event computation. 237[4] See the section on event computation.
227[5] See the section on finger tracking. 238[5] See the section on finger tracking.
diff --git a/Documentation/input/rotary-encoder.txt b/Documentation/input/rotary-encoder.txt
index 3a6aec40c0b0..8b4129de1d2d 100644
--- a/Documentation/input/rotary-encoder.txt
+++ b/Documentation/input/rotary-encoder.txt
@@ -75,7 +75,7 @@ and the number of steps or will clamp at the maximum and zero depending on
75the configuration. 75the configuration.
76 76
77Because GPIO to IRQ mapping is platform specific, this information must 77Because GPIO to IRQ mapping is platform specific, this information must
78be given in seperately to the driver. See the example below. 78be given in separately to the driver. See the example below.
79 79
80---------<snip>--------- 80---------<snip>---------
81 81
diff --git a/Documentation/input/sentelic.txt b/Documentation/input/sentelic.txt
index f7160a2fb6a2..b35affd5c649 100644
--- a/Documentation/input/sentelic.txt
+++ b/Documentation/input/sentelic.txt
@@ -1,5 +1,5 @@
1Copyright (C) 2002-2008 Sentelic Corporation. 1Copyright (C) 2002-2010 Sentelic Corporation.
2Last update: Oct-31-2008 2Last update: Jan-13-2010
3 3
4============================================================================== 4==============================================================================
5* Finger Sensing Pad Intellimouse Mode(scrolling wheel, 4th and 5th buttons) 5* Finger Sensing Pad Intellimouse Mode(scrolling wheel, 4th and 5th buttons)
@@ -44,7 +44,7 @@ B) MSID 6: Horizontal and Vertical scrolling.
44Packet 1 44Packet 1
45 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 45 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
46BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| 46BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
47 1 |Y|X|y|x|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 | | |B|F|l|r|u|d| 47 1 |Y|X|y|x|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 | | |B|F|r|l|u|d|
48 |---------------| |---------------| |---------------| |---------------| 48 |---------------| |---------------| |---------------| |---------------|
49 49
50Byte 1: Bit7 => Y overflow 50Byte 1: Bit7 => Y overflow
@@ -59,15 +59,15 @@ Byte 2: X Movement(9-bit 2's complement integers)
59Byte 3: Y Movement(9-bit 2's complement integers) 59Byte 3: Y Movement(9-bit 2's complement integers)
60Byte 4: Bit0 => the Vertical scrolling movement downward. 60Byte 4: Bit0 => the Vertical scrolling movement downward.
61 Bit1 => the Vertical scrolling movement upward. 61 Bit1 => the Vertical scrolling movement upward.
62 Bit2 => the Vertical scrolling movement rightward. 62 Bit2 => the Horizontal scrolling movement leftward.
63 Bit3 => the Vertical scrolling movement leftward. 63 Bit3 => the Horizontal scrolling movement rightward.
64 Bit4 => 1 = 4th mouse button is pressed, Forward one page. 64 Bit4 => 1 = 4th mouse button is pressed, Forward one page.
65 0 = 4th mouse button is not pressed. 65 0 = 4th mouse button is not pressed.
66 Bit5 => 1 = 5th mouse button is pressed, Backward one page. 66 Bit5 => 1 = 5th mouse button is pressed, Backward one page.
67 0 = 5th mouse button is not pressed. 67 0 = 5th mouse button is not pressed.
68 68
69C) MSID 7: 69C) MSID 7:
70# FSP uses 2 packets(8 Bytes) data to represent Absolute Position 70# FSP uses 2 packets (8 Bytes) to represent Absolute Position.
71 so we have PACKET NUMBER to identify packets. 71 so we have PACKET NUMBER to identify packets.
72 If PACKET NUMBER is 0, the packet is Packet 1. 72 If PACKET NUMBER is 0, the packet is Packet 1.
73 If PACKET NUMBER is 1, the packet is Packet 2. 73 If PACKET NUMBER is 1, the packet is Packet 2.
@@ -129,7 +129,7 @@ Byte 3: Message Type => 0x00 (Disabled)
129Byte 4: Bit7~Bit0 => Don't Care 129Byte 4: Bit7~Bit0 => Don't Care
130 130
131============================================================================== 131==============================================================================
132* Absolute position for STL3888-A0. 132* Absolute position for STL3888-Ax.
133============================================================================== 133==============================================================================
134Packet 1 (ABSOLUTE POSITION) 134Packet 1 (ABSOLUTE POSITION)
135 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 135 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
@@ -179,14 +179,14 @@ Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0])
179 Bit5~Bit4 => y2_g 179 Bit5~Bit4 => y2_g
180 Bit7~Bit6 => x2_g 180 Bit7~Bit6 => x2_g
181 181
182Notify Packet for STL3888-A0 182Notify Packet for STL3888-Ax
183 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 183 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
184BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| 184BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
185 1 |1|0|1|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|d|u|0|0|0|0| 185 1 |1|0|1|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|d|u|0|0|0|0|
186 |---------------| |---------------| |---------------| |---------------| 186 |---------------| |---------------| |---------------| |---------------|
187 187
188Byte 1: Bit7~Bit6 => 00, Normal data packet 188Byte 1: Bit7~Bit6 => 00, Normal data packet
189 => 01, Absolute coordination packet 189 => 01, Absolute coordinates packet
190 => 10, Notify packet 190 => 10, Notify packet
191 Bit5 => 1 191 Bit5 => 1
192 Bit4 => when in absolute coordinates mode (valid when EN_PKT_GO is 1): 192 Bit4 => when in absolute coordinates mode (valid when EN_PKT_GO is 1):
@@ -205,15 +205,106 @@ Byte 4: Bit7 => scroll right button
205 Bit6 => scroll left button 205 Bit6 => scroll left button
206 Bit5 => scroll down button 206 Bit5 => scroll down button
207 Bit4 => scroll up button 207 Bit4 => scroll up button
208 * Note that if gesture and additional button (Bit4~Bit7) 208 * Note that if gesture and additional buttoni (Bit4~Bit7)
209 happen at the same time, the button information will not 209 happen at the same time, the button information will not
210 be sent. 210 be sent.
211 Bit3~Bit0 => Reserved
212
213Sample sequence of Multi-finger, Multi-coordinate mode:
214
215 notify packet (valid bit == 1), abs pkt 1, abs pkt 2, abs pkt 1,
216 abs pkt 2, ..., notify packet (valid bit == 0)
217
218==============================================================================
219* Absolute position for STL3888-B0.
220==============================================================================
221Packet 1(ABSOLUTE POSITION)
222 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
223BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
224 1 |0|1|V|F|1|0|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|u|d|X|X|Y|Y|
225 |---------------| |---------------| |---------------| |---------------|
226
227Byte 1: Bit7~Bit6 => 00, Normal data packet
228 => 01, Absolute coordinates packet
229 => 10, Notify packet
230 Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up.
231 When both fingers are up, the last two reports have zero valid
232 bit.
233 Bit4 => finger up/down information. 1: finger down, 0: finger up.
234 Bit3 => 1
235 Bit2 => finger index, 0 is the first finger, 1 is the second finger.
236 Bit1 => Right Button, 1 is pressed, 0 is not pressed.
237 Bit0 => Left Button, 1 is pressed, 0 is not pressed.
238Byte 2: X coordinate (xpos[9:2])
239Byte 3: Y coordinate (ypos[9:2])
240Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0])
241 Bit3~Bit2 => X coordinate (ypos[1:0])
242 Bit4 => scroll down button
243 Bit5 => scroll up button
244 Bit6 => scroll left button
245 Bit7 => scroll right button
246
247Packet 2 (ABSOLUTE POSITION)
248 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
249BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
250 1 |0|1|V|F|1|1|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|u|d|X|X|Y|Y|
251 |---------------| |---------------| |---------------| |---------------|
252
253Byte 1: Bit7~Bit6 => 00, Normal data packet
254 => 01, Absolute coordination packet
255 => 10, Notify packet
256 Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up.
257 When both fingers are up, the last two reports have zero valid
258 bit.
259 Bit4 => finger up/down information. 1: finger down, 0: finger up.
260 Bit3 => 1
261 Bit2 => finger index, 0 is the first finger, 1 is the second finger.
262 Bit1 => Right Button, 1 is pressed, 0 is not pressed.
263 Bit0 => Left Button, 1 is pressed, 0 is not pressed.
264Byte 2: X coordinate (xpos[9:2])
265Byte 3: Y coordinate (ypos[9:2])
266Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0])
267 Bit3~Bit2 => X coordinate (ypos[1:0])
268 Bit4 => scroll down button
269 Bit5 => scroll up button
270 Bit6 => scroll left button
271 Bit7 => scroll right button
272
273Notify Packet for STL3888-B0
274 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
275BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
276 1 |1|0|1|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|u|d|0|0|0|0|
277 |---------------| |---------------| |---------------| |---------------|
278
279Byte 1: Bit7~Bit6 => 00, Normal data packet
280 => 01, Absolute coordination packet
281 => 10, Notify packet
282 Bit5 => 1
283 Bit4 => when in absolute coordinate mode (valid when EN_PKT_GO is 1):
284 0: left button is generated by the on-pad command
285 1: left button is generated by the external button
286 Bit3 => 1
287 Bit2 => Middle Button, 1 is pressed, 0 is not pressed.
288 Bit1 => Right Button, 1 is pressed, 0 is not pressed.
289 Bit0 => Left Button, 1 is pressed, 0 is not pressed.
290Byte 2: Message Type => 0xB7 (Multi Finger, Multi Coordinate mode)
291Byte 3: Bit7~Bit6 => Don't care
292 Bit5~Bit4 => Number of fingers
293 Bit3~Bit1 => Reserved
294 Bit0 => 1: enter gesture mode; 0: leaving gesture mode
295Byte 4: Bit7 => scroll right button
296 Bit6 => scroll left button
297 Bit5 => scroll up button
298 Bit4 => scroll down button
299 * Note that if gesture and additional button(Bit4~Bit7)
300 happen at the same time, the button information will not
301 be sent.
211 Bit3~Bit0 => Reserved 302 Bit3~Bit0 => Reserved
212 303
213Sample sequence of Multi-finger, Multi-coordinate mode: 304Sample sequence of Multi-finger, Multi-coordinate mode:
214 305
215 notify packet (valid bit == 1), abs pkt 1, abs pkt 2, abs pkt 1, 306 notify packet (valid bit == 1), abs pkt 1, abs pkt 2, abs pkt 1,
216 abs pkt 2, ..., notify packet(valid bit == 0) 307 abs pkt 2, ..., notify packet (valid bit == 0)
217 308
218============================================================================== 309==============================================================================
219* FSP Enable/Disable packet 310* FSP Enable/Disable packet
@@ -409,7 +500,8 @@ offset width default r/w name
409 0: read only, 1: read/write enable 500 0: read only, 1: read/write enable
410 (Note that following registers does not require clock gating being 501 (Note that following registers does not require clock gating being
411 enabled prior to write: 05 06 07 08 09 0c 0f 10 11 12 16 17 18 23 2e 502 enabled prior to write: 05 06 07 08 09 0c 0f 10 11 12 16 17 18 23 2e
412 40 41 42 43.) 503 40 41 42 43. In addition to that, this bit must be 1 when gesture
504 mode is enabled)
413 505
4140x31 RW on-pad command detection 5060x31 RW on-pad command detection
415 bit7 0 RW on-pad command left button down tag 507 bit7 0 RW on-pad command left button down tag
@@ -463,6 +555,10 @@ offset width default r/w name
463 absolute coordinates; otherwise, host only receives packets with 555 absolute coordinates; otherwise, host only receives packets with
464 relative coordinate.) 556 relative coordinate.)
465 557
558 bit7 0 RW EN_PS2_F2: PS/2 gesture mode 2nd
559 finger packet enable
560 0: disable, 1: enable
561
4660x43 RW on-pad control 5620x43 RW on-pad control
467 bit0 0 RW on-pad control enable 563 bit0 0 RW on-pad control enable
468 0: disable, 1: enable 564 0: disable, 1: enable
diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt
index f40a1f030019..5dc59b04a71f 100644
--- a/Documentation/intel_txt.txt
+++ b/Documentation/intel_txt.txt
@@ -126,7 +126,7 @@ o Tboot then applies an (optional) user-defined launch policy to
126o Tboot adjusts the e820 table provided by the bootloader to reserve 126o Tboot adjusts the e820 table provided by the bootloader to reserve
127 its own location in memory as well as to reserve certain other 127 its own location in memory as well as to reserve certain other
128 TXT-related regions. 128 TXT-related regions.
129o As part of it's launch, tboot DMA protects all of RAM (using the 129o As part of its launch, tboot DMA protects all of RAM (using the
130 VT-d PMRs). Thus, the kernel must be booted with 'intel_iommu=on' 130 VT-d PMRs). Thus, the kernel must be booted with 'intel_iommu=on'
131 in order to remove this blanket protection and use VT-d's 131 in order to remove this blanket protection and use VT-d's
132 page-level protection. 132 page-level protection.
@@ -161,13 +161,15 @@ o In order to put a system into any of the sleep states after a TXT
161 has been restored, it will restore the TPM PCRs and then 161 has been restored, it will restore the TPM PCRs and then
162 transfer control back to the kernel's S3 resume vector. 162 transfer control back to the kernel's S3 resume vector.
163 In order to preserve system integrity across S3, the kernel 163 In order to preserve system integrity across S3, the kernel
164 provides tboot with a set of memory ranges (kernel 164 provides tboot with a set of memory ranges (RAM and RESERVED_KERN
165 code/data/bss, S3 resume code, and AP trampoline) that tboot 165 in the e820 table, but not any memory that BIOS might alter over
166 will calculate a MAC (message authentication code) over and then 166 the S3 transition) that tboot will calculate a MAC (message
167 seal with the TPM. On resume and once the measured environment 167 authentication code) over and then seal with the TPM. On resume
168 has been re-established, tboot will re-calculate the MAC and 168 and once the measured environment has been re-established, tboot
169 verify it against the sealed value. Tboot's policy determines 169 will re-calculate the MAC and verify it against the sealed value.
170 what happens if the verification fails. 170 Tboot's policy determines what happens if the verification fails.
171 Note that the c/s 194 of tboot which has the new MAC code supports
172 this.
171 173
172That's pretty much it for TXT support. 174That's pretty much it for TXT support.
173 175
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 35cf64d4436d..dd5806f4fcc4 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -139,7 +139,6 @@ Code Seq#(hex) Include File Comments
139'K' all linux/kd.h 139'K' all linux/kd.h
140'L' 00-1F linux/loop.h conflict! 140'L' 00-1F linux/loop.h conflict!
141'L' 10-1F drivers/scsi/mpt2sas/mpt2sas_ctl.h conflict! 141'L' 10-1F drivers/scsi/mpt2sas/mpt2sas_ctl.h conflict!
142'L' 20-2F linux/usb/vstusb.h
143'L' E0-FF linux/ppdd.h encrypted disk device driver 142'L' E0-FF linux/ppdd.h encrypted disk device driver
144 <http://linux01.gwdg.de/~alatham/ppdd.html> 143 <http://linux01.gwdg.de/~alatham/ppdd.html>
145'M' all linux/soundcard.h conflict! 144'M' all linux/soundcard.h conflict!
@@ -292,6 +291,7 @@ Code Seq#(hex) Include File Comments
2920x92 00-0F drivers/usb/mon/mon_bin.c 2910x92 00-0F drivers/usb/mon/mon_bin.c
2930x93 60-7F linux/auto_fs.h 2920x93 60-7F linux/auto_fs.h
2940x94 all fs/btrfs/ioctl.h 2930x94 all fs/btrfs/ioctl.h
2940x97 00-7F fs/ceph/ioctl.h Ceph file system
2950x99 00-0F 537-Addinboard driver 2950x99 00-0F 537-Addinboard driver
296 <mailto:buk@buks.ipn.de> 296 <mailto:buk@buks.ipn.de>
2970xA0 all linux/sdp/sdp.h Industrial Device Project 2970xA0 all linux/sdp/sdp.h Industrial Device Project
diff --git a/Documentation/isdn/INTERFACE.CAPI b/Documentation/isdn/INTERFACE.CAPI
index 5fe8de5cc727..f172091fb7cd 100644
--- a/Documentation/isdn/INTERFACE.CAPI
+++ b/Documentation/isdn/INTERFACE.CAPI
@@ -149,10 +149,11 @@ char *(*procinfo)(struct capi_ctr *ctrlr)
149 pointer to a callback function returning the entry for the device in 149 pointer to a callback function returning the entry for the device in
150 the CAPI controller info table, /proc/capi/controller 150 the CAPI controller info table, /proc/capi/controller
151 151
152read_proc_t *ctr_read_proc 152const struct file_operations *proc_fops
153 pointer to the read_proc callback function for the device's proc file 153 pointers to callback functions for the device's proc file
154 system entry, /proc/capi/controllers/<n>; will be called with a 154 system entry, /proc/capi/controllers/<n>; pointer to the device's
155 pointer to the device's capi_ctr structure as the last (data) argument 155 capi_ctr structure is available from struct proc_dir_entry::data
156 which is available from struct inode.
156 157
157Note: Callback functions except send_message() are never called in interrupt 158Note: Callback functions except send_message() are never called in interrupt
158context. 159context.
diff --git a/Documentation/isdn/README.gigaset b/Documentation/isdn/README.gigaset
index 794941fc9493..e472df842323 100644
--- a/Documentation/isdn/README.gigaset
+++ b/Documentation/isdn/README.gigaset
@@ -292,10 +292,10 @@ GigaSet 307x Device Driver
292 to /etc/modprobe.d/gigaset, /etc/modprobe.conf.local or a similar file. 292 to /etc/modprobe.d/gigaset, /etc/modprobe.conf.local or a similar file.
293 293
294 Problem: 294 Problem:
295 Your isdn script aborts with a message about isdnlog. 295 The isdnlog program emits error messages or just doesn't work.
296 Solution: 296 Solution:
297 Try deactivating (or commenting out) isdnlog. This driver does not 297 Isdnlog supports only the HiSax driver. Do not attempt to use it with
298 support it. 298 other drivers such as Gigaset.
299 299
300 Problem: 300 Problem:
301 You have two or more DECT data adapters (M101/M105) and only the 301 You have two or more DECT data adapters (M101/M105) and only the
@@ -321,8 +321,8 @@ GigaSet 307x Device Driver
321 writing an appropriate value to /sys/module/gigaset/parameters/debug, e.g. 321 writing an appropriate value to /sys/module/gigaset/parameters/debug, e.g.
322 echo 0 > /sys/module/gigaset/parameters/debug 322 echo 0 > /sys/module/gigaset/parameters/debug
323 switches off debugging output completely, 323 switches off debugging output completely,
324 echo 0x10a020 > /sys/module/gigaset/parameters/debug 324 echo 0x302020 > /sys/module/gigaset/parameters/debug
325 enables the standard set of debugging output messages. These values are 325 enables a reasonable set of debugging output messages. These values are
326 bit patterns where every bit controls a certain type of debugging output. 326 bit patterns where every bit controls a certain type of debugging output.
327 See the constants DEBUG_* in the source file gigaset.h for details. 327 See the constants DEBUG_* in the source file gigaset.h for details.
328 328
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt
index c412c245848f..b472e4e0ba67 100644
--- a/Documentation/kbuild/kconfig-language.txt
+++ b/Documentation/kbuild/kconfig-language.txt
@@ -181,7 +181,7 @@ Expressions are listed in decreasing order of precedence.
181(7) Returns the result of max(/expr/, /expr/). 181(7) Returns the result of max(/expr/, /expr/).
182 182
183An expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2 183An expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2
184respectively for calculations). A menu entry becomes visible when it's 184respectively for calculations). A menu entry becomes visible when its
185expression evaluates to 'm' or 'y'. 185expression evaluates to 'm' or 'y'.
186 186
187There are two types of symbols: constant and non-constant symbols. 187There are two types of symbols: constant and non-constant symbols.
diff --git a/Documentation/kbuild/kconfig.txt b/Documentation/kbuild/kconfig.txt
index 49efae703979..b2cb16ebcb16 100644
--- a/Documentation/kbuild/kconfig.txt
+++ b/Documentation/kbuild/kconfig.txt
@@ -96,7 +96,7 @@ Environment variables for 'silentoldconfig'
96KCONFIG_NOSILENTUPDATE 96KCONFIG_NOSILENTUPDATE
97-------------------------------------------------- 97--------------------------------------------------
98If this variable has a non-blank value, it prevents silent kernel 98If this variable has a non-blank value, it prevents silent kernel
99config udpates (requires explicit updates). 99config updates (requires explicit updates).
100 100
101KCONFIG_AUTOCONFIG 101KCONFIG_AUTOCONFIG
102-------------------------------------------------- 102--------------------------------------------------
diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt
index 28cdc2af2131..ec8d31ee12e0 100644
--- a/Documentation/kernel-docs.txt
+++ b/Documentation/kernel-docs.txt
@@ -116,7 +116,7 @@
116 Author: Ingo Molnar, Gadi Oxman and Miguel de Icaza. 116 Author: Ingo Molnar, Gadi Oxman and Miguel de Icaza.
117 URL: http://www.linuxjournal.com/article.php?sid=2391 117 URL: http://www.linuxjournal.com/article.php?sid=2391
118 Keywords: RAID, MD driver. 118 Keywords: RAID, MD driver.
119 Description: Linux Journal Kernel Korner article. Here is it's 119 Description: Linux Journal Kernel Korner article. Here is its
120 abstract: "A description of the implementation of the RAID-1, 120 abstract: "A description of the implementation of the RAID-1,
121 RAID-4 and RAID-5 personalities of the MD device driver in the 121 RAID-4 and RAID-5 personalities of the MD device driver in the
122 Linux kernel, providing users with high performance and reliable, 122 Linux kernel, providing users with high performance and reliable,
@@ -127,7 +127,7 @@
127 URL: http://www.linuxjournal.com/article.php?sid=1219 127 URL: http://www.linuxjournal.com/article.php?sid=1219
128 Keywords: device driver, module, loading/unloading modules, 128 Keywords: device driver, module, loading/unloading modules,
129 allocating resources. 129 allocating resources.
130 Description: Linux Journal Kernel Korner article. Here is it's 130 Description: Linux Journal Kernel Korner article. Here is its
131 abstract: "This is the first of a series of four articles 131 abstract: "This is the first of a series of four articles
132 co-authored by Alessandro Rubini and Georg Zezchwitz which present 132 co-authored by Alessandro Rubini and Georg Zezchwitz which present
133 a practical approach to writing Linux device drivers as kernel 133 a practical approach to writing Linux device drivers as kernel
@@ -141,7 +141,7 @@
141 Keywords: character driver, init_module, clean_up module, 141 Keywords: character driver, init_module, clean_up module,
142 autodetection, mayor number, minor number, file operations, 142 autodetection, mayor number, minor number, file operations,
143 open(), close(). 143 open(), close().
144 Description: Linux Journal Kernel Korner article. Here is it's 144 Description: Linux Journal Kernel Korner article. Here is its
145 abstract: "This article, the second of four, introduces part of 145 abstract: "This article, the second of four, introduces part of
146 the actual code to create custom module implementing a character 146 the actual code to create custom module implementing a character
147 device driver. It describes the code for module initialization and 147 device driver. It describes the code for module initialization and
@@ -152,7 +152,7 @@
152 URL: http://www.linuxjournal.com/article.php?sid=1221 152 URL: http://www.linuxjournal.com/article.php?sid=1221
153 Keywords: read(), write(), select(), ioctl(), blocking/non 153 Keywords: read(), write(), select(), ioctl(), blocking/non
154 blocking mode, interrupt handler. 154 blocking mode, interrupt handler.
155 Description: Linux Journal Kernel Korner article. Here is it's 155 Description: Linux Journal Kernel Korner article. Here is its
156 abstract: "This article, the third of four on writing character 156 abstract: "This article, the third of four on writing character
157 device drivers, introduces concepts of reading, writing, and using 157 device drivers, introduces concepts of reading, writing, and using
158 ioctl-calls". 158 ioctl-calls".
@@ -161,7 +161,7 @@
161 Author: Alessandro Rubini and Georg v. Zezschwitz. 161 Author: Alessandro Rubini and Georg v. Zezschwitz.
162 URL: http://www.linuxjournal.com/article.php?sid=1222 162 URL: http://www.linuxjournal.com/article.php?sid=1222
163 Keywords: interrupts, irqs, DMA, bottom halves, task queues. 163 Keywords: interrupts, irqs, DMA, bottom halves, task queues.
164 Description: Linux Journal Kernel Korner article. Here is it's 164 Description: Linux Journal Kernel Korner article. Here is its
165 abstract: "This is the fourth in a series of articles about 165 abstract: "This is the fourth in a series of articles about
166 writing character device drivers as loadable kernel modules. This 166 writing character device drivers as loadable kernel modules. This
167 month, we further investigate the field of interrupt handling. 167 month, we further investigate the field of interrupt handling.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e7848a0d99eb..b56ea860da21 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -54,9 +54,11 @@ parameter is applicable:
54 IMA Integrity measurement architecture is enabled. 54 IMA Integrity measurement architecture is enabled.
55 IOSCHED More than one I/O scheduler is enabled. 55 IOSCHED More than one I/O scheduler is enabled.
56 IP_PNP IP DHCP, BOOTP, or RARP is enabled. 56 IP_PNP IP DHCP, BOOTP, or RARP is enabled.
57 IPV6 IPv6 support is enabled.
57 ISAPNP ISA PnP code is enabled. 58 ISAPNP ISA PnP code is enabled.
58 ISDN Appropriate ISDN support is enabled. 59 ISDN Appropriate ISDN support is enabled.
59 JOY Appropriate joystick support is enabled. 60 JOY Appropriate joystick support is enabled.
61 KGDB Kernel debugger support is enabled.
60 KVM Kernel Virtual Machine support is enabled. 62 KVM Kernel Virtual Machine support is enabled.
61 LIBATA Libata driver is enabled 63 LIBATA Libata driver is enabled
62 LP Printer support is enabled. 64 LP Printer support is enabled.
@@ -98,6 +100,7 @@ parameter is applicable:
98 SWSUSP Software suspend (hibernation) is enabled. 100 SWSUSP Software suspend (hibernation) is enabled.
99 SUSPEND System suspend states are enabled. 101 SUSPEND System suspend states are enabled.
100 FTRACE Function tracing enabled. 102 FTRACE Function tracing enabled.
103 TPM TPM drivers are enabled.
101 TS Appropriate touchscreen support is enabled. 104 TS Appropriate touchscreen support is enabled.
102 UMS USB Mass Storage support is enabled. 105 UMS USB Mass Storage support is enabled.
103 USB USB support is enabled. 106 USB USB support is enabled.
@@ -150,6 +153,7 @@ and is between 256 and 4096 characters. It is defined in the file
150 strict -- Be less tolerant of platforms that are not 153 strict -- Be less tolerant of platforms that are not
151 strictly ACPI specification compliant. 154 strictly ACPI specification compliant.
152 rsdt -- prefer RSDT over (default) XSDT 155 rsdt -- prefer RSDT over (default) XSDT
156 copy_dsdt -- copy DSDT to memory
153 157
154 See also Documentation/power/pm.txt, pci=noacpi 158 See also Documentation/power/pm.txt, pci=noacpi
155 159
@@ -199,10 +203,6 @@ and is between 256 and 4096 characters. It is defined in the file
199 acpi_display_output=video 203 acpi_display_output=video
200 See above. 204 See above.
201 205
202 acpi_early_pdc_eval [HW,ACPI] Evaluate processor _PDC methods
203 early. Needed on some platforms to properly
204 initialize the EC.
205
206 acpi_irq_balance [HW,ACPI] 206 acpi_irq_balance [HW,ACPI]
207 ACPI will balance active IRQs 207 ACPI will balance active IRQs
208 default in APIC mode 208 default in APIC mode
@@ -290,9 +290,6 @@ and is between 256 and 4096 characters. It is defined in the file
290 advansys= [HW,SCSI] 290 advansys= [HW,SCSI]
291 See header of drivers/scsi/advansys.c. 291 See header of drivers/scsi/advansys.c.
292 292
293 advwdt= [HW,WDT] Advantech WDT
294 Format: <iostart>,<iostop>
295
296 aedsp16= [HW,OSS] Audio Excel DSP 16 293 aedsp16= [HW,OSS] Audio Excel DSP 16
297 Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq> 294 Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq>
298 See also header of sound/oss/aedsp16.c. 295 See also header of sound/oss/aedsp16.c.
@@ -323,15 +320,12 @@ and is between 256 and 4096 characters. It is defined in the file
323 amd_iommu= [HW,X86-84] 320 amd_iommu= [HW,X86-84]
324 Pass parameters to the AMD IOMMU driver in the system. 321 Pass parameters to the AMD IOMMU driver in the system.
325 Possible values are: 322 Possible values are:
326 isolate - enable device isolation (each device, as far
327 as possible, will get its own protection
328 domain) [default]
329 share - put every device behind one IOMMU into the
330 same protection domain
331 fullflush - enable flushing of IO/TLB entries when 323 fullflush - enable flushing of IO/TLB entries when
332 they are unmapped. Otherwise they are 324 they are unmapped. Otherwise they are
333 flushed before they will be reused, which 325 flushed before they will be reused, which
334 is a lot of faster 326 is a lot of faster
327 off - do not initialize any AMD IOMMU found in
328 the system
335 329
336 amijoy.map= [HW,JOY] Amiga joystick support 330 amijoy.map= [HW,JOY] Amiga joystick support
337 Map of devices attached to JOY0DAT and JOY1DAT 331 Map of devices attached to JOY0DAT and JOY1DAT
@@ -356,6 +350,9 @@ and is between 256 and 4096 characters. It is defined in the file
356 Change the amount of debugging information output 350 Change the amount of debugging information output
357 when initialising the APIC and IO-APIC components. 351 when initialising the APIC and IO-APIC components.
358 352
353 autoconf= [IPV6]
354 See Documentation/networking/ipv6.txt.
355
359 show_lapic= [APIC,X86] Advanced Programmable Interrupt Controller 356 show_lapic= [APIC,X86] Advanced Programmable Interrupt Controller
360 Limit apic dumping. The parameter defines the maximal 357 Limit apic dumping. The parameter defines the maximal
361 number of local apics being dumped. Also it is possible 358 number of local apics being dumped. Also it is possible
@@ -638,6 +635,12 @@ and is between 256 and 4096 characters. It is defined in the file
638 See drivers/char/README.epca and 635 See drivers/char/README.epca and
639 Documentation/serial/digiepca.txt. 636 Documentation/serial/digiepca.txt.
640 637
638 disable= [IPV6]
639 See Documentation/networking/ipv6.txt.
640
641 disable_ipv6= [IPV6]
642 See Documentation/networking/ipv6.txt.
643
641 disable_mtrr_cleanup [X86] 644 disable_mtrr_cleanup [X86]
642 The kernel tries to adjust MTRR layout from continuous 645 The kernel tries to adjust MTRR layout from continuous
643 to discrete, to make X server driver able to add WB 646 to discrete, to make X server driver able to add WB
@@ -707,6 +710,12 @@ and is between 256 and 4096 characters. It is defined in the file
707 The VGA output is eventually overwritten by the real 710 The VGA output is eventually overwritten by the real
708 console. 711 console.
709 712
713 ekgdboc= [X86,KGDB] Allow early kernel console debugging
714 ekgdboc=kbd
715
716 This is desgined to be used in conjunction with
717 the boot argument: earlyprintk=vga
718
710 eata= [HW,SCSI] 719 eata= [HW,SCSI]
711 720
712 edd= [EDD] 721 edd= [EDD]
@@ -753,9 +762,6 @@ and is between 256 and 4096 characters. It is defined in the file
753 This option is obsoleted by the "netdev=" option, which 762 This option is obsoleted by the "netdev=" option, which
754 has equivalent usage. See its documentation for details. 763 has equivalent usage. See its documentation for details.
755 764
756 eurwdt= [HW,WDT] Eurotech CPU-1220/1410 onboard watchdog.
757 Format: <io>[,<irq>]
758
759 failslab= 765 failslab=
760 fail_page_alloc= 766 fail_page_alloc=
761 fail_make_request=[KNL] 767 fail_make_request=[KNL]
@@ -783,8 +789,12 @@ and is between 256 and 4096 characters. It is defined in the file
783 as early as possible in order to facilitate early 789 as early as possible in order to facilitate early
784 boot debugging. 790 boot debugging.
785 791
786 ftrace_dump_on_oops 792 ftrace_dump_on_oops[=orig_cpu]
787 [FTRACE] will dump the trace buffers on oops. 793 [FTRACE] will dump the trace buffers on oops.
794 If no parameter is passed, ftrace will dump
795 buffers of all CPUs, but if you pass orig_cpu, it will
796 dump only the buffer of the CPU that triggered the
797 oops.
788 798
789 ftrace_filter=[function-list] 799 ftrace_filter=[function-list]
790 [FTRACE] Limit the functions traced by the function 800 [FTRACE] Limit the functions traced by the function
@@ -1111,10 +1121,26 @@ and is between 256 and 4096 characters. It is defined in the file
1111 use the HighMem zone if it exists, and the Normal 1121 use the HighMem zone if it exists, and the Normal
1112 zone if it does not. 1122 zone if it does not.
1113 1123
1114 kgdboc= [HW] kgdb over consoles. 1124 kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port.
1115 Requires a tty driver that supports console polling. 1125 Format: <Controller#>[,poll interval]
1116 (only serial supported for now) 1126 The controller # is the number of the ehci usb debug
1117 Format: <serial_device>[,baud] 1127 port as it is probed via PCI. The poll interval is
1128 optional and is the number seconds in between
1129 each poll cycle to the debug port in case you need
1130 the functionality for interrupting the kernel with
1131 gdb or control-c on the dbgp connection. When
1132 not using this parameter you use sysrq-g to break into
1133 the kernel debugger.
1134
1135 kgdboc= [KGDB,HW] kgdb over consoles.
1136 Requires a tty driver that supports console polling,
1137 or a supported polling keyboard driver (non-usb).
1138 Serial only format: <serial_device>[,baud]
1139 keyboard only format: kbd
1140 keyboard and serial format: kbd,<serial_device>[,baud]
1141
1142 kgdbwait [KGDB] Stop kernel execution and enter the
1143 kernel debugger at the earliest opportunity.
1118 1144
1119 kmac= [MIPS] korina ethernet MAC address. 1145 kmac= [MIPS] korina ethernet MAC address.
1120 Configure the RouterBoard 532 series on-chip 1146 Configure the RouterBoard 532 series on-chip
@@ -1193,7 +1219,7 @@ and is between 256 and 4096 characters. It is defined in the file
1193 1219
1194 libata.force= [LIBATA] Force configurations. The format is comma 1220 libata.force= [LIBATA] Force configurations. The format is comma
1195 separated list of "[ID:]VAL" where ID is 1221 separated list of "[ID:]VAL" where ID is
1196 PORT[:DEVICE]. PORT and DEVICE are decimal numbers 1222 PORT[.DEVICE]. PORT and DEVICE are decimal numbers
1197 matching port, link or device. Basically, it matches 1223 matching port, link or device. Basically, it matches
1198 the ATA ID string printed on console by libata. If 1224 the ATA ID string printed on console by libata. If
1199 the whole ID part is omitted, the last PORT and DEVICE 1225 the whole ID part is omitted, the last PORT and DEVICE
@@ -1738,6 +1764,9 @@ and is between 256 and 4096 characters. It is defined in the file
1738 nomfgpt [X86-32] Disable Multi-Function General Purpose 1764 nomfgpt [X86-32] Disable Multi-Function General Purpose
1739 Timer usage (for AMD Geode machines). 1765 Timer usage (for AMD Geode machines).
1740 1766
1767 nopat [X86] Disable PAT (page attribute table extension of
1768 pagetables) support.
1769
1741 norandmaps Don't use address space randomization. Equivalent to 1770 norandmaps Don't use address space randomization. Equivalent to
1742 echo 0 > /proc/sys/kernel/randomize_va_space 1771 echo 0 > /proc/sys/kernel/randomize_va_space
1743 1772
@@ -1781,6 +1810,12 @@ and is between 256 and 4096 characters. It is defined in the file
1781 purges which is reported from either PAL_VM_SUMMARY or 1810 purges which is reported from either PAL_VM_SUMMARY or
1782 SAL PALO. 1811 SAL PALO.
1783 1812
1813 nr_cpus= [SMP] Maximum number of processors that an SMP kernel
1814 could support. nr_cpus=n : n >= 1 limits the kernel to
1815 supporting 'n' processors. Later in runtime you can not
1816 use hotplug cpu feature to put more cpu back to online.
1817 just like you compile the kernel NR_CPUS=n
1818
1784 nr_uarts= [SERIAL] maximum number of UARTs to be registered. 1819 nr_uarts= [SERIAL] maximum number of UARTs to be registered.
1785 1820
1786 numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. 1821 numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
@@ -1948,8 +1983,12 @@ and is between 256 and 4096 characters. It is defined in the file
1948 IRQ routing is enabled. 1983 IRQ routing is enabled.
1949 noacpi [X86] Do not use ACPI for IRQ routing 1984 noacpi [X86] Do not use ACPI for IRQ routing
1950 or for PCI scanning. 1985 or for PCI scanning.
1951 use_crs [X86] Use _CRS for PCI resource 1986 use_crs [X86] Use PCI host bridge window information
1952 allocation. 1987 from ACPI. On BIOSes from 2008 or later, this
1988 is enabled by default. If you need to use this,
1989 please report a bug.
1990 nocrs [X86] Ignore PCI host bridge windows from ACPI.
1991 If you need to use this, please report a bug.
1953 routeirq Do IRQ routing for all PCI devices. 1992 routeirq Do IRQ routing for all PCI devices.
1954 This is normally done in pci_enable_device(), 1993 This is normally done in pci_enable_device(),
1955 so this option is a temporary workaround 1994 so this option is a temporary workaround
@@ -1998,6 +2037,14 @@ and is between 256 and 4096 characters. It is defined in the file
1998 force Enable ASPM even on devices that claim not to support it. 2037 force Enable ASPM even on devices that claim not to support it.
1999 WARNING: Forcing ASPM on may cause system lockups. 2038 WARNING: Forcing ASPM on may cause system lockups.
2000 2039
2040 pcie_pme= [PCIE,PM] Native PCIe PME signaling options:
2041 off Do not use native PCIe PME signaling.
2042 force Use native PCIe PME signaling even if the BIOS refuses
2043 to allow the kernel to control the relevant PCIe config
2044 registers.
2045 nomsi Do not use MSI for native PCIe PME signaling (this makes
2046 all PCIe root ports use INTx for everything).
2047
2001 pcmv= [HW,PCMCIA] BadgePAD 4 2048 pcmv= [HW,PCMCIA] BadgePAD 4
2002 2049
2003 pd. [PARIDE] 2050 pd. [PARIDE]
@@ -2214,9 +2261,6 @@ and is between 256 and 4096 characters. It is defined in the file
2214 2261
2215 sched_debug [KNL] Enables verbose scheduler debug messages. 2262 sched_debug [KNL] Enables verbose scheduler debug messages.
2216 2263
2217 sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver
2218 Format: <io>[,<timeout>[,<isapnp>]]
2219
2220 scsi_debug_*= [SCSI] 2264 scsi_debug_*= [SCSI]
2221 See drivers/scsi/scsi_debug.c. 2265 See drivers/scsi/scsi_debug.c.
2222 2266
@@ -2588,6 +2632,15 @@ and is between 256 and 4096 characters. It is defined in the file
2588 2632
2589 tp720= [HW,PS2] 2633 tp720= [HW,PS2]
2590 2634
2635 tpm_suspend_pcr=[HW,TPM]
2636 Format: integer pcr id
2637 Specify that at suspend time, the tpm driver
2638 should extend the specified pcr with zeros,
2639 as a workaround for some chips which fail to
2640 flush the last written pcr on TPM_SaveState.
2641 This will guarantee that all the other pcrs
2642 are saved.
2643
2591 trace_buf_size=nn[KMG] 2644 trace_buf_size=nn[KMG]
2592 [FTRACE] will set tracing buffer size. 2645 [FTRACE] will set tracing buffer size.
2593 2646
@@ -2703,6 +2756,13 @@ and is between 256 and 4096 characters. It is defined in the file
2703 medium is write-protected). 2756 medium is write-protected).
2704 Example: quirks=0419:aaf5:rl,0421:0433:rc 2757 Example: quirks=0419:aaf5:rl,0421:0433:rc
2705 2758
2759 userpte=
2760 [X86] Flags controlling user PTE allocations.
2761
2762 nohigh = do not allocate PTE pages in
2763 HIGHMEM regardless of setting
2764 of CONFIG_HIGHPTE.
2765
2706 vdso= [X86,SH] 2766 vdso= [X86,SH]
2707 vdso=2: enable compat VDSO (default with COMPAT_VDSO) 2767 vdso=2: enable compat VDSO (default with COMPAT_VDSO)
2708 vdso=1: enable VDSO (default) 2768 vdso=1: enable VDSO (default)
@@ -2789,13 +2849,21 @@ and is between 256 and 4096 characters. It is defined in the file
2789 wd7000= [HW,SCSI] 2849 wd7000= [HW,SCSI]
2790 See header of drivers/scsi/wd7000.c. 2850 See header of drivers/scsi/wd7000.c.
2791 2851
2792 wdt= [WDT] Watchdog 2852 watchdog timers [HW,WDT] For information on watchdog timers,
2793 See Documentation/watchdog/wdt.txt. 2853 see Documentation/watchdog/watchdog-parameters.txt
2854 or other driver-specific files in the
2855 Documentation/watchdog/ directory.
2794 2856
2795 x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of 2857 x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
2796 default x2apic cluster mode on platforms 2858 default x2apic cluster mode on platforms
2797 supporting x2apic. 2859 supporting x2apic.
2798 2860
2861 x86_mrst_timer= [X86-32,APBT]
2862 Choose timer option for x86 Moorestown MID platform.
2863 Two valid options are apbt timer only and lapic timer
2864 plus one apbt timer for broadcast timer.
2865 x86_mrst_timer=apbt_only | lapic_and_apbt
2866
2799 xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. 2867 xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks.
2800 xd_geo= See header of drivers/block/xd.c. 2868 xd_geo= See header of drivers/block/xd.c.
2801 2869
diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt
index c79ab996dada..3ab2472509cb 100644
--- a/Documentation/kobject.txt
+++ b/Documentation/kobject.txt
@@ -59,37 +59,56 @@ nice to have in other objects. The C language does not allow for the
59direct expression of inheritance, so other techniques - such as structure 59direct expression of inheritance, so other techniques - such as structure
60embedding - must be used. 60embedding - must be used.
61 61
62So, for example, the UIO code has a structure that defines the memory 62(As an aside, for those familiar with the kernel linked list implementation,
63region associated with a uio device: 63this is analogous as to how "list_head" structs are rarely useful on
64their own, but are invariably found embedded in the larger objects of
65interest.)
64 66
65struct uio_mem { 67So, for example, the UIO code in drivers/uio/uio.c has a structure that
68defines the memory region associated with a uio device:
69
70 struct uio_map {
66 struct kobject kobj; 71 struct kobject kobj;
67 unsigned long addr; 72 struct uio_mem *mem;
68 unsigned long size; 73 };
69 int memtype;
70 void __iomem *internal_addr;
71};
72 74
73If you have a struct uio_mem structure, finding its embedded kobject is 75If you have a struct uio_map structure, finding its embedded kobject is
74just a matter of using the kobj member. Code that works with kobjects will 76just a matter of using the kobj member. Code that works with kobjects will
75often have the opposite problem, however: given a struct kobject pointer, 77often have the opposite problem, however: given a struct kobject pointer,
76what is the pointer to the containing structure? You must avoid tricks 78what is the pointer to the containing structure? You must avoid tricks
77(such as assuming that the kobject is at the beginning of the structure) 79(such as assuming that the kobject is at the beginning of the structure)
78and, instead, use the container_of() macro, found in <linux/kernel.h>: 80and, instead, use the container_of() macro, found in <linux/kernel.h>:
79 81
80 container_of(pointer, type, member) 82 container_of(pointer, type, member)
83
84where:
85
86 * "pointer" is the pointer to the embedded kobject,
87 * "type" is the type of the containing structure, and
88 * "member" is the name of the structure field to which "pointer" points.
89
90The return value from container_of() is a pointer to the corresponding
91container type. So, for example, a pointer "kp" to a struct kobject
92embedded *within* a struct uio_map could be converted to a pointer to the
93*containing* uio_map structure with:
94
95 struct uio_map *u_map = container_of(kp, struct uio_map, kobj);
96
97For convenience, programmers often define a simple macro for "back-casting"
98kobject pointers to the containing type. Exactly this happens in the
99earlier drivers/uio/uio.c, as you can see here:
100
101 struct uio_map {
102 struct kobject kobj;
103 struct uio_mem *mem;
104 };
81 105
82where pointer is the pointer to the embedded kobject, type is the type of 106 #define to_map(map) container_of(map, struct uio_map, kobj)
83the containing structure, and member is the name of the structure field to
84which pointer points. The return value from container_of() is a pointer to
85the given type. So, for example, a pointer "kp" to a struct kobject
86embedded within a struct uio_mem could be converted to a pointer to the
87containing uio_mem structure with:
88 107
89 struct uio_mem *u_mem = container_of(kp, struct uio_mem, kobj); 108where the macro argument "map" is a pointer to the struct kobject in
109question. That macro is subsequently invoked with:
90 110
91Programmers often define a simple macro for "back-casting" kobject pointers 111 struct uio_map *map = to_map(kobj);
92to the containing type.
93 112
94 113
95Initialization of kobjects 114Initialization of kobjects
@@ -266,7 +285,7 @@ kobj_type:
266 285
267 struct kobj_type { 286 struct kobj_type {
268 void (*release)(struct kobject *); 287 void (*release)(struct kobject *);
269 struct sysfs_ops *sysfs_ops; 288 const struct sysfs_ops *sysfs_ops;
270 struct attribute **default_attrs; 289 struct attribute **default_attrs;
271 }; 290 };
272 291
@@ -387,4 +406,5 @@ called, and the objects in the former circle release each other.
387Example code to copy from 406Example code to copy from
388 407
389For a more complete example of using ksets and kobjects properly, see the 408For a more complete example of using ksets and kobjects properly, see the
390sample/kobject/kset-example.c code. 409example programs samples/kobject/{kobject-example.c,kset-example.c},
410which will be built as loadable modules if you select CONFIG_SAMPLE_KOBJECT.
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 053037a1fe6d..6653017680dd 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -1,6 +1,7 @@
1Title : Kernel Probes (Kprobes) 1Title : Kernel Probes (Kprobes)
2Authors : Jim Keniston <jkenisto@us.ibm.com> 2Authors : Jim Keniston <jkenisto@us.ibm.com>
3 : Prasanna S Panchamukhi <prasanna@in.ibm.com> 3 : Prasanna S Panchamukhi <prasanna.panchamukhi@gmail.com>
4 : Masami Hiramatsu <mhiramat@redhat.com>
4 5
5CONTENTS 6CONTENTS
6 7
@@ -15,6 +16,7 @@ CONTENTS
159. Jprobes Example 169. Jprobes Example
1610. Kretprobes Example 1710. Kretprobes Example
17Appendix A: The kprobes debugfs interface 18Appendix A: The kprobes debugfs interface
19Appendix B: The kprobes sysctl interface
18 20
191. Concepts: Kprobes, Jprobes, Return Probes 211. Concepts: Kprobes, Jprobes, Return Probes
20 22
@@ -42,13 +44,13 @@ registration/unregistration of a group of *probes. These functions
42can speed up unregistration process when you have to unregister 44can speed up unregistration process when you have to unregister
43a lot of probes at once. 45a lot of probes at once.
44 46
45The next three subsections explain how the different types of 47The next four subsections explain how the different types of
46probes work. They explain certain things that you'll need to 48probes work and how jump optimization works. They explain certain
47know in order to make the best use of Kprobes -- e.g., the 49things that you'll need to know in order to make the best use of
48difference between a pre_handler and a post_handler, and how 50Kprobes -- e.g., the difference between a pre_handler and
49to use the maxactive and nmissed fields of a kretprobe. But 51a post_handler, and how to use the maxactive and nmissed fields of
50if you're in a hurry to start using Kprobes, you can skip ahead 52a kretprobe. But if you're in a hurry to start using Kprobes, you
51to section 2. 53can skip ahead to section 2.
52 54
531.1 How Does a Kprobe Work? 551.1 How Does a Kprobe Work?
54 56
@@ -161,13 +163,123 @@ In case probed function is entered but there is no kretprobe_instance
161object available, then in addition to incrementing the nmissed count, 163object available, then in addition to incrementing the nmissed count,
162the user entry_handler invocation is also skipped. 164the user entry_handler invocation is also skipped.
163 165
1661.4 How Does Jump Optimization Work?
167
168If your kernel is built with CONFIG_OPTPROBES=y (currently this flag
169is automatically set 'y' on x86/x86-64, non-preemptive kernel) and
170the "debug.kprobes_optimization" kernel parameter is set to 1 (see
171sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump
172instruction instead of a breakpoint instruction at each probepoint.
173
1741.4.1 Init a Kprobe
175
176When a probe is registered, before attempting this optimization,
177Kprobes inserts an ordinary, breakpoint-based kprobe at the specified
178address. So, even if it's not possible to optimize this particular
179probepoint, there'll be a probe there.
180
1811.4.2 Safety Check
182
183Before optimizing a probe, Kprobes performs the following safety checks:
184
185- Kprobes verifies that the region that will be replaced by the jump
186instruction (the "optimized region") lies entirely within one function.
187(A jump instruction is multiple bytes, and so may overlay multiple
188instructions.)
189
190- Kprobes analyzes the entire function and verifies that there is no
191jump into the optimized region. Specifically:
192 - the function contains no indirect jump;
193 - the function contains no instruction that causes an exception (since
194 the fixup code triggered by the exception could jump back into the
195 optimized region -- Kprobes checks the exception tables to verify this);
196 and
197 - there is no near jump to the optimized region (other than to the first
198 byte).
199
200- For each instruction in the optimized region, Kprobes verifies that
201the instruction can be executed out of line.
202
2031.4.3 Preparing Detour Buffer
204
205Next, Kprobes prepares a "detour" buffer, which contains the following
206instruction sequence:
207- code to push the CPU's registers (emulating a breakpoint trap)
208- a call to the trampoline code which calls user's probe handlers.
209- code to restore registers
210- the instructions from the optimized region
211- a jump back to the original execution path.
212
2131.4.4 Pre-optimization
214
215After preparing the detour buffer, Kprobes verifies that none of the
216following situations exist:
217- The probe has either a break_handler (i.e., it's a jprobe) or a
218post_handler.
219- Other instructions in the optimized region are probed.
220- The probe is disabled.
221In any of the above cases, Kprobes won't start optimizing the probe.
222Since these are temporary situations, Kprobes tries to start
223optimizing it again if the situation is changed.
224
225If the kprobe can be optimized, Kprobes enqueues the kprobe to an
226optimizing list, and kicks the kprobe-optimizer workqueue to optimize
227it. If the to-be-optimized probepoint is hit before being optimized,
228Kprobes returns control to the original instruction path by setting
229the CPU's instruction pointer to the copied code in the detour buffer
230-- thus at least avoiding the single-step.
231
2321.4.5 Optimization
233
234The Kprobe-optimizer doesn't insert the jump instruction immediately;
235rather, it calls synchronize_sched() for safety first, because it's
236possible for a CPU to be interrupted in the middle of executing the
237optimized region(*). As you know, synchronize_sched() can ensure
238that all interruptions that were active when synchronize_sched()
239was called are done, but only if CONFIG_PREEMPT=n. So, this version
240of kprobe optimization supports only kernels with CONFIG_PREEMPT=n.(**)
241
242After that, the Kprobe-optimizer calls stop_machine() to replace
243the optimized region with a jump instruction to the detour buffer,
244using text_poke_smp().
245
2461.4.6 Unoptimization
247
248When an optimized kprobe is unregistered, disabled, or blocked by
249another kprobe, it will be unoptimized. If this happens before
250the optimization is complete, the kprobe is just dequeued from the
251optimized list. If the optimization has been done, the jump is
252replaced with the original code (except for an int3 breakpoint in
253the first byte) by using text_poke_smp().
254
255(*)Please imagine that the 2nd instruction is interrupted and then
256the optimizer replaces the 2nd instruction with the jump *address*
257while the interrupt handler is running. When the interrupt
258returns to original address, there is no valid instruction,
259and it causes an unexpected result.
260
261(**)This optimization-safety checking may be replaced with the
262stop-machine method that ksplice uses for supporting a CONFIG_PREEMPT=y
263kernel.
264
265NOTE for geeks:
266The jump optimization changes the kprobe's pre_handler behavior.
267Without optimization, the pre_handler can change the kernel's execution
268path by changing regs->ip and returning 1. However, when the probe
269is optimized, that modification is ignored. Thus, if you want to
270tweak the kernel's execution path, you need to suppress optimization,
271using one of the following techniques:
272- Specify an empty function for the kprobe's post_handler or break_handler.
273 or
274- Execute 'sysctl -w debug.kprobes_optimization=n'
275
1642. Architectures Supported 2762. Architectures Supported
165 277
166Kprobes, jprobes, and return probes are implemented on the following 278Kprobes, jprobes, and return probes are implemented on the following
167architectures: 279architectures:
168 280
169- i386 281- i386 (Supports jump optimization)
170- x86_64 (AMD-64, EM64T) 282- x86_64 (AMD-64, EM64T) (Supports jump optimization)
171- ppc64 283- ppc64
172- ia64 (Does not support probes on instruction slot1.) 284- ia64 (Does not support probes on instruction slot1.)
173- sparc64 (Return probes not yet implemented.) 285- sparc64 (Return probes not yet implemented.)
@@ -214,7 +326,7 @@ occurs during execution of kp->pre_handler or kp->post_handler,
214or during single-stepping of the probed instruction, Kprobes calls 326or during single-stepping of the probed instruction, Kprobes calls
215kp->fault_handler. Any or all handlers can be NULL. If kp->flags 327kp->fault_handler. Any or all handlers can be NULL. If kp->flags
216is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled, 328is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled,
217so, it's handlers aren't hit until calling enable_kprobe(kp). 329so, its handlers aren't hit until calling enable_kprobe(kp).
218 330
219NOTE: 331NOTE:
2201. With the introduction of the "symbol_name" field to struct kprobe, 3321. With the introduction of the "symbol_name" field to struct kprobe,
@@ -389,7 +501,10 @@ the probe which has been registered.
389 501
390Kprobes allows multiple probes at the same address. Currently, 502Kprobes allows multiple probes at the same address. Currently,
391however, there cannot be multiple jprobes on the same function at 503however, there cannot be multiple jprobes on the same function at
392the same time. 504the same time. Also, a probepoint for which there is a jprobe or
505a post_handler cannot be optimized. So if you install a jprobe,
506or a kprobe with a post_handler, at an optimized probepoint, the
507probepoint will be unoptimized automatically.
393 508
394In general, you can install a probe anywhere in the kernel. 509In general, you can install a probe anywhere in the kernel.
395In particular, you can probe interrupt handlers. Known exceptions 510In particular, you can probe interrupt handlers. Known exceptions
@@ -453,6 +568,38 @@ reason, Kprobes doesn't support return probes (or kprobes or jprobes)
453on the x86_64 version of __switch_to(); the registration functions 568on the x86_64 version of __switch_to(); the registration functions
454return -EINVAL. 569return -EINVAL.
455 570
571On x86/x86-64, since the Jump Optimization of Kprobes modifies
572instructions widely, there are some limitations to optimization. To
573explain it, we introduce some terminology. Imagine a 3-instruction
574sequence consisting of a two 2-byte instructions and one 3-byte
575instruction.
576
577 IA
578 |
579[-2][-1][0][1][2][3][4][5][6][7]
580 [ins1][ins2][ ins3 ]
581 [<- DCR ->]
582 [<- JTPR ->]
583
584ins1: 1st Instruction
585ins2: 2nd Instruction
586ins3: 3rd Instruction
587IA: Insertion Address
588JTPR: Jump Target Prohibition Region
589DCR: Detoured Code Region
590
591The instructions in DCR are copied to the out-of-line buffer
592of the kprobe, because the bytes in DCR are replaced by
593a 5-byte jump instruction. So there are several limitations.
594
595a) The instructions in DCR must be relocatable.
596b) The instructions in DCR must not include a call instruction.
597c) JTPR must not be targeted by any jump or call instruction.
598d) DCR must not straddle the border betweeen functions.
599
600Anyway, these limitations are checked by the in-kernel instruction
601decoder, so you don't need to worry about that.
602
4566. Probe Overhead 6036. Probe Overhead
457 604
458On a typical CPU in use in 2005, a kprobe hit takes 0.5 to 1.0 605On a typical CPU in use in 2005, a kprobe hit takes 0.5 to 1.0
@@ -476,6 +623,19 @@ k = 0.49 usec; j = 0.76; r = 0.80; kr = 0.82; jr = 1.07
476ppc64: POWER5 (gr), 1656 MHz (SMT disabled, 1 virtual CPU per physical CPU) 623ppc64: POWER5 (gr), 1656 MHz (SMT disabled, 1 virtual CPU per physical CPU)
477k = 0.77 usec; j = 1.31; r = 1.26; kr = 1.45; jr = 1.99 624k = 0.77 usec; j = 1.31; r = 1.26; kr = 1.45; jr = 1.99
478 625
6266.1 Optimized Probe Overhead
627
628Typically, an optimized kprobe hit takes 0.07 to 0.1 microseconds to
629process. Here are sample overhead figures (in usec) for x86 architectures.
630k = unoptimized kprobe, b = boosted (single-step skipped), o = optimized kprobe,
631r = unoptimized kretprobe, rb = boosted kretprobe, ro = optimized kretprobe.
632
633i386: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips
634k = 0.80 usec; b = 0.33; o = 0.05; r = 1.10; rb = 0.61; ro = 0.33
635
636x86-64: Intel(R) Xeon(R) E5410, 2.33GHz, 4656.90 bogomips
637k = 0.99 usec; b = 0.43; o = 0.06; r = 1.24; rb = 0.68; ro = 0.30
638
4797. TODO 6397. TODO
480 640
481a. SystemTap (http://sourceware.org/systemtap): Provides a simplified 641a. SystemTap (http://sourceware.org/systemtap): Provides a simplified
@@ -523,7 +683,8 @@ is also specified. Following columns show probe status. If the probe is on
523a virtual address that is no longer valid (module init sections, module 683a virtual address that is no longer valid (module init sections, module
524virtual addresses that correspond to modules that've been unloaded), 684virtual addresses that correspond to modules that've been unloaded),
525such probes are marked with [GONE]. If the probe is temporarily disabled, 685such probes are marked with [GONE]. If the probe is temporarily disabled,
526such probes are marked with [DISABLED]. 686such probes are marked with [DISABLED]. If the probe is optimized, it is
687marked with [OPTIMIZED].
527 688
528/sys/kernel/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly. 689/sys/kernel/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly.
529 690
@@ -533,3 +694,19 @@ registered probes will be disarmed, till such time a "1" is echoed to this
533file. Note that this knob just disarms and arms all kprobes and doesn't 694file. Note that this knob just disarms and arms all kprobes and doesn't
534change each probe's disabling state. This means that disabled kprobes (marked 695change each probe's disabling state. This means that disabled kprobes (marked
535[DISABLED]) will be not enabled if you turn ON all kprobes by this knob. 696[DISABLED]) will be not enabled if you turn ON all kprobes by this knob.
697
698
699Appendix B: The kprobes sysctl interface
700
701/proc/sys/debug/kprobes-optimization: Turn kprobes optimization ON/OFF.
702
703When CONFIG_OPTPROBES=y, this sysctl interface appears and it provides
704a knob to globally and forcibly turn jump optimization (see section
7051.4) ON or OFF. By default, jump optimization is allowed (ON).
706If you echo "0" to this file or set "debug.kprobes_optimization" to
7070 via sysctl, all optimized probes will be unoptimized, and any new
708probes registered after that will not be optimized. Note that this
709knob *changes* the optimized state. This means that optimized probes
710(marked [OPTIMIZED]) will be unoptimized ([OPTIMIZED] tag will be
711removed). If the knob is turned on, they will be optimized again.
712
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 2811e452f756..a237518e51b9 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -23,12 +23,12 @@ of a virtual machine. The ioctls belong to three classes
23 Only run vcpu ioctls from the same thread that was used to create the 23 Only run vcpu ioctls from the same thread that was used to create the
24 vcpu. 24 vcpu.
25 25
262. File descritpors 262. File descriptors
27 27
28The kvm API is centered around file descriptors. An initial 28The kvm API is centered around file descriptors. An initial
29open("/dev/kvm") obtains a handle to the kvm subsystem; this handle 29open("/dev/kvm") obtains a handle to the kvm subsystem; this handle
30can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this 30can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this
31handle will create a VM file descripror which can be used to issue VM 31handle will create a VM file descriptor which can be used to issue VM
32ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu 32ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu
33and return a file descriptor pointing to it. Finally, ioctls on a vcpu 33and return a file descriptor pointing to it. Finally, ioctls on a vcpu
34fd can be used to control the vcpu, including the important task of 34fd can be used to control the vcpu, including the important task of
@@ -643,7 +643,7 @@ Type: vm ioctl
643Parameters: struct kvm_clock_data (in) 643Parameters: struct kvm_clock_data (in)
644Returns: 0 on success, -1 on error 644Returns: 0 on success, -1 on error
645 645
646Sets the current timestamp of kvmclock to the valued specific in its parameter. 646Sets the current timestamp of kvmclock to the value specified in its parameter.
647In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios 647In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios
648such as migration. 648such as migration.
649 649
@@ -656,6 +656,7 @@ struct kvm_clock_data {
6564.29 KVM_GET_VCPU_EVENTS 6564.29 KVM_GET_VCPU_EVENTS
657 657
658Capability: KVM_CAP_VCPU_EVENTS 658Capability: KVM_CAP_VCPU_EVENTS
659Extended by: KVM_CAP_INTR_SHADOW
659Architectures: x86 660Architectures: x86
660Type: vm ioctl 661Type: vm ioctl
661Parameters: struct kvm_vcpu_event (out) 662Parameters: struct kvm_vcpu_event (out)
@@ -676,7 +677,7 @@ struct kvm_vcpu_events {
676 __u8 injected; 677 __u8 injected;
677 __u8 nr; 678 __u8 nr;
678 __u8 soft; 679 __u8 soft;
679 __u8 pad; 680 __u8 shadow;
680 } interrupt; 681 } interrupt;
681 struct { 682 struct {
682 __u8 injected; 683 __u8 injected;
@@ -688,9 +689,13 @@ struct kvm_vcpu_events {
688 __u32 flags; 689 __u32 flags;
689}; 690};
690 691
692KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
693interrupt.shadow contains a valid state. Otherwise, this field is undefined.
694
6914.30 KVM_SET_VCPU_EVENTS 6954.30 KVM_SET_VCPU_EVENTS
692 696
693Capability: KVM_CAP_VCPU_EVENTS 697Capability: KVM_CAP_VCPU_EVENTS
698Extended by: KVM_CAP_INTR_SHADOW
694Architectures: x86 699Architectures: x86
695Type: vm ioctl 700Type: vm ioctl
696Parameters: struct kvm_vcpu_event (in) 701Parameters: struct kvm_vcpu_event (in)
@@ -709,6 +714,183 @@ current in-kernel state. The bits are:
709KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel 714KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel
710KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector 715KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector
711 716
717If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in
718the flags field to signal that interrupt.shadow contains a valid state and
719shall be written into the VCPU.
720
7214.32 KVM_GET_DEBUGREGS
722
723Capability: KVM_CAP_DEBUGREGS
724Architectures: x86
725Type: vm ioctl
726Parameters: struct kvm_debugregs (out)
727Returns: 0 on success, -1 on error
728
729Reads debug registers from the vcpu.
730
731struct kvm_debugregs {
732 __u64 db[4];
733 __u64 dr6;
734 __u64 dr7;
735 __u64 flags;
736 __u64 reserved[9];
737};
738
7394.33 KVM_SET_DEBUGREGS
740
741Capability: KVM_CAP_DEBUGREGS
742Architectures: x86
743Type: vm ioctl
744Parameters: struct kvm_debugregs (in)
745Returns: 0 on success, -1 on error
746
747Writes debug registers into the vcpu.
748
749See KVM_GET_DEBUGREGS for the data structure. The flags field is unused
750yet and must be cleared on entry.
751
7524.34 KVM_SET_USER_MEMORY_REGION
753
754Capability: KVM_CAP_USER_MEM
755Architectures: all
756Type: vm ioctl
757Parameters: struct kvm_userspace_memory_region (in)
758Returns: 0 on success, -1 on error
759
760struct kvm_userspace_memory_region {
761 __u32 slot;
762 __u32 flags;
763 __u64 guest_phys_addr;
764 __u64 memory_size; /* bytes */
765 __u64 userspace_addr; /* start of the userspace allocated memory */
766};
767
768/* for kvm_memory_region::flags */
769#define KVM_MEM_LOG_DIRTY_PAGES 1UL
770
771This ioctl allows the user to create or modify a guest physical memory
772slot. When changing an existing slot, it may be moved in the guest
773physical memory space, or its flags may be modified. It may not be
774resized. Slots may not overlap in guest physical address space.
775
776Memory for the region is taken starting at the address denoted by the
777field userspace_addr, which must point at user addressable memory for
778the entire memory slot size. Any object may back this memory, including
779anonymous memory, ordinary files, and hugetlbfs.
780
781It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
782be identical. This allows large pages in the guest to be backed by large
783pages in the host.
784
785The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
786instructs kvm to keep track of writes to memory within the slot. See
787the KVM_GET_DIRTY_LOG ioctl.
788
789When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory
790region are automatically reflected into the guest. For example, an mmap()
791that affects the region will be made visible immediately. Another example
792is madvise(MADV_DROP).
793
794It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl.
795The KVM_SET_MEMORY_REGION does not allow fine grained control over memory
796allocation and is deprecated.
797
7984.35 KVM_SET_TSS_ADDR
799
800Capability: KVM_CAP_SET_TSS_ADDR
801Architectures: x86
802Type: vm ioctl
803Parameters: unsigned long tss_address (in)
804Returns: 0 on success, -1 on error
805
806This ioctl defines the physical address of a three-page region in the guest
807physical address space. The region must be within the first 4GB of the
808guest physical address space and must not conflict with any memory slot
809or any mmio address. The guest may malfunction if it accesses this memory
810region.
811
812This ioctl is required on Intel-based hosts. This is needed on Intel hardware
813because of a quirk in the virtualization implementation (see the internals
814documentation when it pops into existence).
815
8164.36 KVM_ENABLE_CAP
817
818Capability: KVM_CAP_ENABLE_CAP
819Architectures: ppc
820Type: vcpu ioctl
821Parameters: struct kvm_enable_cap (in)
822Returns: 0 on success; -1 on error
823
824+Not all extensions are enabled by default. Using this ioctl the application
825can enable an extension, making it available to the guest.
826
827On systems that do not support this ioctl, it always fails. On systems that
828do support it, it only works for extensions that are supported for enablement.
829
830To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should
831be used.
832
833struct kvm_enable_cap {
834 /* in */
835 __u32 cap;
836
837The capability that is supposed to get enabled.
838
839 __u32 flags;
840
841A bitfield indicating future enhancements. Has to be 0 for now.
842
843 __u64 args[4];
844
845Arguments for enabling a feature. If a feature needs initial values to
846function properly, this is the place to put them.
847
848 __u8 pad[64];
849};
850
8514.37 KVM_GET_MP_STATE
852
853Capability: KVM_CAP_MP_STATE
854Architectures: x86, ia64
855Type: vcpu ioctl
856Parameters: struct kvm_mp_state (out)
857Returns: 0 on success; -1 on error
858
859struct kvm_mp_state {
860 __u32 mp_state;
861};
862
863Returns the vcpu's current "multiprocessing state" (though also valid on
864uniprocessor guests).
865
866Possible values are:
867
868 - KVM_MP_STATE_RUNNABLE: the vcpu is currently running
869 - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP)
870 which has not yet received an INIT signal
871 - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is
872 now ready for a SIPI
873 - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and
874 is waiting for an interrupt
875 - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector
876 accesible via KVM_GET_VCPU_EVENTS)
877
878This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel
879irqchip, the multiprocessing state must be maintained by userspace.
880
8814.38 KVM_SET_MP_STATE
882
883Capability: KVM_CAP_MP_STATE
884Architectures: x86, ia64
885Type: vcpu ioctl
886Parameters: struct kvm_mp_state (in)
887Returns: 0 on success; -1 on error
888
889Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for
890arguments.
891
892This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel
893irqchip, the multiprocessing state must be maintained by userspace.
712 894
7135. The kvm_run structure 8955. The kvm_run structure
714 896
@@ -795,11 +977,11 @@ Unused.
795 __u64 data_offset; /* relative to kvm_run start */ 977 __u64 data_offset; /* relative to kvm_run start */
796 } io; 978 } io;
797 979
798If exit_reason is KVM_EXIT_IO_IN or KVM_EXIT_IO_OUT, then the vcpu has 980If exit_reason is KVM_EXIT_IO, then the vcpu has
799executed a port I/O instruction which could not be satisfied by kvm. 981executed a port I/O instruction which could not be satisfied by kvm.
800data_offset describes where the data is located (KVM_EXIT_IO_OUT) or 982data_offset describes where the data is located (KVM_EXIT_IO_OUT) or
801where kvm expects application code to place the data for the next 983where kvm expects application code to place the data for the next
802KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a patcked array. 984KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array.
803 985
804 struct { 986 struct {
805 struct kvm_debug_exit_arch arch; 987 struct kvm_debug_exit_arch arch;
@@ -815,11 +997,18 @@ Unused.
815 __u8 is_write; 997 __u8 is_write;
816 } mmio; 998 } mmio;
817 999
818If exit_reason is KVM_EXIT_MMIO or KVM_EXIT_IO_OUT, then the vcpu has 1000If exit_reason is KVM_EXIT_MMIO, then the vcpu has
819executed a memory-mapped I/O instruction which could not be satisfied 1001executed a memory-mapped I/O instruction which could not be satisfied
820by kvm. The 'data' member contains the written data if 'is_write' is 1002by kvm. The 'data' member contains the written data if 'is_write' is
821true, and should be filled by application code otherwise. 1003true, and should be filled by application code otherwise.
822 1004
1005NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding
1006operations are complete (and guest state is consistent) only after userspace
1007has re-entered the kernel with KVM_RUN. The kernel side will first finish
1008incomplete operations and then check for pending signals. Userspace
1009can re-enter the guest with an unmasked signal pending to complete
1010pending operations.
1011
823 /* KVM_EXIT_HYPERCALL */ 1012 /* KVM_EXIT_HYPERCALL */
824 struct { 1013 struct {
825 __u64 nr; 1014 __u64 nr;
@@ -829,7 +1018,9 @@ true, and should be filled by application code otherwise.
829 __u32 pad; 1018 __u32 pad;
830 } hypercall; 1019 } hypercall;
831 1020
832Unused. 1021Unused. This was once used for 'hypercall to userspace'. To implement
1022such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390).
1023Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO.
833 1024
834 /* KVM_EXIT_TPR_ACCESS */ 1025 /* KVM_EXIT_TPR_ACCESS */
835 struct { 1026 struct {
@@ -870,6 +1061,19 @@ s390 specific.
870 1061
871powerpc specific. 1062powerpc specific.
872 1063
1064 /* KVM_EXIT_OSI */
1065 struct {
1066 __u64 gprs[32];
1067 } osi;
1068
1069MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch
1070hypercalls and exit with this exit struct that contains all the guest gprs.
1071
1072If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall.
1073Userspace can now handle the hypercall and when it's done modify the gprs as
1074necessary. Upon guest entry all guest GPRs will then be replaced by the values
1075in this struct.
1076
873 /* Fix the size of the union. */ 1077 /* Fix the size of the union. */
874 char padding[256]; 1078 char padding[256];
875 }; 1079 };
diff --git a/Documentation/kvm/cpuid.txt b/Documentation/kvm/cpuid.txt
new file mode 100644
index 000000000000..14a12ea92b7f
--- /dev/null
+++ b/Documentation/kvm/cpuid.txt
@@ -0,0 +1,42 @@
1KVM CPUID bits
2Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010
3=====================================================
4
5A guest running on a kvm host, can check some of its features using
6cpuid. This is not always guaranteed to work, since userspace can
7mask-out some, or even all KVM-related cpuid features before launching
8a guest.
9
10KVM cpuid functions are:
11
12function: KVM_CPUID_SIGNATURE (0x40000000)
13returns : eax = 0,
14 ebx = 0x4b4d564b,
15 ecx = 0x564b4d56,
16 edx = 0x4d.
17Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM".
18This function queries the presence of KVM cpuid leafs.
19
20
21function: define KVM_CPUID_FEATURES (0x40000001)
22returns : ebx, ecx, edx = 0
23 eax = and OR'ed group of (1 << flag), where each flags is:
24
25
26flag || value || meaning
27=============================================================================
28KVM_FEATURE_CLOCKSOURCE || 0 || kvmclock available at msrs
29 || || 0x11 and 0x12.
30------------------------------------------------------------------------------
31KVM_FEATURE_NOP_IO_DELAY || 1 || not necessary to perform delays
32 || || on PIO operations.
33------------------------------------------------------------------------------
34KVM_FEATURE_MMU_OP || 2 || deprecated.
35------------------------------------------------------------------------------
36KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs
37 || || 0x4b564d00 and 0x4b564d01
38------------------------------------------------------------------------------
39KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side
40 || || per-cpu warps are expected in
41 || || kvmclock.
42------------------------------------------------------------------------------
diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt
new file mode 100644
index 000000000000..aaed6ab9d7ab
--- /dev/null
+++ b/Documentation/kvm/mmu.txt
@@ -0,0 +1,304 @@
1The x86 kvm shadow mmu
2======================
3
4The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible
5for presenting a standard x86 mmu to the guest, while translating guest
6physical addresses to host physical addresses.
7
8The mmu code attempts to satisfy the following requirements:
9
10- correctness: the guest should not be able to determine that it is running
11 on an emulated mmu except for timing (we attempt to comply
12 with the specification, not emulate the characteristics of
13 a particular implementation such as tlb size)
14- security: the guest must not be able to touch host memory not assigned
15 to it
16- performance: minimize the performance penalty imposed by the mmu
17- scaling: need to scale to large memory and large vcpu guests
18- hardware: support the full range of x86 virtualization hardware
19- integration: Linux memory management code must be in control of guest memory
20 so that swapping, page migration, page merging, transparent
21 hugepages, and similar features work without change
22- dirty tracking: report writes to guest memory to enable live migration
23 and framebuffer-based displays
24- footprint: keep the amount of pinned kernel memory low (most memory
25 should be shrinkable)
26- reliablity: avoid multipage or GFP_ATOMIC allocations
27
28Acronyms
29========
30
31pfn host page frame number
32hpa host physical address
33hva host virtual address
34gfn guest frame number
35gpa guest physical address
36gva guest virtual address
37ngpa nested guest physical address
38ngva nested guest virtual address
39pte page table entry (used also to refer generically to paging structure
40 entries)
41gpte guest pte (referring to gfns)
42spte shadow pte (referring to pfns)
43tdp two dimensional paging (vendor neutral term for NPT and EPT)
44
45Virtual and real hardware supported
46===================================
47
48The mmu supports first-generation mmu hardware, which allows an atomic switch
49of the current paging mode and cr3 during guest entry, as well as
50two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware
51it exposes is the traditional 2/3/4 level x86 mmu, with support for global
52pages, pae, pse, pse36, cr0.wp, and 1GB pages. Work is in progress to support
53exposing NPT capable hardware on NPT capable hosts.
54
55Translation
56===========
57
58The primary job of the mmu is to program the processor's mmu to translate
59addresses for the guest. Different translations are required at different
60times:
61
62- when guest paging is disabled, we translate guest physical addresses to
63 host physical addresses (gpa->hpa)
64- when guest paging is enabled, we translate guest virtual addresses, to
65 guest physical addresses, to host physical addresses (gva->gpa->hpa)
66- when the guest launches a guest of its own, we translate nested guest
67 virtual addresses, to nested guest physical addresses, to guest physical
68 addresses, to host physical addresses (ngva->ngpa->gpa->hpa)
69
70The primary challenge is to encode between 1 and 3 translations into hardware
71that support only 1 (traditional) and 2 (tdp) translations. When the
72number of required translations matches the hardware, the mmu operates in
73direct mode; otherwise it operates in shadow mode (see below).
74
75Memory
76======
77
78Guest memory (gpa) is part of the user address space of the process that is
79using kvm. Userspace defines the translation between guest addresses and user
80addresses (gpa->hva); note that two gpas may alias to the same gva, but not
81vice versa.
82
83These gvas may be backed using any method available to the host: anonymous
84memory, file backed memory, and device memory. Memory might be paged by the
85host at any time.
86
87Events
88======
89
90The mmu is driven by events, some from the guest, some from the host.
91
92Guest generated events:
93- writes to control registers (especially cr3)
94- invlpg/invlpga instruction execution
95- access to missing or protected translations
96
97Host generated events:
98- changes in the gpa->hpa translation (either through gpa->hva changes or
99 through hva->hpa changes)
100- memory pressure (the shrinker)
101
102Shadow pages
103============
104
105The principal data structure is the shadow page, 'struct kvm_mmu_page'. A
106shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A
107shadow page may contain a mix of leaf and nonleaf sptes.
108
109A nonleaf spte allows the hardware mmu to reach the leaf pages and
110is not related to a translation directly. It points to other shadow pages.
111
112A leaf spte corresponds to either one or two translations encoded into
113one paging structure entry. These are always the lowest level of the
114translation stack, with optional higher level translations left to NPT/EPT.
115Leaf ptes point at guest pages.
116
117The following table shows translations encoded by leaf ptes, with higher-level
118translations in parentheses:
119
120 Non-nested guests:
121 nonpaging: gpa->hpa
122 paging: gva->gpa->hpa
123 paging, tdp: (gva->)gpa->hpa
124 Nested guests:
125 non-tdp: ngva->gpa->hpa (*)
126 tdp: (ngva->)ngpa->gpa->hpa
127
128(*) the guest hypervisor will encode the ngva->gpa translation into its page
129 tables if npt is not present
130
131Shadow pages contain the following information:
132 role.level:
133 The level in the shadow paging hierarchy that this shadow page belongs to.
134 1=4k sptes, 2=2M sptes, 3=1G sptes, etc.
135 role.direct:
136 If set, leaf sptes reachable from this page are for a linear range.
137 Examples include real mode translation, large guest pages backed by small
138 host pages, and gpa->hpa translations when NPT or EPT is active.
139 The linear range starts at (gfn << PAGE_SHIFT) and its size is determined
140 by role.level (2MB for first level, 1GB for second level, 0.5TB for third
141 level, 256TB for fourth level)
142 If clear, this page corresponds to a guest page table denoted by the gfn
143 field.
144 role.quadrant:
145 When role.cr4_pae=0, the guest uses 32-bit gptes while the host uses 64-bit
146 sptes. That means a guest page table contains more ptes than the host,
147 so multiple shadow pages are needed to shadow one guest page.
148 For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the
149 first or second 512-gpte block in the guest page table. For second-level
150 page tables, each 32-bit gpte is converted to two 64-bit sptes
151 (since each first-level guest page is shadowed by two first-level
152 shadow pages) so role.quadrant takes values in the range 0..3. Each
153 quadrant maps 1GB virtual address space.
154 role.access:
155 Inherited guest access permissions in the form uwx. Note execute
156 permission is positive, not negative.
157 role.invalid:
158 The page is invalid and should not be used. It is a root page that is
159 currently pinned (by a cpu hardware register pointing to it); once it is
160 unpinned it will be destroyed.
161 role.cr4_pae:
162 Contains the value of cr4.pae for which the page is valid (e.g. whether
163 32-bit or 64-bit gptes are in use).
164 role.cr4_nxe:
165 Contains the value of efer.nxe for which the page is valid.
166 role.cr0_wp:
167 Contains the value of cr0.wp for which the page is valid.
168 gfn:
169 Either the guest page table containing the translations shadowed by this
170 page, or the base page frame for linear translations. See role.direct.
171 spt:
172 A pageful of 64-bit sptes containing the translations for this page.
173 Accessed by both kvm and hardware.
174 The page pointed to by spt will have its page->private pointing back
175 at the shadow page structure.
176 sptes in spt point either at guest pages, or at lower-level shadow pages.
177 Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point
178 at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte.
179 The spt array forms a DAG structure with the shadow page as a node, and
180 guest pages as leaves.
181 gfns:
182 An array of 512 guest frame numbers, one for each present pte. Used to
183 perform a reverse map from a pte to a gfn.
184 slot_bitmap:
185 A bitmap containing one bit per memory slot. If the page contains a pte
186 mapping a page from memory slot n, then bit n of slot_bitmap will be set
187 (if a page is aliased among several slots, then it is not guaranteed that
188 all slots will be marked).
189 Used during dirty logging to avoid scanning a shadow page if none if its
190 pages need tracking.
191 root_count:
192 A counter keeping track of how many hardware registers (guest cr3 or
193 pdptrs) are now pointing at the page. While this counter is nonzero, the
194 page cannot be destroyed. See role.invalid.
195 multimapped:
196 Whether there exist multiple sptes pointing at this page.
197 parent_pte/parent_ptes:
198 If multimapped is zero, parent_pte points at the single spte that points at
199 this page's spt. Otherwise, parent_ptes points at a data structure
200 with a list of parent_ptes.
201 unsync:
202 If true, then the translations in this page may not match the guest's
203 translation. This is equivalent to the state of the tlb when a pte is
204 changed but before the tlb entry is flushed. Accordingly, unsync ptes
205 are synchronized when the guest executes invlpg or flushes its tlb by
206 other means. Valid for leaf pages.
207 unsync_children:
208 How many sptes in the page point at pages that are unsync (or have
209 unsynchronized children).
210 unsync_child_bitmap:
211 A bitmap indicating which sptes in spt point (directly or indirectly) at
212 pages that may be unsynchronized. Used to quickly locate all unsychronized
213 pages reachable from a given page.
214
215Reverse map
216===========
217
218The mmu maintains a reverse mapping whereby all ptes mapping a page can be
219reached given its gfn. This is used, for example, when swapping out a page.
220
221Synchronized and unsynchronized pages
222=====================================
223
224The guest uses two events to synchronize its tlb and page tables: tlb flushes
225and page invalidations (invlpg).
226
227A tlb flush means that we need to synchronize all sptes reachable from the
228guest's cr3. This is expensive, so we keep all guest page tables write
229protected, and synchronize sptes to gptes when a gpte is written.
230
231A special case is when a guest page table is reachable from the current
232guest cr3. In this case, the guest is obliged to issue an invlpg instruction
233before using the translation. We take advantage of that by removing write
234protection from the guest page, and allowing the guest to modify it freely.
235We synchronize modified gptes when the guest invokes invlpg. This reduces
236the amount of emulation we have to do when the guest modifies multiple gptes,
237or when the a guest page is no longer used as a page table and is used for
238random guest data.
239
240As a side effect we have to resynchronize all reachable unsynchronized shadow
241pages on a tlb flush.
242
243
244Reaction to events
245==================
246
247- guest page fault (or npt page fault, or ept violation)
248
249This is the most complicated event. The cause of a page fault can be:
250
251 - a true guest fault (the guest translation won't allow the access) (*)
252 - access to a missing translation
253 - access to a protected translation
254 - when logging dirty pages, memory is write protected
255 - synchronized shadow pages are write protected (*)
256 - access to untranslatable memory (mmio)
257
258 (*) not applicable in direct mode
259
260Handling a page fault is performed as follows:
261
262 - if needed, walk the guest page tables to determine the guest translation
263 (gva->gpa or ngpa->gpa)
264 - if permissions are insufficient, reflect the fault back to the guest
265 - determine the host page
266 - if this is an mmio request, there is no host page; call the emulator
267 to emulate the instruction instead
268 - walk the shadow page table to find the spte for the translation,
269 instantiating missing intermediate page tables as necessary
270 - try to unsynchronize the page
271 - if successful, we can let the guest continue and modify the gpte
272 - emulate the instruction
273 - if failed, unshadow the page and let the guest continue
274 - update any translations that were modified by the instruction
275
276invlpg handling:
277
278 - walk the shadow page hierarchy and drop affected translations
279 - try to reinstantiate the indicated translation in the hope that the
280 guest will use it in the near future
281
282Guest control register updates:
283
284- mov to cr3
285 - look up new shadow roots
286 - synchronize newly reachable shadow pages
287
288- mov to cr0/cr4/efer
289 - set up mmu context for new paging mode
290 - look up new shadow roots
291 - synchronize newly reachable shadow pages
292
293Host translation updates:
294
295 - mmu notifier called with updated hva
296 - look up affected sptes through reverse map
297 - drop (or update) translations
298
299Further reading
300===============
301
302- NPT presentation from KVM Forum 2008
303 http://www.linux-kvm.org/wiki/images/c/c8/KvmForum2008%24kdf2008_21.pdf
304
diff --git a/Documentation/laptops/00-INDEX b/Documentation/laptops/00-INDEX
index ee5692b26dd4..fa688538e757 100644
--- a/Documentation/laptops/00-INDEX
+++ b/Documentation/laptops/00-INDEX
@@ -2,6 +2,12 @@
2 - This file 2 - This file
3acer-wmi.txt 3acer-wmi.txt
4 - information on the Acer Laptop WMI Extras driver. 4 - information on the Acer Laptop WMI Extras driver.
5asus-laptop.txt
6 - information on the Asus Laptop Extras driver.
7disk-shock-protection.txt
8 - information on hard disk shock protection.
9dslm.c
10 - Simple Disk Sleep Monitor program
5laptop-mode.txt 11laptop-mode.txt
6 - how to conserve battery power using laptop-mode. 12 - how to conserve battery power using laptop-mode.
7sony-laptop.txt 13sony-laptop.txt
diff --git a/Documentation/laptops/Makefile b/Documentation/laptops/Makefile
new file mode 100644
index 000000000000..5cb144af3c09
--- /dev/null
+++ b/Documentation/laptops/Makefile
@@ -0,0 +1,8 @@
1# kbuild trick to avoid linker error. Can be omitted if a module is built.
2obj- := dummy.o
3
4# List of programs to build
5hostprogs-y := dslm
6
7# Tell kbuild to always build the programs
8always := $(hostprogs-y)
diff --git a/Documentation/laptops/dslm.c b/Documentation/laptops/dslm.c
new file mode 100644
index 000000000000..72ff290c5fc6
--- /dev/null
+++ b/Documentation/laptops/dslm.c
@@ -0,0 +1,166 @@
1/*
2 * dslm.c
3 * Simple Disk Sleep Monitor
4 * by Bartek Kania
5 * Licenced under the GPL
6 */
7#include <unistd.h>
8#include <stdlib.h>
9#include <stdio.h>
10#include <fcntl.h>
11#include <errno.h>
12#include <time.h>
13#include <string.h>
14#include <signal.h>
15#include <sys/ioctl.h>
16#include <linux/hdreg.h>
17
18#ifdef DEBUG
19#define D(x) x
20#else
21#define D(x)
22#endif
23
24int endit = 0;
25
26/* Check if the disk is in powersave-mode
27 * Most of the code is stolen from hdparm.
28 * 1 = active, 0 = standby/sleep, -1 = unknown */
29static int check_powermode(int fd)
30{
31 unsigned char args[4] = {WIN_CHECKPOWERMODE1,0,0,0};
32 int state;
33
34 if (ioctl(fd, HDIO_DRIVE_CMD, &args)
35 && (args[0] = WIN_CHECKPOWERMODE2) /* try again with 0x98 */
36 && ioctl(fd, HDIO_DRIVE_CMD, &args)) {
37 if (errno != EIO || args[0] != 0 || args[1] != 0) {
38 state = -1; /* "unknown"; */
39 } else
40 state = 0; /* "sleeping"; */
41 } else {
42 state = (args[2] == 255) ? 1 : 0;
43 }
44 D(printf(" drive state is: %d\n", state));
45
46 return state;
47}
48
49static char *state_name(int i)
50{
51 if (i == -1) return "unknown";
52 if (i == 0) return "sleeping";
53 if (i == 1) return "active";
54
55 return "internal error";
56}
57
58static char *myctime(time_t time)
59{
60 char *ts = ctime(&time);
61 ts[strlen(ts) - 1] = 0;
62
63 return ts;
64}
65
66static void measure(int fd)
67{
68 time_t start_time;
69 int last_state;
70 time_t last_time;
71 int curr_state;
72 time_t curr_time = 0;
73 time_t time_diff;
74 time_t active_time = 0;
75 time_t sleep_time = 0;
76 time_t unknown_time = 0;
77 time_t total_time = 0;
78 int changes = 0;
79 float tmp;
80
81 printf("Starting measurements\n");
82
83 last_state = check_powermode(fd);
84 start_time = last_time = time(0);
85 printf(" System is in state %s\n\n", state_name(last_state));
86
87 while(!endit) {
88 sleep(1);
89 curr_state = check_powermode(fd);
90
91 if (curr_state != last_state || endit) {
92 changes++;
93 curr_time = time(0);
94 time_diff = curr_time - last_time;
95
96 if (last_state == 1) active_time += time_diff;
97 else if (last_state == 0) sleep_time += time_diff;
98 else unknown_time += time_diff;
99
100 last_state = curr_state;
101 last_time = curr_time;
102
103 printf("%s: State-change to %s\n", myctime(curr_time),
104 state_name(curr_state));
105 }
106 }
107 changes--; /* Compensate for SIGINT */
108
109 total_time = time(0) - start_time;
110 printf("\nTotal running time: %lus\n", curr_time - start_time);
111 printf(" State changed %d times\n", changes);
112
113 tmp = (float)sleep_time / (float)total_time * 100;
114 printf(" Time in sleep state: %lus (%.2f%%)\n", sleep_time, tmp);
115 tmp = (float)active_time / (float)total_time * 100;
116 printf(" Time in active state: %lus (%.2f%%)\n", active_time, tmp);
117 tmp = (float)unknown_time / (float)total_time * 100;
118 printf(" Time in unknown state: %lus (%.2f%%)\n", unknown_time, tmp);
119}
120
121static void ender(int s)
122{
123 endit = 1;
124}
125
126static void usage(void)
127{
128 puts("usage: dslm [-w <time>] <disk>");
129 exit(0);
130}
131
132int main(int argc, char **argv)
133{
134 int fd;
135 char *disk = 0;
136 int settle_time = 60;
137
138 /* Parse the simple command-line */
139 if (argc == 2)
140 disk = argv[1];
141 else if (argc == 4) {
142 settle_time = atoi(argv[2]);
143 disk = argv[3];
144 } else
145 usage();
146
147 if (!(fd = open(disk, O_RDONLY|O_NONBLOCK))) {
148 printf("Can't open %s, because: %s\n", disk, strerror(errno));
149 exit(-1);
150 }
151
152 if (settle_time) {
153 printf("Waiting %d seconds for the system to settle down to "
154 "'normal'\n", settle_time);
155 sleep(settle_time);
156 } else
157 puts("Not waiting for system to settle down");
158
159 signal(SIGINT, ender);
160
161 measure(fd);
162
163 close(fd);
164
165 return 0;
166}
diff --git a/Documentation/laptops/laptop-mode.txt b/Documentation/laptops/laptop-mode.txt
index eeedee11c8c2..0bf25eebce94 100644
--- a/Documentation/laptops/laptop-mode.txt
+++ b/Documentation/laptops/laptop-mode.txt
@@ -207,7 +207,7 @@ Tips & Tricks
207* Drew Scott Daniels observed: "I don't know why, but when I decrease the number 207* Drew Scott Daniels observed: "I don't know why, but when I decrease the number
208 of colours that my display uses it consumes less battery power. I've seen 208 of colours that my display uses it consumes less battery power. I've seen
209 this on powerbooks too. I hope that this is a piece of information that 209 this on powerbooks too. I hope that this is a piece of information that
210 might be useful to the Laptop Mode patch or it's users." 210 might be useful to the Laptop Mode patch or its users."
211 211
212* In syslog.conf, you can prefix entries with a dash ``-'' to omit syncing the 212* In syslog.conf, you can prefix entries with a dash ``-'' to omit syncing the
213 file after every logging. When you're using laptop-mode and your disk doesn't 213 file after every logging. When you're using laptop-mode and your disk doesn't
@@ -779,172 +779,4 @@ Monitoring tool
779--------------- 779---------------
780 780
781Bartek Kania submitted this, it can be used to measure how much time your disk 781Bartek Kania submitted this, it can be used to measure how much time your disk
782spends spun up/down. 782spends spun up/down. See Documentation/laptops/dslm.c
783
784---------------------------dslm.c BEGIN-----------------------------------------
785/*
786 * Simple Disk Sleep Monitor
787 * by Bartek Kania
788 * Licenced under the GPL
789 */
790#include <unistd.h>
791#include <stdlib.h>
792#include <stdio.h>
793#include <fcntl.h>
794#include <errno.h>
795#include <time.h>
796#include <string.h>
797#include <signal.h>
798#include <sys/ioctl.h>
799#include <linux/hdreg.h>
800
801#ifdef DEBUG
802#define D(x) x
803#else
804#define D(x)
805#endif
806
807int endit = 0;
808
809/* Check if the disk is in powersave-mode
810 * Most of the code is stolen from hdparm.
811 * 1 = active, 0 = standby/sleep, -1 = unknown */
812int check_powermode(int fd)
813{
814 unsigned char args[4] = {WIN_CHECKPOWERMODE1,0,0,0};
815 int state;
816
817 if (ioctl(fd, HDIO_DRIVE_CMD, &args)
818 && (args[0] = WIN_CHECKPOWERMODE2) /* try again with 0x98 */
819 && ioctl(fd, HDIO_DRIVE_CMD, &args)) {
820 if (errno != EIO || args[0] != 0 || args[1] != 0) {
821 state = -1; /* "unknown"; */
822 } else
823 state = 0; /* "sleeping"; */
824 } else {
825 state = (args[2] == 255) ? 1 : 0;
826 }
827 D(printf(" drive state is: %d\n", state));
828
829 return state;
830}
831
832char *state_name(int i)
833{
834 if (i == -1) return "unknown";
835 if (i == 0) return "sleeping";
836 if (i == 1) return "active";
837
838 return "internal error";
839}
840
841char *myctime(time_t time)
842{
843 char *ts = ctime(&time);
844 ts[strlen(ts) - 1] = 0;
845
846 return ts;
847}
848
849void measure(int fd)
850{
851 time_t start_time;
852 int last_state;
853 time_t last_time;
854 int curr_state;
855 time_t curr_time = 0;
856 time_t time_diff;
857 time_t active_time = 0;
858 time_t sleep_time = 0;
859 time_t unknown_time = 0;
860 time_t total_time = 0;
861 int changes = 0;
862 float tmp;
863
864 printf("Starting measurements\n");
865
866 last_state = check_powermode(fd);
867 start_time = last_time = time(0);
868 printf(" System is in state %s\n\n", state_name(last_state));
869
870 while(!endit) {
871 sleep(1);
872 curr_state = check_powermode(fd);
873
874 if (curr_state != last_state || endit) {
875 changes++;
876 curr_time = time(0);
877 time_diff = curr_time - last_time;
878
879 if (last_state == 1) active_time += time_diff;
880 else if (last_state == 0) sleep_time += time_diff;
881 else unknown_time += time_diff;
882
883 last_state = curr_state;
884 last_time = curr_time;
885
886 printf("%s: State-change to %s\n", myctime(curr_time),
887 state_name(curr_state));
888 }
889 }
890 changes--; /* Compensate for SIGINT */
891
892 total_time = time(0) - start_time;
893 printf("\nTotal running time: %lus\n", curr_time - start_time);
894 printf(" State changed %d times\n", changes);
895
896 tmp = (float)sleep_time / (float)total_time * 100;
897 printf(" Time in sleep state: %lus (%.2f%%)\n", sleep_time, tmp);
898 tmp = (float)active_time / (float)total_time * 100;
899 printf(" Time in active state: %lus (%.2f%%)\n", active_time, tmp);
900 tmp = (float)unknown_time / (float)total_time * 100;
901 printf(" Time in unknown state: %lus (%.2f%%)\n", unknown_time, tmp);
902}
903
904void ender(int s)
905{
906 endit = 1;
907}
908
909void usage()
910{
911 puts("usage: dslm [-w <time>] <disk>");
912 exit(0);
913}
914
915int main(int argc, char **argv)
916{
917 int fd;
918 char *disk = 0;
919 int settle_time = 60;
920
921 /* Parse the simple command-line */
922 if (argc == 2)
923 disk = argv[1];
924 else if (argc == 4) {
925 settle_time = atoi(argv[2]);
926 disk = argv[3];
927 } else
928 usage();
929
930 if (!(fd = open(disk, O_RDONLY|O_NONBLOCK))) {
931 printf("Can't open %s, because: %s\n", disk, strerror(errno));
932 exit(-1);
933 }
934
935 if (settle_time) {
936 printf("Waiting %d seconds for the system to settle down to "
937 "'normal'\n", settle_time);
938 sleep(settle_time);
939 } else
940 puts("Not waiting for system to settle down");
941
942 signal(SIGINT, ender);
943
944 measure(fd);
945
946 close(fd);
947
948 return 0;
949}
950---------------------------dslm.c END-------------------------------------------
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt
index 75afa1229fd7..fc15538d8b46 100644
--- a/Documentation/laptops/thinkpad-acpi.txt
+++ b/Documentation/laptops/thinkpad-acpi.txt
@@ -292,13 +292,13 @@ sysfs notes:
292 292
293 Warning: when in NVRAM mode, the volume up/down/mute 293 Warning: when in NVRAM mode, the volume up/down/mute
294 keys are synthesized according to changes in the mixer, 294 keys are synthesized according to changes in the mixer,
295 so you have to use volume up or volume down to unmute, 295 which uses a single volume up or volume down hotkey
296 as per the ThinkPad volume mixer user interface. When 296 press to unmute, as per the ThinkPad volume mixer user
297 in ACPI event mode, volume up/down/mute are reported as 297 interface. When in ACPI event mode, volume up/down/mute
298 separate events, but this behaviour may be corrected in 298 events are reported by the firmware and can behave
299 future releases of this driver, in which case the 299 differently (and that behaviour changes with firmware
300 ThinkPad volume mixer user interface semantics will be 300 version -- not just with firmware models -- as well as
301 enforced. 301 OSI(Linux) state).
302 302
303 hotkey_poll_freq: 303 hotkey_poll_freq:
304 frequency in Hz for hot key polling. It must be between 304 frequency in Hz for hot key polling. It must be between
@@ -309,7 +309,7 @@ sysfs notes:
309 will cause hot key presses that require NVRAM polling 309 will cause hot key presses that require NVRAM polling
310 to never be reported. 310 to never be reported.
311 311
312 Setting hotkey_poll_freq too low will cause repeated 312 Setting hotkey_poll_freq too low may cause repeated
313 pressings of the same hot key to be misreported as a 313 pressings of the same hot key to be misreported as a
314 single key press, or to not even be detected at all. 314 single key press, or to not even be detected at all.
315 The recommended polling frequency is 10Hz. 315 The recommended polling frequency is 10Hz.
@@ -397,6 +397,7 @@ ACPI Scan
397event code Key Notes 397event code Key Notes
398 398
3990x1001 0x00 FN+F1 - 3990x1001 0x00 FN+F1 -
400
4000x1002 0x01 FN+F2 IBM: battery (rare) 4010x1002 0x01 FN+F2 IBM: battery (rare)
401 Lenovo: Screen lock 402 Lenovo: Screen lock
402 403
@@ -404,7 +405,8 @@ event code Key Notes
404 this hot key, even with hot keys 405 this hot key, even with hot keys
405 disabled or with Fn+F3 masked 406 disabled or with Fn+F3 masked
406 off 407 off
407 IBM: screen lock 408 IBM: screen lock, often turns
409 off the ThinkLight as side-effect
408 Lenovo: battery 410 Lenovo: battery
409 411
4100x1004 0x03 FN+F4 Sleep button (ACPI sleep button 4120x1004 0x03 FN+F4 Sleep button (ACPI sleep button
@@ -433,7 +435,8 @@ event code Key Notes
433 Do you feel lucky today? 435 Do you feel lucky today?
434 436
4350x1008 0x07 FN+F8 IBM: toggle screen expand 4370x1008 0x07 FN+F8 IBM: toggle screen expand
436 Lenovo: configure UltraNav 438 Lenovo: configure UltraNav,
439 or toggle screen expand
437 440
4380x1009 0x08 FN+F9 - 4410x1009 0x08 FN+F9 -
439 .. .. .. 442 .. .. ..
@@ -444,7 +447,7 @@ event code Key Notes
444 either through the ACPI event, 447 either through the ACPI event,
445 or through a hotkey event. 448 or through a hotkey event.
446 The firmware may refuse to 449 The firmware may refuse to
447 generate further FN+F4 key 450 generate further FN+F12 key
448 press events until a S3 or S4 451 press events until a S3 or S4
449 ACPI sleep cycle is performed, 452 ACPI sleep cycle is performed,
450 or some time passes. 453 or some time passes.
@@ -512,15 +515,19 @@ events for switches:
512SW_RFKILL_ALL T60 and later hardware rfkill rocker switch 515SW_RFKILL_ALL T60 and later hardware rfkill rocker switch
513SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A 516SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A
514 517
515Non hot-key ACPI HKEY event map: 518Non hotkey ACPI HKEY event map:
519-------------------------------
520
521Events that are not propagated by the driver, except for legacy
522compatibility purposes when hotkey_report_mode is set to 1:
523
5160x5001 Lid closed 5240x5001 Lid closed
5170x5002 Lid opened 5250x5002 Lid opened
5180x5009 Tablet swivel: switched to tablet mode 5260x5009 Tablet swivel: switched to tablet mode
5190x500A Tablet swivel: switched to normal mode 5270x500A Tablet swivel: switched to normal mode
5200x7000 Radio Switch may have changed state 5280x7000 Radio Switch may have changed state
521 529
522The above events are not propagated by the driver, except for legacy 530Events that are never propagated by the driver:
523compatibility purposes when hotkey_report_mode is set to 1.
524 531
5250x2304 System is waking up from suspend to undock 5320x2304 System is waking up from suspend to undock
5260x2305 System is waking up from suspend to eject bay 5330x2305 System is waking up from suspend to eject bay
@@ -528,14 +535,39 @@ compatibility purposes when hotkey_report_mode is set to 1.
5280x2405 System is waking up from hibernation to eject bay 5350x2405 System is waking up from hibernation to eject bay
5290x5010 Brightness level changed/control event 5360x5010 Brightness level changed/control event
530 537
531The above events are never propagated by the driver. 538Events that are propagated by the driver to userspace:
532 539
5400x2313 ALARM: System is waking up from suspend because
541 the battery is nearly empty
5420x2413 ALARM: System is waking up from hibernation because
543 the battery is nearly empty
5330x3003 Bay ejection (see 0x2x05) complete, can sleep again 5440x3003 Bay ejection (see 0x2x05) complete, can sleep again
5450x3006 Bay hotplug request (hint to power up SATA link when
546 the optical drive tray is ejected)
5340x4003 Undocked (see 0x2x04), can sleep again 5470x4003 Undocked (see 0x2x04), can sleep again
5350x500B Tablet pen inserted into its storage bay 5480x500B Tablet pen inserted into its storage bay
5360x500C Tablet pen removed from its storage bay 5490x500C Tablet pen removed from its storage bay
537 5500x6011 ALARM: battery is too hot
538The above events are propagated by the driver. 5510x6012 ALARM: battery is extremely hot
5520x6021 ALARM: a sensor is too hot
5530x6022 ALARM: a sensor is extremely hot
5540x6030 System thermal table changed
555
556Battery nearly empty alarms are a last resort attempt to get the
557operating system to hibernate or shutdown cleanly (0x2313), or shutdown
558cleanly (0x2413) before power is lost. They must be acted upon, as the
559wake up caused by the firmware will have negated most safety nets...
560
561When any of the "too hot" alarms happen, according to Lenovo the user
562should suspend or hibernate the laptop (and in the case of battery
563alarms, unplug the AC adapter) to let it cool down. These alarms do
564signal that something is wrong, they should never happen on normal
565operating conditions.
566
567The "extremely hot" alarms are emergencies. According to Lenovo, the
568operating system is to force either an immediate suspend or hibernate
569cycle, or a system shutdown. Obviously, something is very wrong if this
570happens.
539 571
540Compatibility notes: 572Compatibility notes:
541 573
@@ -650,6 +682,10 @@ LCD, CRT or DVI (if available). The following commands are available:
650 echo expand_toggle > /proc/acpi/ibm/video 682 echo expand_toggle > /proc/acpi/ibm/video
651 echo video_switch > /proc/acpi/ibm/video 683 echo video_switch > /proc/acpi/ibm/video
652 684
685NOTE: Access to this feature is restricted to processes owning the
686CAP_SYS_ADMIN capability for safety reasons, as it can interact badly
687enough with some versions of X.org to crash it.
688
653Each video output device can be enabled or disabled individually. 689Each video output device can be enabled or disabled individually.
654Reading /proc/acpi/ibm/video shows the status of each device. 690Reading /proc/acpi/ibm/video shows the status of each device.
655 691
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 42208511b5c0..e9ce3c554514 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -34,7 +34,6 @@
34#include <sys/uio.h> 34#include <sys/uio.h>
35#include <termios.h> 35#include <termios.h>
36#include <getopt.h> 36#include <getopt.h>
37#include <zlib.h>
38#include <assert.h> 37#include <assert.h>
39#include <sched.h> 38#include <sched.h>
40#include <limits.h> 39#include <limits.h>
@@ -264,7 +263,7 @@ static u8 *get_feature_bits(struct device *dev)
264 * Launcher virtual with an offset. 263 * Launcher virtual with an offset.
265 * 264 *
266 * This can be tough to get your head around, but usually it just means that we 265 * This can be tough to get your head around, but usually it just means that we
267 * use these trivial conversion functions when the Guest gives us it's 266 * use these trivial conversion functions when the Guest gives us its
268 * "physical" addresses: 267 * "physical" addresses:
269 */ 268 */
270static void *from_guest_phys(unsigned long addr) 269static void *from_guest_phys(unsigned long addr)
diff --git a/Documentation/md.txt b/Documentation/md.txt
index 188f4768f1d5..e4e893ef3e01 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -136,7 +136,7 @@ raid_disks != 0.
136 136
137Then uninitialized devices can be added with ADD_NEW_DISK. The 137Then uninitialized devices can be added with ADD_NEW_DISK. The
138structure passed to ADD_NEW_DISK must specify the state of the device 138structure passed to ADD_NEW_DISK must specify the state of the device
139and it's role in the array. 139and its role in the array.
140 140
141Once started with RUN_ARRAY, uninitialized spares can be added with 141Once started with RUN_ARRAY, uninitialized spares can be added with
142HOT_ADD_DISK. 142HOT_ADD_DISK.
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 7f5809eddee6..631ad2f1b229 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -3,6 +3,7 @@
3 ============================ 3 ============================
4 4
5By: David Howells <dhowells@redhat.com> 5By: David Howells <dhowells@redhat.com>
6 Paul E. McKenney <paulmck@linux.vnet.ibm.com>
6 7
7Contents: 8Contents:
8 9
@@ -60,6 +61,10 @@ Contents:
60 61
61 - And then there's the Alpha. 62 - And then there's the Alpha.
62 63
64 (*) Example uses.
65
66 - Circular buffers.
67
63 (*) References. 68 (*) References.
64 69
65 70
@@ -2226,6 +2231,21 @@ The Alpha defines the Linux kernel's memory barrier model.
2226See the subsection on "Cache Coherency" above. 2231See the subsection on "Cache Coherency" above.
2227 2232
2228 2233
2234============
2235EXAMPLE USES
2236============
2237
2238CIRCULAR BUFFERS
2239----------------
2240
2241Memory barriers can be used to implement circular buffering without the need
2242of a lock to serialise the producer with the consumer. See:
2243
2244 Documentation/circular-buffers.txt
2245
2246for details.
2247
2248
2229========== 2249==========
2230REFERENCES 2250REFERENCES
2231========== 2251==========
diff --git a/Documentation/netlabel/lsm_interface.txt b/Documentation/netlabel/lsm_interface.txt
index 98dd9f7430f2..638c74f7de7f 100644
--- a/Documentation/netlabel/lsm_interface.txt
+++ b/Documentation/netlabel/lsm_interface.txt
@@ -38,7 +38,7 @@ Depending on the exact configuration, translation between the network packet
38label and the internal LSM security identifier can be time consuming. The 38label and the internal LSM security identifier can be time consuming. The
39NetLabel label mapping cache is a caching mechanism which can be used to 39NetLabel label mapping cache is a caching mechanism which can be used to
40sidestep much of this overhead once a mapping has been established. Once the 40sidestep much of this overhead once a mapping has been established. Once the
41LSM has received a packet, used NetLabel to decode it's security attributes, 41LSM has received a packet, used NetLabel to decode its security attributes,
42and translated the security attributes into a LSM internal identifier the LSM 42and translated the security attributes into a LSM internal identifier the LSM
43can use the NetLabel caching functions to associate the LSM internal 43can use the NetLabel caching functions to associate the LSM internal
44identifier with the network packet's label. This means that in the future 44identifier with the network packet's label. This means that in the future
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
index 50189bf07d53..fe5c099b8fc8 100644
--- a/Documentation/networking/00-INDEX
+++ b/Documentation/networking/00-INDEX
@@ -32,6 +32,8 @@ cs89x0.txt
32 - the Crystal LAN (CS8900/20-based) Ethernet ISA adapter driver 32 - the Crystal LAN (CS8900/20-based) Ethernet ISA adapter driver
33cxacru.txt 33cxacru.txt
34 - Conexant AccessRunner USB ADSL Modem 34 - Conexant AccessRunner USB ADSL Modem
35cxacru-cf.py
36 - Conexant AccessRunner USB ADSL Modem configuration file parser
35de4x5.txt 37de4x5.txt
36 - the Digital EtherWORKS DE4?? and DE5?? PCI Ethernet driver 38 - the Digital EtherWORKS DE4?? and DE5?? PCI Ethernet driver
37decnet.txt 39decnet.txt
diff --git a/Documentation/networking/Makefile b/Documentation/networking/Makefile
index 6d8af1ac56c4..5aba7a33aeeb 100644
--- a/Documentation/networking/Makefile
+++ b/Documentation/networking/Makefile
@@ -6,3 +6,5 @@ hostprogs-y := ifenslave
6 6
7# Tell kbuild to always build the programs 7# Tell kbuild to always build the programs
8always := $(hostprogs-y) 8always := $(hostprogs-y)
9
10obj-m := timestamping/
diff --git a/Documentation/networking/caif/Linux-CAIF.txt b/Documentation/networking/caif/Linux-CAIF.txt
new file mode 100644
index 000000000000..7fe7a9a33a4f
--- /dev/null
+++ b/Documentation/networking/caif/Linux-CAIF.txt
@@ -0,0 +1,212 @@
1Linux CAIF
2===========
3copyright (C) ST-Ericsson AB 2010
4Author: Sjur Brendeland/ sjur.brandeland@stericsson.com
5License terms: GNU General Public License (GPL) version 2
6
7
8Introduction
9------------
10CAIF is a MUX protocol used by ST-Ericsson cellular modems for
11communication between Modem and host. The host processes can open virtual AT
12channels, initiate GPRS Data connections, Video channels and Utility Channels.
13The Utility Channels are general purpose pipes between modem and host.
14
15ST-Ericsson modems support a number of transports between modem
16and host. Currently, UART and Loopback are available for Linux.
17
18
19Architecture:
20------------
21The implementation of CAIF is divided into:
22* CAIF Socket Layer, Kernel API, and Net Device.
23* CAIF Core Protocol Implementation
24* CAIF Link Layer, implemented as NET devices.
25
26
27 RTNL
28 !
29 ! +------+ +------+ +------+
30 ! +------+! +------+! +------+!
31 ! ! Sock !! !Kernel!! ! Net !!
32 ! ! API !+ ! API !+ ! Dev !+ <- CAIF Client APIs
33 ! +------+ +------! +------+
34 ! ! ! !
35 ! +----------!----------+
36 ! +------+ <- CAIF Protocol Implementation
37 +-------> ! CAIF !
38 ! Core !
39 +------+
40 +--------!--------+
41 ! !
42 +------+ +-----+
43 ! ! ! TTY ! <- Link Layer (Net Devices)
44 +------+ +-----+
45
46
47Using the Kernel API
48----------------------
49The Kernel API is used for accessing CAIF channels from the
50kernel.
51The user of the API has to implement two callbacks for receive
52and control.
53The receive callback gives a CAIF packet as a SKB. The control
54callback will
55notify of channel initialization complete, and flow-on/flow-
56off.
57
58
59 struct caif_device caif_dev = {
60 .caif_config = {
61 .name = "MYDEV"
62 .type = CAIF_CHTY_AT
63 }
64 .receive_cb = my_receive,
65 .control_cb = my_control,
66 };
67 caif_add_device(&caif_dev);
68 caif_transmit(&caif_dev, skb);
69
70See the caif_kernel.h for details about the CAIF kernel API.
71
72
73I M P L E M E N T A T I O N
74===========================
75===========================
76
77CAIF Core Protocol Layer
78=========================================
79
80CAIF Core layer implements the CAIF protocol as defined by ST-Ericsson.
81It implements the CAIF protocol stack in a layered approach, where
82each layer described in the specification is implemented as a separate layer.
83The architecture is inspired by the design patterns "Protocol Layer" and
84"Protocol Packet".
85
86== CAIF structure ==
87The Core CAIF implementation contains:
88 - Simple implementation of CAIF.
89 - Layered architecture (a la Streams), each layer in the CAIF
90 specification is implemented in a separate c-file.
91 - Clients must implement PHY layer to access physical HW
92 with receive and transmit functions.
93 - Clients must call configuration function to add PHY layer.
94 - Clients must implement CAIF layer to consume/produce
95 CAIF payload with receive and transmit functions.
96 - Clients must call configuration function to add and connect the
97 Client layer.
98 - When receiving / transmitting CAIF Packets (cfpkt), ownership is passed
99 to the called function (except for framing layers' receive functions
100 or if a transmit function returns an error, in which case the caller
101 must free the packet).
102
103Layered Architecture
104--------------------
105The CAIF protocol can be divided into two parts: Support functions and Protocol
106Implementation. The support functions include:
107
108 - CFPKT CAIF Packet. Implementation of CAIF Protocol Packet. The
109 CAIF Packet has functions for creating, destroying and adding content
110 and for adding/extracting header and trailers to protocol packets.
111
112 - CFLST CAIF list implementation.
113
114 - CFGLUE CAIF Glue. Contains OS Specifics, such as memory
115 allocation, endianness, etc.
116
117The CAIF Protocol implementation contains:
118
119 - CFCNFG CAIF Configuration layer. Configures the CAIF Protocol
120 Stack and provides a Client interface for adding Link-Layer and
121 Driver interfaces on top of the CAIF Stack.
122
123 - CFCTRL CAIF Control layer. Encodes and Decodes control messages
124 such as enumeration and channel setup. Also matches request and
125 response messages.
126
127 - CFSERVL General CAIF Service Layer functionality; handles flow
128 control and remote shutdown requests.
129
130 - CFVEI CAIF VEI layer. Handles CAIF AT Channels on VEI (Virtual
131 External Interface). This layer encodes/decodes VEI frames.
132
133 - CFDGML CAIF Datagram layer. Handles CAIF Datagram layer (IP
134 traffic), encodes/decodes Datagram frames.
135
136 - CFMUX CAIF Mux layer. Handles multiplexing between multiple
137 physical bearers and multiple channels such as VEI, Datagram, etc.
138 The MUX keeps track of the existing CAIF Channels and
139 Physical Instances and selects the apropriate instance based
140 on Channel-Id and Physical-ID.
141
142 - CFFRML CAIF Framing layer. Handles Framing i.e. Frame length
143 and frame checksum.
144
145 - CFSERL CAIF Serial layer. Handles concatenation/split of frames
146 into CAIF Frames with correct length.
147
148
149
150 +---------+
151 | Config |
152 | CFCNFG |
153 +---------+
154 !
155 +---------+ +---------+ +---------+
156 | AT | | Control | | Datagram|
157 | CFVEIL | | CFCTRL | | CFDGML |
158 +---------+ +---------+ +---------+
159 \_____________!______________/
160 !
161 +---------+
162 | MUX |
163 | |
164 +---------+
165 _____!_____
166 / \
167 +---------+ +---------+
168 | CFFRML | | CFFRML |
169 | Framing | | Framing |
170 +---------+ +---------+
171 ! !
172 +---------+ +---------+
173 | | | Serial |
174 | | | CFSERL |
175 +---------+ +---------+
176
177
178In this layered approach the following "rules" apply.
179 - All layers embed the same structure "struct cflayer"
180 - A layer does not depend on any other layer's private data.
181 - Layers are stacked by setting the pointers
182 layer->up , layer->dn
183 - In order to send data upwards, each layer should do
184 layer->up->receive(layer->up, packet);
185 - In order to send data downwards, each layer should do
186 layer->dn->transmit(layer->dn, packet);
187
188
189Linux Driver Implementation
190===========================
191
192Linux GPRS Net Device and CAIF socket are implemented on top of the
193CAIF Core protocol. The Net device and CAIF socket have an instance of
194'struct cflayer', just like the CAIF Core protocol stack.
195Net device and Socket implement the 'receive()' function defined by
196'struct cflayer', just like the rest of the CAIF stack. In this way, transmit and
197receive of packets is handled as by the rest of the layers: the 'dn->transmit()'
198function is called in order to transmit data.
199
200The layer on top of the CAIF Core implementation is
201sometimes referred to as the "Client layer".
202
203
204Configuration of Link Layer
205---------------------------
206The Link Layer is implemented as Linux net devices (struct net_device).
207Payload handling and registration is done using standard Linux mechanisms.
208
209The CAIF Protocol relies on a loss-less link layer without implementing
210retransmission. This implies that packet drops must not happen.
211Therefore a flow-control mechanism is implemented where the physical
212interface can initiate flow stop for all CAIF Channels.
diff --git a/Documentation/networking/caif/README b/Documentation/networking/caif/README
new file mode 100644
index 000000000000..757ccfaa1385
--- /dev/null
+++ b/Documentation/networking/caif/README
@@ -0,0 +1,109 @@
1Copyright (C) ST-Ericsson AB 2010
2Author: Sjur Brendeland/ sjur.brandeland@stericsson.com
3License terms: GNU General Public License (GPL) version 2
4---------------------------------------------------------
5
6=== Start ===
7If you have compiled CAIF for modules do:
8
9$modprobe crc_ccitt
10$modprobe caif
11$modprobe caif_socket
12$modprobe chnl_net
13
14
15=== Preparing the setup with a STE modem ===
16
17If you are working on integration of CAIF you should make sure
18that the kernel is built with module support.
19
20There are some things that need to be tweaked to get the host TTY correctly
21set up to talk to the modem.
22Since the CAIF stack is running in the kernel and we want to use the existing
23TTY, we are installing our physical serial driver as a line discipline above
24the TTY device.
25
26To achieve this we need to install the N_CAIF ldisc from user space.
27The benefit is that we can hook up to any TTY.
28
29The use of Start-of-frame-extension (STX) must also be set as
30module parameter "ser_use_stx".
31
32Normally Frame Checksum is always used on UART, but this is also provided as a
33module parameter "ser_use_fcs".
34
35$ modprobe caif_serial ser_ttyname=/dev/ttyS0 ser_use_stx=yes
36$ ifconfig caif_ttyS0 up
37
38PLEASE NOTE: There is a limitation in Android shell.
39 It only accepts one argument to insmod/modprobe!
40
41=== Trouble shooting ===
42
43There are debugfs parameters provided for serial communication.
44/sys/kernel/debug/caif_serial/<tty-name>/
45
46* ser_state: Prints the bit-mask status where
47 - 0x02 means SENDING, this is a transient state.
48 - 0x10 means FLOW_OFF_SENT, i.e. the previous frame has not been sent
49 and is blocking further send operation. Flow OFF has been propagated
50 to all CAIF Channels using this TTY.
51
52* tty_status: Prints the bit-mask tty status information
53 - 0x01 - tty->warned is on.
54 - 0x02 - tty->low_latency is on.
55 - 0x04 - tty->packed is on.
56 - 0x08 - tty->flow_stopped is on.
57 - 0x10 - tty->hw_stopped is on.
58 - 0x20 - tty->stopped is on.
59
60* last_tx_msg: Binary blob Prints the last transmitted frame.
61 This can be printed with
62 $od --format=x1 /sys/kernel/debug/caif_serial/<tty>/last_rx_msg.
63 The first two tx messages sent look like this. Note: The initial
64 byte 02 is start of frame extension (STX) used for re-syncing
65 upon errors.
66
67 - Enumeration:
68 0000000 02 05 00 00 03 01 d2 02
69 | | | | | |
70 STX(1) | | | |
71 Length(2)| | |
72 Control Channel(1)
73 Command:Enumeration(1)
74 Link-ID(1)
75 Checksum(2)
76 - Channel Setup:
77 0000000 02 07 00 00 00 21 a1 00 48 df
78 | | | | | | | |
79 STX(1) | | | | | |
80 Length(2)| | | | |
81 Control Channel(1)
82 Command:Channel Setup(1)
83 Channel Type(1)
84 Priority and Link-ID(1)
85 Endpoint(1)
86 Checksum(2)
87
88* last_rx_msg: Prints the last transmitted frame.
89 The RX messages for LinkSetup look almost identical but they have the
90 bit 0x20 set in the command bit, and Channel Setup has added one byte
91 before Checksum containing Channel ID.
92 NOTE: Several CAIF Messages might be concatenated. The maximum debug
93 buffer size is 128 bytes.
94
95== Error Scenarios:
96- last_tx_msg contains channel setup message and last_rx_msg is empty ->
97 The host seems to be able to send over the UART, at least the CAIF ldisc get
98 notified that sending is completed.
99
100- last_tx_msg contains enumeration message and last_rx_msg is empty ->
101 The host is not able to send the message from UART, the tty has not been
102 able to complete the transmit operation.
103
104- if /sys/kernel/debug/caif_serial/<tty>/tty_status is non-zero there
105 might be problems transmitting over UART.
106 E.g. host and modem wiring is not correct you will typically see
107 tty_status = 0x10 (hw_stopped) and ser_state = 0x10 (FLOW_OFF_SENT).
108 You will probably see the enumeration message in last_tx_message
109 and empty last_rx_message.
diff --git a/Documentation/networking/cxacru-cf.py b/Documentation/networking/cxacru-cf.py
new file mode 100644
index 000000000000..b41d298398c8
--- /dev/null
+++ b/Documentation/networking/cxacru-cf.py
@@ -0,0 +1,48 @@
1#!/usr/bin/env python
2# Copyright 2009 Simon Arlott
3#
4# This program is free software; you can redistribute it and/or modify it
5# under the terms of the GNU General Public License as published by the Free
6# Software Foundation; either version 2 of the License, or (at your option)
7# any later version.
8#
9# This program is distributed in the hope that it will be useful, but WITHOUT
10# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12# more details.
13#
14# You should have received a copy of the GNU General Public License along with
15# this program; if not, write to the Free Software Foundation, Inc., 59
16# Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17#
18# Usage: cxacru-cf.py < cxacru-cf.bin
19# Output: values string suitable for the sysfs adsl_config attribute
20#
21# Warning: cxacru-cf.bin with MD5 hash cdbac2689969d5ed5d4850f117702110
22# contains mis-aligned values which will stop the modem from being able
23# to make a connection. If the first and last two bytes are removed then
24# the values become valid, but the modulation will be forced to ANSI
25# T1.413 only which may not be appropriate.
26#
27# The original binary format is a packed list of le32 values.
28
29import sys
30import struct
31
32i = 0
33while True:
34 buf = sys.stdin.read(4)
35
36 if len(buf) == 0:
37 break
38 elif len(buf) != 4:
39 sys.stdout.write("\n")
40 sys.stderr.write("Error: read {0} not 4 bytes\n".format(len(buf)))
41 sys.exit(1)
42
43 if i > 0:
44 sys.stdout.write(" ")
45 sys.stdout.write("{0:x}={1}".format(i, struct.unpack("<I", buf)[0]))
46 i += 1
47
48sys.stdout.write("\n")
diff --git a/Documentation/networking/cxacru.txt b/Documentation/networking/cxacru.txt
index b074681a963e..2cce04457b4d 100644
--- a/Documentation/networking/cxacru.txt
+++ b/Documentation/networking/cxacru.txt
@@ -4,6 +4,12 @@ While it is capable of managing/maintaining the ADSL connection without the
4module loaded, the device will sometimes stop responding after unloading the 4module loaded, the device will sometimes stop responding after unloading the
5driver and it is necessary to unplug/remove power to the device to fix this. 5driver and it is necessary to unplug/remove power to the device to fix this.
6 6
7Note: support for cxacru-cf.bin has been removed. It was not loaded correctly
8so it had no effect on the device configuration. Fixing it could have stopped
9existing devices working when an invalid configuration is supplied.
10
11There is a script cxacru-cf.py to convert an existing file to the sysfs form.
12
7Detected devices will appear as ATM devices named "cxacru". In /sys/class/atm/ 13Detected devices will appear as ATM devices named "cxacru". In /sys/class/atm/
8these are directories named cxacruN where N is the device number. A symlink 14these are directories named cxacruN where N is the device number. A symlink
9named device points to the USB interface device's directory which contains 15named device points to the USB interface device's directory which contains
@@ -15,6 +21,15 @@ several sysfs attribute files for retrieving device statistics:
15* adsl_headend_environment 21* adsl_headend_environment
16 Information about the remote headend. 22 Information about the remote headend.
17 23
24* adsl_config
25 Configuration writing interface.
26 Write parameters in hexadecimal format <index>=<value>,
27 separated by whitespace, e.g.:
28 "1=0 a=5"
29 Up to 7 parameters at a time will be sent and the modem will restart
30 the ADSL connection when any value is set. These are logged for future
31 reference.
32
18* downstream_attenuation (dB) 33* downstream_attenuation (dB)
19* downstream_bits_per_frame 34* downstream_bits_per_frame
20* downstream_rate (kbps) 35* downstream_rate (kbps)
@@ -61,6 +76,7 @@ several sysfs attribute files for retrieving device statistics:
61* mac_address 76* mac_address
62 77
63* modulation 78* modulation
79 "" (when not connected)
64 "ANSI T1.413" 80 "ANSI T1.413"
65 "ITU-T G.992.1 (G.DMT)" 81 "ITU-T G.992.1 (G.DMT)"
66 "ITU-T G.992.2 (G.LITE)" 82 "ITU-T G.992.2 (G.LITE)"
diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt
index b132e4a3cf0f..a62fdf7a6bff 100644
--- a/Documentation/networking/dccp.txt
+++ b/Documentation/networking/dccp.txt
@@ -58,8 +58,10 @@ DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet
58size (application payload size) in bytes, see RFC 4340, section 14. 58size (application payload size) in bytes, see RFC 4340, section 14.
59 59
60DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs 60DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs
61supported by the endpoint (see include/linux/dccp.h for symbolic constants). 61supported by the endpoint. The option value is an array of type uint8_t whose
62The caller needs to provide a sufficiently large (> 2) array of type uint8_t. 62size is passed as option length. The minimum array size is 4 elements, the
63value returned in the optlen argument always reflects the true number of
64built-in CCIDs.
63 65
64DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same 66DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same
65time, combining the operation of the next two socket options. This option is 67time, combining the operation of the next two socket options. This option is
diff --git a/Documentation/networking/ifenslave.c b/Documentation/networking/ifenslave.c
index 1b96ccda3836..2bac9618c345 100644
--- a/Documentation/networking/ifenslave.c
+++ b/Documentation/networking/ifenslave.c
@@ -756,7 +756,7 @@ static int enslave(char *master_ifname, char *slave_ifname)
756 */ 756 */
757 if (abi_ver < 1) { 757 if (abi_ver < 1) {
758 /* For old ABI, the master needs to be 758 /* For old ABI, the master needs to be
759 * down before setting it's hwaddr 759 * down before setting its hwaddr
760 */ 760 */
761 res = set_if_down(master_ifname, master_flags.ifr_flags); 761 res = set_if_down(master_ifname, master_flags.ifr_flags);
762 if (res) { 762 if (res) {
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e87f3cdc8a6a..d0536b5a4e01 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -487,6 +487,30 @@ tcp_dma_copybreak - INTEGER
487 and CONFIG_NET_DMA is enabled. 487 and CONFIG_NET_DMA is enabled.
488 Default: 4096 488 Default: 4096
489 489
490tcp_thin_linear_timeouts - BOOLEAN
491 Enable dynamic triggering of linear timeouts for thin streams.
492 If set, a check is performed upon retransmission by timeout to
493 determine if the stream is thin (less than 4 packets in flight).
494 As long as the stream is found to be thin, up to 6 linear
495 timeouts may be performed before exponential backoff mode is
496 initiated. This improves retransmission latency for
497 non-aggressive thin streams, often found to be time-dependent.
498 For more information on thin streams, see
499 Documentation/networking/tcp-thin.txt
500 Default: 0
501
502tcp_thin_dupack - BOOLEAN
503 Enable dynamic triggering of retransmissions after one dupACK
504 for thin streams. If set, a check is performed upon reception
505 of a dupACK to determine if the stream is thin (less than 4
506 packets in flight). As long as the stream is found to be thin,
507 data is retransmitted on the first received dupACK. This
508 improves retransmission latency for non-aggressive thin
509 streams, often found to be time-dependent.
510 For more information on thin streams, see
511 Documentation/networking/tcp-thin.txt
512 Default: 0
513
490UDP variables: 514UDP variables:
491 515
492udp_mem - vector of 3 INTEGERs: min, pressure, max 516udp_mem - vector of 3 INTEGERs: min, pressure, max
@@ -564,6 +588,37 @@ ip_local_port_range - 2 INTEGERS
564 (i.e. by default) range 1024-4999 is enough to issue up to 588 (i.e. by default) range 1024-4999 is enough to issue up to
565 2000 connections per second to systems supporting timestamps. 589 2000 connections per second to systems supporting timestamps.
566 590
591ip_local_reserved_ports - list of comma separated ranges
592 Specify the ports which are reserved for known third-party
593 applications. These ports will not be used by automatic port
594 assignments (e.g. when calling connect() or bind() with port
595 number 0). Explicit port allocation behavior is unchanged.
596
597 The format used for both input and output is a comma separated
598 list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and
599 10). Writing to the file will clear all previously reserved
600 ports and update the current list with the one given in the
601 input.
602
603 Note that ip_local_port_range and ip_local_reserved_ports
604 settings are independent and both are considered by the kernel
605 when determining which ports are available for automatic port
606 assignments.
607
608 You can reserve ports which are not in the current
609 ip_local_port_range, e.g.:
610
611 $ cat /proc/sys/net/ipv4/ip_local_port_range
612 32000 61000
613 $ cat /proc/sys/net/ipv4/ip_local_reserved_ports
614 8080,9148
615
616 although this is redundant. However such a setting is useful
617 if later the port range is changed to a value that will
618 include the reserved ports.
619
620 Default: Empty
621
567ip_nonlocal_bind - BOOLEAN 622ip_nonlocal_bind - BOOLEAN
568 If set, allows processes to bind() to non-local IP addresses, 623 If set, allows processes to bind() to non-local IP addresses,
569 which can be quite useful - but may break some applications. 624 which can be quite useful - but may break some applications.
@@ -692,6 +747,25 @@ proxy_arp - BOOLEAN
692 conf/{all,interface}/proxy_arp is set to TRUE, 747 conf/{all,interface}/proxy_arp is set to TRUE,
693 it will be disabled otherwise 748 it will be disabled otherwise
694 749
750proxy_arp_pvlan - BOOLEAN
751 Private VLAN proxy arp.
752 Basically allow proxy arp replies back to the same interface
753 (from which the ARP request/solicitation was received).
754
755 This is done to support (ethernet) switch features, like RFC
756 3069, where the individual ports are NOT allowed to
757 communicate with each other, but they are allowed to talk to
758 the upstream router. As described in RFC 3069, it is possible
759 to allow these hosts to communicate through the upstream
760 router by proxy_arp'ing. Don't need to be used together with
761 proxy_arp.
762
763 This technology is known by different names:
764 In RFC 3069 it is called VLAN Aggregation.
765 Cisco and Allied Telesyn call it Private VLAN.
766 Hewlett-Packard call it Source-Port filtering or port-isolation.
767 Ericsson call it MAC-Forced Forwarding (RFC Draft).
768
695shared_media - BOOLEAN 769shared_media - BOOLEAN
696 Send(router) or accept(host) RFC1620 shared media redirects. 770 Send(router) or accept(host) RFC1620 shared media redirects.
697 Overrides ip_secure_redirects. 771 Overrides ip_secure_redirects.
@@ -833,9 +907,18 @@ arp_notify - BOOLEAN
833 or hardware address changes. 907 or hardware address changes.
834 908
835arp_accept - BOOLEAN 909arp_accept - BOOLEAN
836 Define behavior when gratuitous arp replies are received: 910 Define behavior for gratuitous ARP frames who's IP is not
837 0 - drop gratuitous arp frames 911 already present in the ARP table:
838 1 - accept gratuitous arp frames 912 0 - don't create new entries in the ARP table
913 1 - create new entries in the ARP table
914
915 Both replies and requests type gratuitous arp will trigger the
916 ARP table to be updated, if this setting is on.
917
918 If the ARP table already contains the IP address of the
919 gratuitous arp frame, the arp table will be updated regardless
920 if this setting is on or off.
921
839 922
840app_solicit - INTEGER 923app_solicit - INTEGER
841 The maximum number of probes to send to the user space ARP daemon 924 The maximum number of probes to send to the user space ARP daemon
diff --git a/Documentation/networking/ixgbevf.txt b/Documentation/networking/ixgbevf.txt
new file mode 100755
index 000000000000..19015de6725f
--- /dev/null
+++ b/Documentation/networking/ixgbevf.txt
@@ -0,0 +1,90 @@
1Linux* Base Driver for Intel(R) Network Connection
2==================================================
3
4November 24, 2009
5
6Contents
7========
8
9- In This Release
10- Identifying Your Adapter
11- Known Issues/Troubleshooting
12- Support
13
14In This Release
15===============
16
17This file describes the ixgbevf Linux* Base Driver for Intel Network
18Connection.
19
20The ixgbevf driver supports 82599-based virtual function devices that can only
21be activated on kernels with CONFIG_PCI_IOV enabled.
22
23The ixgbevf driver supports virtual functions generated by the ixgbe driver
24with a max_vfs value of 1 or greater.
25
26The guest OS loading the ixgbevf driver must support MSI-X interrupts.
27
28VLANs: There is a limit of a total of 32 shared VLANs to 1 or more VFs.
29
30Identifying Your Adapter
31========================
32
33For more information on how to identify your adapter, go to the Adapter &
34Driver ID Guide at:
35
36 http://support.intel.com/support/network/sb/CS-008441.htm
37
38Known Issues/Troubleshooting
39============================
40
41 Unloading Physical Function (PF) Driver Causes System Reboots When VM is
42 Running and VF is Loaded on the VM
43 ------------------------------------------------------------------------
44 Do not unload the PF driver (ixgbe) while VFs are assigned to guests.
45
46Support
47=======
48
49For general information, go to the Intel support website at:
50
51 http://support.intel.com
52
53or the Intel Wired Networking project hosted by Sourceforge at:
54
55 http://sourceforge.net/projects/e1000
56
57If an issue is identified with the released source code on the supported
58kernel with a supported adapter, email the specific information related
59to the issue to e1000-devel@lists.sf.net
60
61License
62=======
63
64Intel 10 Gigabit Linux driver.
65Copyright(c) 1999 - 2009 Intel Corporation.
66
67This program is free software; you can redistribute it and/or modify it
68under the terms and conditions of the GNU General Public License,
69version 2, as published by the Free Software Foundation.
70
71This program is distributed in the hope it will be useful, but WITHOUT
72ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
73FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
74more details.
75
76You should have received a copy of the GNU General Public License along with
77this program; if not, write to the Free Software Foundation, Inc.,
7851 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
79
80The full GNU General Public License is included in this distribution in
81the file called "COPYING".
82
83Trademarks
84==========
85
86Intel, Itanium, and Pentium are trademarks or registered trademarks of
87Intel Corporation or its subsidiaries in the United States and other
88countries.
89
90* Other names and brands may be claimed as the property of others.
diff --git a/Documentation/networking/l2tp.txt b/Documentation/networking/l2tp.txt
index 63214b280e00..e7bf3979facb 100644
--- a/Documentation/networking/l2tp.txt
+++ b/Documentation/networking/l2tp.txt
@@ -1,44 +1,95 @@
1This brief document describes how to use the kernel's PPPoL2TP driver 1This document describes how to use the kernel's L2TP drivers to
2to provide L2TP functionality. L2TP is a protocol that tunnels one or 2provide L2TP functionality. L2TP is a protocol that tunnels one or
3more PPP sessions over a UDP tunnel. It is commonly used for VPNs 3more sessions over an IP tunnel. It is commonly used for VPNs
4(L2TP/IPSec) and by ISPs to tunnel subscriber PPP sessions over an IP 4(L2TP/IPSec) and by ISPs to tunnel subscriber PPP sessions over an IP
5network infrastructure. 5network infrastructure. With L2TPv3, it is also useful as a Layer-2
6tunneling infrastructure.
7
8Features
9========
10
11L2TPv2 (PPP over L2TP (UDP tunnels)).
12L2TPv3 ethernet pseudowires.
13L2TPv3 PPP pseudowires.
14L2TPv3 IP encapsulation.
15Netlink sockets for L2TPv3 configuration management.
16
17History
18=======
19
20The original pppol2tp driver was introduced in 2.6.23 and provided
21L2TPv2 functionality (rfc2661). L2TPv2 is used to tunnel one or more PPP
22sessions over a UDP tunnel.
23
24L2TPv3 (rfc3931) changes the protocol to allow different frame types
25to be passed over an L2TP tunnel by moving the PPP-specific parts of
26the protocol out of the core L2TP packet headers. Each frame type is
27known as a pseudowire type. Ethernet, PPP, HDLC, Frame Relay and ATM
28pseudowires for L2TP are defined in separate RFC standards. Another
29change for L2TPv3 is that it can be carried directly over IP with no
30UDP header (UDP is optional). It is also possible to create static
31unmanaged L2TPv3 tunnels manually without a control protocol
32(userspace daemon) to manage them.
33
34To support L2TPv3, the original pppol2tp driver was split up to
35separate the L2TP and PPP functionality. Existing L2TPv2 userspace
36apps should be unaffected as the original pppol2tp sockets API is
37retained. L2TPv3, however, uses netlink to manage L2TPv3 tunnels and
38sessions.
6 39
7Design 40Design
8====== 41======
9 42
10The PPPoL2TP driver, drivers/net/pppol2tp.c, provides a mechanism by 43The L2TP protocol separates control and data frames. The L2TP kernel
11which PPP frames carried through an L2TP session are passed through 44drivers handle only L2TP data frames; control frames are always
12the kernel's PPP subsystem. The standard PPP daemon, pppd, handles all 45handled by userspace. L2TP control frames carry messages between L2TP
13PPP interaction with the peer. PPP network interfaces are created for 46clients/servers and are used to setup / teardown tunnels and
14each local PPP endpoint. 47sessions. An L2TP client or server is implemented in userspace.
15 48
16The L2TP protocol http://www.faqs.org/rfcs/rfc2661.html defines L2TP 49Each L2TP tunnel is implemented using a UDP or L2TPIP socket; L2TPIP
17control and data frames. L2TP control frames carry messages between 50provides L2TPv3 IP encapsulation (no UDP) and is implemented using a
18L2TP clients/servers and are used to setup / teardown tunnels and 51new l2tpip socket family. The tunnel socket is typically created by
19sessions. An L2TP client or server is implemented in userspace and 52userspace, though for unmanaged L2TPv3 tunnels, the socket can also be
20will use a regular UDP socket per tunnel. L2TP data frames carry PPP 53created by the kernel. Each L2TP session (pseudowire) gets a network
21frames, which may be PPP control or PPP data. The kernel's PPP 54interface instance. In the case of PPP, these interfaces are created
55indirectly by pppd using a pppol2tp socket. In the case of ethernet,
56the netdevice is created upon a netlink request to create an L2TPv3
57ethernet pseudowire.
58
59For PPP, the PPPoL2TP driver, net/l2tp/l2tp_ppp.c, provides a
60mechanism by which PPP frames carried through an L2TP session are
61passed through the kernel's PPP subsystem. The standard PPP daemon,
62pppd, handles all PPP interaction with the peer. PPP network
63interfaces are created for each local PPP endpoint. The kernel's PPP
22subsystem arranges for PPP control frames to be delivered to pppd, 64subsystem arranges for PPP control frames to be delivered to pppd,
23while data frames are forwarded as usual. 65while data frames are forwarded as usual.
24 66
67For ethernet, the L2TPETH driver, net/l2tp/l2tp_eth.c, implements a
68netdevice driver, managing virtual ethernet devices, one per
69pseudowire. These interfaces can be managed using standard Linux tools
70such as "ip" and "ifconfig". If only IP frames are passed over the
71tunnel, the interface can be given an IP addresses of itself and its
72peer. If non-IP frames are to be passed over the tunnel, the interface
73can be added to a bridge using brctl. All L2TP datapath protocol
74functions are handled by the L2TP core driver.
75
25Each tunnel and session within a tunnel is assigned a unique tunnel_id 76Each tunnel and session within a tunnel is assigned a unique tunnel_id
26and session_id. These ids are carried in the L2TP header of every 77and session_id. These ids are carried in the L2TP header of every
27control and data packet. The pppol2tp driver uses them to lookup 78control and data packet. (Actually, in L2TPv3, the tunnel_id isn't
28internal tunnel and/or session contexts. Zero tunnel / session ids are 79present in data frames - it is inferred from the IP connection on
29treated specially - zero ids are never assigned to tunnels or sessions 80which the packet was received.) The L2TP driver uses the ids to lookup
30in the network. In the driver, the tunnel context keeps a pointer to 81internal tunnel and/or session contexts to determine how to handle the
31the tunnel UDP socket. The session context keeps a pointer to the 82packet. Zero tunnel / session ids are treated specially - zero ids are
32PPPoL2TP socket, as well as other data that lets the driver interface 83never assigned to tunnels or sessions in the network. In the driver,
33to the kernel PPP subsystem. 84the tunnel context keeps a reference to the tunnel UDP or L2TPIP
34 85socket. The session context holds data that lets the driver interface
35Note that the pppol2tp kernel driver handles only L2TP data frames; 86to the kernel's network frame type subsystems, i.e. PPP, ethernet.
36L2TP control frames are simply passed up to userspace in the UDP 87
37tunnel socket. The kernel handles all datapath aspects of the 88Userspace Programming
38protocol, including data packet resequencing (if enabled). 89=====================
39 90
40There are a number of requirements on the userspace L2TP daemon in 91For L2TPv2, there are a number of requirements on the userspace L2TP
41order to use the pppol2tp driver. 92daemon in order to use the pppol2tp driver.
42 93
431. Use a UDP socket per tunnel. 941. Use a UDP socket per tunnel.
44 95
@@ -86,6 +137,35 @@ In addition to the standard PPP ioctls, a PPPIOCGL2TPSTATS is provided
86to retrieve tunnel and session statistics from the kernel using the 137to retrieve tunnel and session statistics from the kernel using the
87PPPoX socket of the appropriate tunnel or session. 138PPPoX socket of the appropriate tunnel or session.
88 139
140For L2TPv3, userspace must use the netlink API defined in
141include/linux/l2tp.h to manage tunnel and session contexts. The
142general procedure to create a new L2TP tunnel with one session is:-
143
1441. Open a GENL socket using L2TP_GENL_NAME for configuring the kernel
145 using netlink.
146
1472. Create a UDP or L2TPIP socket for the tunnel.
148
1493. Create a new L2TP tunnel using a L2TP_CMD_TUNNEL_CREATE
150 request. Set attributes according to desired tunnel parameters,
151 referencing the UDP or L2TPIP socket created in the previous step.
152
1534. Create a new L2TP session in the tunnel using a
154 L2TP_CMD_SESSION_CREATE request.
155
156The tunnel and all of its sessions are closed when the tunnel socket
157is closed. The netlink API may also be used to delete sessions and
158tunnels. Configuration and status info may be set or read using netlink.
159
160The L2TP driver also supports static (unmanaged) L2TPv3 tunnels. These
161are where there is no L2TP control message exchange with the peer to
162setup the tunnel; the tunnel is configured manually at each end of the
163tunnel. There is no need for an L2TP userspace application in this
164case -- the tunnel socket is created by the kernel and configured
165using parameters sent in the L2TP_CMD_TUNNEL_CREATE netlink
166request. The "ip" utility of iproute2 has commands for managing static
167L2TPv3 tunnels; do "ip l2tp help" for more information.
168
89Debugging 169Debugging
90========= 170=========
91 171
@@ -102,6 +182,69 @@ PPPOL2TP_MSG_CONTROL userspace - kernel interface
102PPPOL2TP_MSG_SEQ sequence numbers handling 182PPPOL2TP_MSG_SEQ sequence numbers handling
103PPPOL2TP_MSG_DATA data packets 183PPPOL2TP_MSG_DATA data packets
104 184
185If enabled, files under a l2tp debugfs directory can be used to dump
186kernel state about L2TP tunnels and sessions. To access it, the
187debugfs filesystem must first be mounted.
188
189# mount -t debugfs debugfs /debug
190
191Files under the l2tp directory can then be accessed.
192
193# cat /debug/l2tp/tunnels
194
195The debugfs files should not be used by applications to obtain L2TP
196state information because the file format is subject to change. It is
197implemented to provide extra debug information to help diagnose
198problems.) Users should use the netlink API.
199
200/proc/net/pppol2tp is also provided for backwards compaibility with
201the original pppol2tp driver. It lists information about L2TPv2
202tunnels and sessions only. Its use is discouraged.
203
204Unmanaged L2TPv3 Tunnels
205========================
206
207Some commercial L2TP products support unmanaged L2TPv3 ethernet
208tunnels, where there is no L2TP control protocol; tunnels are
209configured at each side manually. New commands are available in
210iproute2's ip utility to support this.
211
212To create an L2TPv3 ethernet pseudowire between local host 192.168.1.1
213and peer 192.168.1.2, using IP addresses 10.5.1.1 and 10.5.1.2 for the
214tunnel endpoints:-
215
216# modprobe l2tp_eth
217# modprobe l2tp_netlink
218
219# ip l2tp add tunnel tunnel_id 1 peer_tunnel_id 1 udp_sport 5000 \
220 udp_dport 5000 encap udp local 192.168.1.1 remote 192.168.1.2
221# ip l2tp add session tunnel_id 1 session_id 1 peer_session_id 1
222# ifconfig -a
223# ip addr add 10.5.1.2/32 peer 10.5.1.1/32 dev l2tpeth0
224# ifconfig l2tpeth0 up
225
226Choose IP addresses to be the address of a local IP interface and that
227of the remote system. The IP addresses of the l2tpeth0 interface can be
228anything suitable.
229
230Repeat the above at the peer, with ports, tunnel/session ids and IP
231addresses reversed. The tunnel and session IDs can be any non-zero
23232-bit number, but the values must be reversed at the peer.
233
234Host 1 Host2
235udp_sport=5000 udp_sport=5001
236udp_dport=5001 udp_dport=5000
237tunnel_id=42 tunnel_id=45
238peer_tunnel_id=45 peer_tunnel_id=42
239session_id=128 session_id=5196755
240peer_session_id=5196755 peer_session_id=128
241
242When done at both ends of the tunnel, it should be possible to send
243data over the network. e.g.
244
245# ping 10.5.1.1
246
247
105Sample Userspace Code 248Sample Userspace Code
106===================== 249=====================
107 250
@@ -158,12 +301,48 @@ Sample Userspace Code
158 } 301 }
159 return 0; 302 return 0;
160 303
304Internal Implementation
305=======================
306
307The driver keeps a struct l2tp_tunnel context per L2TP tunnel and a
308struct l2tp_session context for each session. The l2tp_tunnel is
309always associated with a UDP or L2TP/IP socket and keeps a list of
310sessions in the tunnel. The l2tp_session context keeps kernel state
311about the session. It has private data which is used for data specific
312to the session type. With L2TPv2, the session always carried PPP
313traffic. With L2TPv3, the session can also carry ethernet frames
314(ethernet pseudowire) or other data types such as ATM, HDLC or Frame
315Relay.
316
317When a tunnel is first opened, the reference count on the socket is
318increased using sock_hold(). This ensures that the kernel socket
319cannot be removed while L2TP's data structures reference it.
320
321Some L2TP sessions also have a socket (PPP pseudowires) while others
322do not (ethernet pseudowires). We can't use the socket reference count
323as the reference count for session contexts. The L2TP implementation
324therefore has its own internal reference counts on the session
325contexts.
326
327To Do
328=====
329
330Add L2TP tunnel switching support. This would route tunneled traffic
331from one L2TP tunnel into another. Specified in
332http://tools.ietf.org/html/draft-ietf-l2tpext-tunnel-switching-08
333
334Add L2TPv3 VLAN pseudowire support.
335
336Add L2TPv3 IP pseudowire support.
337
338Add L2TPv3 ATM pseudowire support.
339
161Miscellaneous 340Miscellaneous
162============ 341=============
163 342
164The PPPoL2TP driver was developed as part of the OpenL2TP project by 343The L2TP drivers were developed as part of the OpenL2TP project by
165Katalix Systems Ltd. OpenL2TP is a full-featured L2TP client / server, 344Katalix Systems Ltd. OpenL2TP is a full-featured L2TP client / server,
166designed from the ground up to have the L2TP datapath in the 345designed from the ground up to have the L2TP datapath in the
167kernel. The project also implemented the pppol2tp plugin for pppd 346kernel. The project also implemented the pppol2tp plugin for pppd
168which allows pppd to use the kernel driver. Details can be found at 347which allows pppd to use the kernel driver. Details can be found at
169http://openl2tp.sourceforge.net. 348http://www.openl2tp.org.
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index a22fd85e3796..98f71a5cef00 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -2,7 +2,7 @@
2+ ABSTRACT 2+ ABSTRACT
3-------------------------------------------------------------------------------- 3--------------------------------------------------------------------------------
4 4
5This file documents the CONFIG_PACKET_MMAP option available with the PACKET 5This file documents the mmap() facility available with the PACKET
6socket interface on 2.4 and 2.6 kernels. This type of sockets is used for 6socket interface on 2.4 and 2.6 kernels. This type of sockets is used for
7capture network traffic with utilities like tcpdump or any other that needs 7capture network traffic with utilities like tcpdump or any other that needs
8raw access to network interface. 8raw access to network interface.
@@ -44,7 +44,7 @@ enabled. For transmission, check the MTU (Maximum Transmission Unit) used and
44supported by devices of your network. 44supported by devices of your network.
45 45
46-------------------------------------------------------------------------------- 46--------------------------------------------------------------------------------
47+ How to use CONFIG_PACKET_MMAP to improve capture process 47+ How to use mmap() to improve capture process
48-------------------------------------------------------------------------------- 48--------------------------------------------------------------------------------
49 49
50From the user standpoint, you should use the higher level libpcap library, which 50From the user standpoint, you should use the higher level libpcap library, which
@@ -64,7 +64,7 @@ the low level details or want to improve libpcap by including PACKET_MMAP
64support. 64support.
65 65
66-------------------------------------------------------------------------------- 66--------------------------------------------------------------------------------
67+ How to use CONFIG_PACKET_MMAP directly to improve capture process 67+ How to use mmap() directly to improve capture process
68-------------------------------------------------------------------------------- 68--------------------------------------------------------------------------------
69 69
70From the system calls stand point, the use of PACKET_MMAP involves 70From the system calls stand point, the use of PACKET_MMAP involves
@@ -100,12 +100,12 @@ by the kernel.
100The destruction of the socket and all associated resources 100The destruction of the socket and all associated resources
101is done by a simple call to close(fd). 101is done by a simple call to close(fd).
102 102
103Next I will describe PACKET_MMAP settings and it's constraints, 103Next I will describe PACKET_MMAP settings and its constraints,
104also the mapping of the circular buffer in the user process and 104also the mapping of the circular buffer in the user process and
105the use of this buffer. 105the use of this buffer.
106 106
107-------------------------------------------------------------------------------- 107--------------------------------------------------------------------------------
108+ How to use CONFIG_PACKET_MMAP directly to improve transmission process 108+ How to use mmap() directly to improve transmission process
109-------------------------------------------------------------------------------- 109--------------------------------------------------------------------------------
110Transmission process is similar to capture as shown below. 110Transmission process is similar to capture as shown below.
111 111
@@ -432,7 +432,7 @@ TP_STATUS_LOSING : indicates there were packet drops from last time
432 the PACKET_STATISTICS option. 432 the PACKET_STATISTICS option.
433 433
434TP_STATUS_CSUMNOTREADY: currently it's used for outgoing IP packets which 434TP_STATUS_CSUMNOTREADY: currently it's used for outgoing IP packets which
435 it's checksum will be done in hardware. So while 435 its checksum will be done in hardware. So while
436 reading the packet we should not try to check the 436 reading the packet we should not try to check the
437 checksum. 437 checksum.
438 438
diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt
index ee31369e9e5b..9551622d0a7b 100644
--- a/Documentation/networking/regulatory.txt
+++ b/Documentation/networking/regulatory.txt
@@ -188,3 +188,27 @@ Then in some part of your code after your wiphy has been registered:
188 &mydriver_jp_regdom.reg_rules[i], 188 &mydriver_jp_regdom.reg_rules[i],
189 sizeof(struct ieee80211_reg_rule)); 189 sizeof(struct ieee80211_reg_rule));
190 regulatory_struct_hint(rd); 190 regulatory_struct_hint(rd);
191
192Statically compiled regulatory database
193---------------------------------------
194
195In most situations the userland solution using CRDA as described
196above is the preferred solution. However in some cases a set of
197rules built into the kernel itself may be desirable. To account
198for this situation, a configuration option has been provided
199(i.e. CONFIG_CFG80211_INTERNAL_REGDB). With this option enabled,
200the wireless database information contained in net/wireless/db.txt is
201used to generate a data structure encoded in net/wireless/regdb.c.
202That option also enables code in net/wireless/reg.c which queries
203the data in regdb.c as an alternative to using CRDA.
204
205The file net/wireless/db.txt should be kept up-to-date with the db.txt
206file available in the git repository here:
207
208 git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-regdb.git
209
210Again, most users in most situations should be using the CRDA package
211provided with their distribution, and in most other situations users
212should be building and using CRDA on their own rather than using
213this option. If you are not absolutely sure that you should be using
214CONFIG_CFG80211_INTERNAL_REGDB then _DO_NOT_USE_IT_.
diff --git a/Documentation/networking/skfp.txt b/Documentation/networking/skfp.txt
index abfddf81e34a..203ec66c9fb4 100644
--- a/Documentation/networking/skfp.txt
+++ b/Documentation/networking/skfp.txt
@@ -68,7 +68,7 @@ Compaq adapters (not tested):
68======================= 68=======================
69 69
70From v2.01 on, the driver is integrated in the linux kernel sources. 70From v2.01 on, the driver is integrated in the linux kernel sources.
71Therefor, the installation is the same as for any other adapter 71Therefore, the installation is the same as for any other adapter
72supported by the kernel. 72supported by the kernel.
73Refer to the manual of your distribution about the installation 73Refer to the manual of your distribution about the installation
74of network adapters. 74of network adapters.
diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
new file mode 100644
index 000000000000..7ee770b5ef5f
--- /dev/null
+++ b/Documentation/networking/stmmac.txt
@@ -0,0 +1,143 @@
1 STMicroelectronics 10/100/1000 Synopsys Ethernet driver
2
3Copyright (C) 2007-2010 STMicroelectronics Ltd
4Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
5
6This is the driver for the MAC 10/100/1000 on-chip Ethernet controllers
7(Synopsys IP blocks); it has been fully tested on STLinux platforms.
8
9Currently this network device driver is for all STM embedded MAC/GMAC
10(7xxx SoCs).
11
12DWC Ether MAC 10/100/1000 Universal version 3.41a and DWC Ether MAC 10/100
13Universal version 4.0 have been used for developing the first code
14implementation.
15
16Please, for more information also visit: www.stlinux.com
17
181) Kernel Configuration
19The kernel configuration option is STMMAC_ETH:
20 Device Drivers ---> Network device support ---> Ethernet (1000 Mbit) --->
21 STMicroelectronics 10/100/1000 Ethernet driver (STMMAC_ETH)
22
232) Driver parameters list:
24 debug: message level (0: no output, 16: all);
25 phyaddr: to manually provide the physical address to the PHY device;
26 dma_rxsize: DMA rx ring size;
27 dma_txsize: DMA tx ring size;
28 buf_sz: DMA buffer size;
29 tc: control the HW FIFO threshold;
30 tx_coe: Enable/Disable Tx Checksum Offload engine;
31 watchdog: transmit timeout (in milliseconds);
32 flow_ctrl: Flow control ability [on/off];
33 pause: Flow Control Pause Time;
34 tmrate: timer period (only if timer optimisation is configured).
35
363) Command line options
37Driver parameters can be also passed in command line by using:
38 stmmaceth=dma_rxsize:128,dma_txsize:512
39
404) Driver information and notes
41
424.1) Transmit process
43The xmit method is invoked when the kernel needs to transmit a packet; it sets
44the descriptors in the ring and informs the DMA engine that there is a packet
45ready to be transmitted.
46Once the controller has finished transmitting the packet, an interrupt is
47triggered; So the driver will be able to release the socket buffers.
48By default, the driver sets the NETIF_F_SG bit in the features field of the
49net_device structure enabling the scatter/gather feature.
50
514.2) Receive process
52When one or more packets are received, an interrupt happens. The interrupts
53are not queued so the driver has to scan all the descriptors in the ring during
54the receive process.
55This is based on NAPI so the interrupt handler signals only if there is work to be
56done, and it exits.
57Then the poll method will be scheduled at some future point.
58The incoming packets are stored, by the DMA, in a list of pre-allocated socket
59buffers in order to avoid the memcpy (Zero-copy).
60
614.3) Timer-Driver Interrupt
62Instead of having the device that asynchronously notifies the frame receptions, the
63driver configures a timer to generate an interrupt at regular intervals.
64Based on the granularity of the timer, the frames that are received by the device
65will experience different levels of latency. Some NICs have dedicated timer
66device to perform this task. STMMAC can use either the RTC device or the TMU
67channel 2 on STLinux platforms.
68The timers frequency can be passed to the driver as parameter; when change it,
69take care of both hardware capability and network stability/performance impact.
70Several performance tests on STM platforms showed this optimisation allows to spare
71the CPU while having the maximum throughput.
72
734.4) WOL
74Wake up on Lan feature through Magic Frame is only supported for the GMAC
75core.
76
774.5) DMA descriptors
78Driver handles both normal and enhanced descriptors. The latter has been only
79tested on DWC Ether MAC 10/100/1000 Universal version 3.41a.
80
814.6) Ethtool support
82Ethtool is supported. Driver statistics and internal errors can be taken using:
83ethtool -S ethX command. It is possible to dump registers etc.
84
854.7) Jumbo and Segmentation Offloading
86Jumbo frames are supported and tested for the GMAC.
87The GSO has been also added but it's performed in software.
88LRO is not supported.
89
904.8) Physical
91The driver is compatible with PAL to work with PHY and GPHY devices.
92
934.9) Platform information
94Several information came from the platform; please refer to the
95driver's Header file in include/linux directory.
96
97struct plat_stmmacenet_data {
98 int bus_id;
99 int pbl;
100 int has_gmac;
101 void (*fix_mac_speed)(void *priv, unsigned int speed);
102 void (*bus_setup)(unsigned long ioaddr);
103#ifdef CONFIG_STM_DRIVERS
104 struct stm_pad_config *pad_config;
105#endif
106 void *bsp_priv;
107};
108
109Where:
110- pbl (Programmable Burst Length) is maximum number of
111 beats to be transferred in one DMA transaction.
112 GMAC also enables the 4xPBL by default.
113- fix_mac_speed and bus_setup are used to configure internal target
114 registers (on STM platforms);
115- has_gmac: GMAC core is on board (get it at run-time in the next step);
116- bus_id: bus identifier.
117
118struct plat_stmmacphy_data {
119 int bus_id;
120 int phy_addr;
121 unsigned int phy_mask;
122 int interface;
123 int (*phy_reset)(void *priv);
124 void *priv;
125};
126
127Where:
128- bus_id: bus identifier;
129- phy_addr: physical address used for the attached phy device;
130 set it to -1 to get it at run-time;
131- interface: physical MII interface mode;
132- phy_reset: hook to reset HW function.
133
134TODO:
135- Continue to make the driver more generic and suitable for other Synopsys
136 Ethernet controllers used on other architectures (i.e. ARM).
137- 10G controllers are not supported.
138- MAC uses Normal descriptors and GMAC uses enhanced ones.
139 This is a limit that should be reviewed. MAC could want to
140 use the enhanced structure.
141- Checksumming: Rx/Tx csum is done in HW in case of GMAC only.
142- Review the timer optimisation code to use an embedded device that seems to be
143 available in new chip generations.
diff --git a/Documentation/networking/tcp-thin.txt b/Documentation/networking/tcp-thin.txt
new file mode 100644
index 000000000000..151e229980f1
--- /dev/null
+++ b/Documentation/networking/tcp-thin.txt
@@ -0,0 +1,47 @@
1Thin-streams and TCP
2====================
3A wide range of Internet-based services that use reliable transport
4protocols display what we call thin-stream properties. This means
5that the application sends data with such a low rate that the
6retransmission mechanisms of the transport protocol are not fully
7effective. In time-dependent scenarios (like online games, control
8systems, stock trading etc.) where the user experience depends
9on the data delivery latency, packet loss can be devastating for
10the service quality. Extreme latencies are caused by TCP's
11dependency on the arrival of new data from the application to trigger
12retransmissions effectively through fast retransmit instead of
13waiting for long timeouts.
14
15After analysing a large number of time-dependent interactive
16applications, we have seen that they often produce thin streams
17and also stay with this traffic pattern throughout its entire
18lifespan. The combination of time-dependency and the fact that the
19streams provoke high latencies when using TCP is unfortunate.
20
21In order to reduce application-layer latency when packets are lost,
22a set of mechanisms has been made, which address these latency issues
23for thin streams. In short, if the kernel detects a thin stream,
24the retransmission mechanisms are modified in the following manner:
25
261) If the stream is thin, fast retransmit on the first dupACK.
272) If the stream is thin, do not apply exponential backoff.
28
29These enhancements are applied only if the stream is detected as
30thin. This is accomplished by defining a threshold for the number
31of packets in flight. If there are less than 4 packets in flight,
32fast retransmissions can not be triggered, and the stream is prone
33to experience high retransmission latencies.
34
35Since these mechanisms are targeted at time-dependent applications,
36they must be specifically activated by the application using the
37TCP_THIN_LINEAR_TIMEOUTS and TCP_THIN_DUPACK IOCTLS or the
38tcp_thin_linear_timeouts and tcp_thin_dupack sysctls. Both
39modifications are turned off by default.
40
41References
42==========
43More information on the modifications, as well as a wide range of
44experimental data can be found here:
45"Improving latency for interactive, thin-stream applications over
46reliable transport"
47http://simula.no/research/nd/publications/Simula.nd.477/simula_pdf_file
diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt
index 0e58b4539176..e8c8f4f06c67 100644
--- a/Documentation/networking/timestamping.txt
+++ b/Documentation/networking/timestamping.txt
@@ -41,11 +41,12 @@ SOF_TIMESTAMPING_SOFTWARE: return system time stamp generated in
41SOF_TIMESTAMPING_TX/RX determine how time stamps are generated. 41SOF_TIMESTAMPING_TX/RX determine how time stamps are generated.
42SOF_TIMESTAMPING_RAW/SYS determine how they are reported in the 42SOF_TIMESTAMPING_RAW/SYS determine how they are reported in the
43following control message: 43following control message:
44 struct scm_timestamping { 44
45 struct timespec systime; 45struct scm_timestamping {
46 struct timespec hwtimetrans; 46 struct timespec systime;
47 struct timespec hwtimeraw; 47 struct timespec hwtimetrans;
48 }; 48 struct timespec hwtimeraw;
49};
49 50
50recvmsg() can be used to get this control message for regular incoming 51recvmsg() can be used to get this control message for regular incoming
51packets. For send time stamps the outgoing packet is looped back to 52packets. For send time stamps the outgoing packet is looped back to
@@ -87,12 +88,13 @@ by the network device and will be empty without that support.
87SIOCSHWTSTAMP: 88SIOCSHWTSTAMP:
88 89
89Hardware time stamping must also be initialized for each device driver 90Hardware time stamping must also be initialized for each device driver
90that is expected to do hardware time stamping. The parameter is: 91that is expected to do hardware time stamping. The parameter is defined in
92/include/linux/net_tstamp.h as:
91 93
92struct hwtstamp_config { 94struct hwtstamp_config {
93 int flags; /* no flags defined right now, must be zero */ 95 int flags; /* no flags defined right now, must be zero */
94 int tx_type; /* HWTSTAMP_TX_* */ 96 int tx_type; /* HWTSTAMP_TX_* */
95 int rx_filter; /* HWTSTAMP_FILTER_* */ 97 int rx_filter; /* HWTSTAMP_FILTER_* */
96}; 98};
97 99
98Desired behavior is passed into the kernel and to a specific device by 100Desired behavior is passed into the kernel and to a specific device by
@@ -139,42 +141,56 @@ enum {
139 /* time stamp any incoming packet */ 141 /* time stamp any incoming packet */
140 HWTSTAMP_FILTER_ALL, 142 HWTSTAMP_FILTER_ALL,
141 143
142 /* return value: time stamp all packets requested plus some others */ 144 /* return value: time stamp all packets requested plus some others */
143 HWTSTAMP_FILTER_SOME, 145 HWTSTAMP_FILTER_SOME,
144 146
145 /* PTP v1, UDP, any kind of event packet */ 147 /* PTP v1, UDP, any kind of event packet */
146 HWTSTAMP_FILTER_PTP_V1_L4_EVENT, 148 HWTSTAMP_FILTER_PTP_V1_L4_EVENT,
147 149
148 ... 150 /* for the complete list of values, please check
151 * the include file /include/linux/net_tstamp.h
152 */
149}; 153};
150 154
151 155
152DEVICE IMPLEMENTATION 156DEVICE IMPLEMENTATION
153 157
154A driver which supports hardware time stamping must support the 158A driver which supports hardware time stamping must support the
155SIOCSHWTSTAMP ioctl. Time stamps for received packets must be stored 159SIOCSHWTSTAMP ioctl and update the supplied struct hwtstamp_config with
156in the skb with skb_hwtstamp_set(). 160the actual values as described in the section on SIOCSHWTSTAMP.
161
162Time stamps for received packets must be stored in the skb. To get a pointer
163to the shared time stamp structure of the skb call skb_hwtstamps(). Then
164set the time stamps in the structure:
165
166struct skb_shared_hwtstamps {
167 /* hardware time stamp transformed into duration
168 * since arbitrary point in time
169 */
170 ktime_t hwtstamp;
171 ktime_t syststamp; /* hwtstamp transformed to system time base */
172};
157 173
158Time stamps for outgoing packets are to be generated as follows: 174Time stamps for outgoing packets are to be generated as follows:
159- In hard_start_xmit(), check if skb_hwtstamp_check_tx_hardware() 175- In hard_start_xmit(), check if skb_tx(skb)->hardware is set no-zero.
160 returns non-zero. If yes, then the driver is expected 176 If yes, then the driver is expected to do hardware time stamping.
161 to do hardware time stamping.
162- If this is possible for the skb and requested, then declare 177- If this is possible for the skb and requested, then declare
163 that the driver is doing the time stamping by calling 178 that the driver is doing the time stamping by setting the field
164 skb_hwtstamp_tx_in_progress(). A driver not supporting 179 skb_tx(skb)->in_progress non-zero. You might want to keep a pointer
165 hardware time stamping doesn't do that. A driver must never 180 to the associated skb for the next step and not free the skb. A driver
166 touch sk_buff::tstamp! It is used to store how time stamping 181 not supporting hardware time stamping doesn't do that. A driver must
167 for an outgoing packets is to be done. 182 never touch sk_buff::tstamp! It is used to store software generated
183 time stamps by the network subsystem.
168- As soon as the driver has sent the packet and/or obtained a 184- As soon as the driver has sent the packet and/or obtained a
169 hardware time stamp for it, it passes the time stamp back by 185 hardware time stamp for it, it passes the time stamp back by
170 calling skb_hwtstamp_tx() with the original skb, the raw 186 calling skb_hwtstamp_tx() with the original skb, the raw
171 hardware time stamp and a handle to the device (necessary 187 hardware time stamp. skb_hwtstamp_tx() clones the original skb and
172 to convert the hardware time stamp to system time). If obtaining 188 adds the timestamps, therefore the original skb has to be freed now.
173 the hardware time stamp somehow fails, then the driver should 189 If obtaining the hardware time stamp somehow fails, then the driver
174 not fall back to software time stamping. The rationale is that 190 should not fall back to software time stamping. The rationale is that
175 this would occur at a later time in the processing pipeline 191 this would occur at a later time in the processing pipeline than other
176 than other software time stamping and therefore could lead 192 software time stamping and therefore could lead to unexpected deltas
177 to unexpected deltas between time stamps. 193 between time stamps.
178- If the driver did not call skb_hwtstamp_tx_in_progress(), then 194- If the driver did not call set skb_tx(skb)->in_progress, then
179 dev_hard_start_xmit() checks whether software time stamping 195 dev_hard_start_xmit() checks whether software time stamping
180 is wanted as fallback and potentially generates the time stamp. 196 is wanted as fallback and potentially generates the time stamp.
diff --git a/Documentation/networking/timestamping/Makefile b/Documentation/networking/timestamping/Makefile
index 2a1489fdc036..e79973443e9f 100644
--- a/Documentation/networking/timestamping/Makefile
+++ b/Documentation/networking/timestamping/Makefile
@@ -1,6 +1,13 @@
1CPPFLAGS = -I../../../include 1# kbuild trick to avoid linker error. Can be omitted if a module is built.
2obj- := dummy.o
2 3
3timestamping: timestamping.c 4# List of programs to build
5hostprogs-y := timestamping
6
7# Tell kbuild to always build the programs
8always := $(hostprogs-y)
9
10HOSTCFLAGS_timestamping.o += -I$(objtree)/usr/include
4 11
5clean: 12clean:
6 rm -f timestamping 13 rm -f timestamping
diff --git a/Documentation/networking/timestamping/timestamping.c b/Documentation/networking/timestamping/timestamping.c
index a7936fe8444a..8ba82bfe6a33 100644
--- a/Documentation/networking/timestamping/timestamping.c
+++ b/Documentation/networking/timestamping/timestamping.c
@@ -41,9 +41,9 @@
41#include <arpa/inet.h> 41#include <arpa/inet.h>
42#include <net/if.h> 42#include <net/if.h>
43 43
44#include "asm/types.h" 44#include <asm/types.h>
45#include "linux/net_tstamp.h" 45#include <linux/net_tstamp.h>
46#include "linux/errqueue.h" 46#include <linux/errqueue.h>
47 47
48#ifndef SO_TIMESTAMPING 48#ifndef SO_TIMESTAMPING
49# define SO_TIMESTAMPING 37 49# define SO_TIMESTAMPING 37
@@ -164,7 +164,7 @@ static void printpacket(struct msghdr *msg, int res,
164 164
165 gettimeofday(&now, 0); 165 gettimeofday(&now, 0);
166 166
167 printf("%ld.%06ld: received %s data, %d bytes from %s, %d bytes control messages\n", 167 printf("%ld.%06ld: received %s data, %d bytes from %s, %zu bytes control messages\n",
168 (long)now.tv_sec, (long)now.tv_usec, 168 (long)now.tv_sec, (long)now.tv_usec,
169 (recvmsg_flags & MSG_ERRQUEUE) ? "error" : "regular", 169 (recvmsg_flags & MSG_ERRQUEUE) ? "error" : "regular",
170 res, 170 res,
@@ -173,7 +173,7 @@ static void printpacket(struct msghdr *msg, int res,
173 for (cmsg = CMSG_FIRSTHDR(msg); 173 for (cmsg = CMSG_FIRSTHDR(msg);
174 cmsg; 174 cmsg;
175 cmsg = CMSG_NXTHDR(msg, cmsg)) { 175 cmsg = CMSG_NXTHDR(msg, cmsg)) {
176 printf(" cmsg len %d: ", cmsg->cmsg_len); 176 printf(" cmsg len %zu: ", cmsg->cmsg_len);
177 switch (cmsg->cmsg_level) { 177 switch (cmsg->cmsg_level) {
178 case SOL_SOCKET: 178 case SOL_SOCKET:
179 printf("SOL_SOCKET "); 179 printf("SOL_SOCKET ");
@@ -370,7 +370,7 @@ int main(int argc, char **argv)
370 } 370 }
371 371
372 sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); 372 sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
373 if (socket < 0) 373 if (sock < 0)
374 bail("socket"); 374 bail("socket");
375 375
376 memset(&device, 0, sizeof(device)); 376 memset(&device, 0, sizeof(device));
diff --git a/Documentation/networking/x25-iface.txt b/Documentation/networking/x25-iface.txt
index 975cc87ebdd1..78f662ee0622 100644
--- a/Documentation/networking/x25-iface.txt
+++ b/Documentation/networking/x25-iface.txt
@@ -20,23 +20,23 @@ the rest of the skbuff, if any more information does exist.
20Packet Layer to Device Driver 20Packet Layer to Device Driver
21----------------------------- 21-----------------------------
22 22
23First Byte = 0x00 23First Byte = 0x00 (X25_IFACE_DATA)
24 24
25This indicates that the rest of the skbuff contains data to be transmitted 25This indicates that the rest of the skbuff contains data to be transmitted
26over the LAPB link. The LAPB link should already exist before any data is 26over the LAPB link. The LAPB link should already exist before any data is
27passed down. 27passed down.
28 28
29First Byte = 0x01 29First Byte = 0x01 (X25_IFACE_CONNECT)
30 30
31Establish the LAPB link. If the link is already established then the connect 31Establish the LAPB link. If the link is already established then the connect
32confirmation message should be returned as soon as possible. 32confirmation message should be returned as soon as possible.
33 33
34First Byte = 0x02 34First Byte = 0x02 (X25_IFACE_DISCONNECT)
35 35
36Terminate the LAPB link. If it is already disconnected then the disconnect 36Terminate the LAPB link. If it is already disconnected then the disconnect
37confirmation message should be returned as soon as possible. 37confirmation message should be returned as soon as possible.
38 38
39First Byte = 0x03 39First Byte = 0x03 (X25_IFACE_PARAMS)
40 40
41LAPB parameters. To be defined. 41LAPB parameters. To be defined.
42 42
@@ -44,22 +44,22 @@ LAPB parameters. To be defined.
44Device Driver to Packet Layer 44Device Driver to Packet Layer
45----------------------------- 45-----------------------------
46 46
47First Byte = 0x00 47First Byte = 0x00 (X25_IFACE_DATA)
48 48
49This indicates that the rest of the skbuff contains data that has been 49This indicates that the rest of the skbuff contains data that has been
50received over the LAPB link. 50received over the LAPB link.
51 51
52First Byte = 0x01 52First Byte = 0x01 (X25_IFACE_CONNECT)
53 53
54LAPB link has been established. The same message is used for both a LAPB 54LAPB link has been established. The same message is used for both a LAPB
55link connect_confirmation and a connect_indication. 55link connect_confirmation and a connect_indication.
56 56
57First Byte = 0x02 57First Byte = 0x02 (X25_IFACE_DISCONNECT)
58 58
59LAPB link has been terminated. This same message is used for both a LAPB 59LAPB link has been terminated. This same message is used for both a LAPB
60link disconnect_confirmation and a disconnect_indication. 60link disconnect_confirmation and a disconnect_indication.
61 61
62First Byte = 0x03 62First Byte = 0x03 (X25_IFACE_PARAMS)
63 63
64LAPB parameters. To be defined. 64LAPB parameters. To be defined.
65 65
diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt
index c10c022b911c..6fe9001b9263 100644
--- a/Documentation/oops-tracing.txt
+++ b/Documentation/oops-tracing.txt
@@ -256,9 +256,13 @@ characters, each representing a particular tainted value.
256 9: 'A' if the ACPI table has been overridden. 256 9: 'A' if the ACPI table has been overridden.
257 257
258 10: 'W' if a warning has previously been issued by the kernel. 258 10: 'W' if a warning has previously been issued by the kernel.
259 (Though some warnings may set more specific taint flags.)
259 260
260 11: 'C' if a staging driver has been loaded. 261 11: 'C' if a staging driver has been loaded.
261 262
263 12: 'I' if the kernel is working around a severe bug in the platform
264 firmware (BIOS or similar).
265
262The primary reason for the 'Tainted: ' string is to tell kernel 266The primary reason for the 'Tainted: ' string is to tell kernel
263debuggers if this is a clean kernel or if anything unusual has 267debuggers if this is a clean kernel or if anything unusual has
264occurred. Tainting is permanent: even if an offending module is 268occurred. Tainting is permanent: even if an offending module is
diff --git a/Documentation/padata.txt b/Documentation/padata.txt
new file mode 100644
index 000000000000..269d7d0d8335
--- /dev/null
+++ b/Documentation/padata.txt
@@ -0,0 +1,107 @@
1The padata parallel execution mechanism
2Last updated for 2.6.34
3
4Padata is a mechanism by which the kernel can farm work out to be done in
5parallel on multiple CPUs while retaining the ordering of tasks. It was
6developed for use with the IPsec code, which needs to be able to perform
7encryption and decryption on large numbers of packets without reordering
8those packets. The crypto developers made a point of writing padata in a
9sufficiently general fashion that it could be put to other uses as well.
10
11The first step in using padata is to set up a padata_instance structure for
12overall control of how tasks are to be run:
13
14 #include <linux/padata.h>
15
16 struct padata_instance *padata_alloc(const struct cpumask *cpumask,
17 struct workqueue_struct *wq);
18
19The cpumask describes which processors will be used to execute work
20submitted to this instance. The workqueue wq is where the work will
21actually be done; it should be a multithreaded queue, naturally.
22
23There are functions for enabling and disabling the instance:
24
25 void padata_start(struct padata_instance *pinst);
26 void padata_stop(struct padata_instance *pinst);
27
28These functions literally do nothing beyond setting or clearing the
29"padata_start() was called" flag; if that flag is not set, other functions
30will refuse to work.
31
32The list of CPUs to be used can be adjusted with these functions:
33
34 int padata_set_cpumask(struct padata_instance *pinst,
35 cpumask_var_t cpumask);
36 int padata_add_cpu(struct padata_instance *pinst, int cpu);
37 int padata_remove_cpu(struct padata_instance *pinst, int cpu);
38
39Changing the CPU mask has the look of an expensive operation, though, so it
40probably should not be done with great frequency.
41
42Actually submitting work to the padata instance requires the creation of a
43padata_priv structure:
44
45 struct padata_priv {
46 /* Other stuff here... */
47 void (*parallel)(struct padata_priv *padata);
48 void (*serial)(struct padata_priv *padata);
49 };
50
51This structure will almost certainly be embedded within some larger
52structure specific to the work to be done. Most its fields are private to
53padata, but the structure should be zeroed at initialization time, and the
54parallel() and serial() functions should be provided. Those functions will
55be called in the process of getting the work done as we will see
56momentarily.
57
58The submission of work is done with:
59
60 int padata_do_parallel(struct padata_instance *pinst,
61 struct padata_priv *padata, int cb_cpu);
62
63The pinst and padata structures must be set up as described above; cb_cpu
64specifies which CPU will be used for the final callback when the work is
65done; it must be in the current instance's CPU mask. The return value from
66padata_do_parallel() is a little strange; zero is an error return
67indicating that the caller forgot the padata_start() formalities. -EBUSY
68means that somebody, somewhere else is messing with the instance's CPU
69mask, while -EINVAL is a complaint about cb_cpu not being in that CPU mask.
70If all goes well, this function will return -EINPROGRESS, indicating that
71the work is in progress.
72
73Each task submitted to padata_do_parallel() will, in turn, be passed to
74exactly one call to the above-mentioned parallel() function, on one CPU, so
75true parallelism is achieved by submitting multiple tasks. Despite the
76fact that the workqueue is used to make these calls, parallel() is run with
77software interrupts disabled and thus cannot sleep. The parallel()
78function gets the padata_priv structure pointer as its lone parameter;
79information about the actual work to be done is probably obtained by using
80container_of() to find the enclosing structure.
81
82Note that parallel() has no return value; the padata subsystem assumes that
83parallel() will take responsibility for the task from this point. The work
84need not be completed during this call, but, if parallel() leaves work
85outstanding, it should be prepared to be called again with a new job before
86the previous one completes. When a task does complete, parallel() (or
87whatever function actually finishes the job) should inform padata of the
88fact with a call to:
89
90 void padata_do_serial(struct padata_priv *padata);
91
92At some point in the future, padata_do_serial() will trigger a call to the
93serial() function in the padata_priv structure. That call will happen on
94the CPU requested in the initial call to padata_do_parallel(); it, too, is
95done through the workqueue, but with local software interrupts disabled.
96Note that this call may be deferred for a while since the padata code takes
97pains to ensure that tasks are completed in the order in which they were
98submitted.
99
100The one remaining function in the padata API should be called to clean up
101when a padata instance is no longer needed:
102
103 void padata_free(struct padata_instance *pinst);
104
105This function will busy-wait while any remaining tasks are completed, so it
106might be best not to call it while there is work outstanding. Shutting
107down the workqueue, if necessary, should be done separately.
diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt
index 446f43b309df..61bc4e943116 100644
--- a/Documentation/pcmcia/driver-changes.txt
+++ b/Documentation/pcmcia/driver-changes.txt
@@ -1,4 +1,17 @@
1This file details changes in 2.6 which affect PCMCIA card driver authors: 1This file details changes in 2.6 which affect PCMCIA card driver authors:
2* No dev_node_t (as of 2.6.35)
3 There is no more need to fill out a "dev_node_t" structure.
4
5* New IRQ request rules (as of 2.6.35)
6 Instead of the old pcmcia_request_irq() interface, drivers may now
7 choose between:
8 - calling request_irq/free_irq directly. Use the IRQ from *p_dev->irq.
9 - use pcmcia_request_irq(p_dev, handler_t); the PCMCIA core will
10 clean up automatically on calls to pcmcia_disable_device() or
11 device ejection.
12 - drivers still not capable of IRQF_SHARED (or not telling us so) may
13 use the deprecated pcmcia_request_exclusive_irq() for the time
14 being; they might receive a shared IRQ nonetheless.
2 15
3* no cs_error / CS_CHECK / CONFIG_PCMCIA_DEBUG (as of 2.6.33) 16* no cs_error / CS_CHECK / CONFIG_PCMCIA_DEBUG (as of 2.6.33)
4 Instead of the cs_error() callback or the CS_CHECK() macro, please use 17 Instead of the cs_error() callback or the CS_CHECK() macro, please use
diff --git a/Documentation/pcmcia/locking.txt b/Documentation/pcmcia/locking.txt
new file mode 100644
index 000000000000..68f622bc4064
--- /dev/null
+++ b/Documentation/pcmcia/locking.txt
@@ -0,0 +1,118 @@
1This file explains the locking and exclusion scheme used in the PCCARD
2and PCMCIA subsystems.
3
4
5A) Overview, Locking Hierarchy:
6===============================
7
8pcmcia_socket_list_rwsem - protects only the list of sockets
9- skt_mutex - serializes card insert / ejection
10 - ops_mutex - serializes socket operation
11
12
13B) Exclusion
14============
15
16The following functions and callbacks to struct pcmcia_socket must
17be called with "skt_mutex" held:
18
19 socket_detect_change()
20 send_event()
21 socket_reset()
22 socket_shutdown()
23 socket_setup()
24 socket_remove()
25 socket_insert()
26 socket_early_resume()
27 socket_late_resume()
28 socket_resume()
29 socket_suspend()
30
31 struct pcmcia_callback *callback
32
33The following functions and callbacks to struct pcmcia_socket must
34be called with "ops_mutex" held:
35
36 socket_reset()
37 socket_setup()
38
39 struct pccard_operations *ops
40 struct pccard_resource_ops *resource_ops;
41
42Note that send_event() and struct pcmcia_callback *callback must not be
43called with "ops_mutex" held.
44
45
46C) Protection
47=============
48
491. Global Data:
50---------------
51struct list_head pcmcia_socket_list;
52
53protected by pcmcia_socket_list_rwsem;
54
55
562. Per-Socket Data:
57-------------------
58The resource_ops and their data are protected by ops_mutex.
59
60The "main" struct pcmcia_socket is protected as follows (read-only fields
61or single-use fields not mentioned):
62
63- by pcmcia_socket_list_rwsem:
64 struct list_head socket_list;
65
66- by thread_lock:
67 unsigned int thread_events;
68
69- by skt_mutex:
70 u_int suspended_state;
71 void (*tune_bridge);
72 struct pcmcia_callback *callback;
73 int resume_status;
74
75- by ops_mutex:
76 socket_state_t socket;
77 u_int state;
78 u_short lock_count;
79 pccard_mem_map cis_mem;
80 void __iomem *cis_virt;
81 struct { } irq;
82 io_window_t io[];
83 pccard_mem_map win[];
84 struct list_head cis_cache;
85 size_t fake_cis_len;
86 u8 *fake_cis;
87 u_int irq_mask;
88 void (*zoom_video);
89 int (*power_hook);
90 u8 resource...;
91 struct list_head devices_list;
92 u8 device_count;
93 struct pcmcia_state;
94
95
963. Per PCMCIA-device Data:
97--------------------------
98
99The "main" struct pcmcia_devie is protected as follows (read-only fields
100or single-use fields not mentioned):
101
102
103- by pcmcia_socket->ops_mutex:
104 struct list_head socket_device_list;
105 struct config_t *function_config;
106 u16 _irq:1;
107 u16 _io:1;
108 u16 _win:4;
109 u16 _locked:1;
110 u16 allow_func_id_match:1;
111 u16 suspended:1;
112 u16 _removed:1;
113
114- by the PCMCIA driver:
115 io_req_t io;
116 irq_req_t irq;
117 config_req_t conf;
118 window_handle_t win;
diff --git a/Documentation/pnp.txt b/Documentation/pnp.txt
index a327db67782a..763e4659bf18 100644
--- a/Documentation/pnp.txt
+++ b/Documentation/pnp.txt
@@ -57,7 +57,7 @@ PC standard floppy disk controller
57# cat resources 57# cat resources
58DISABLED 58DISABLED
59 59
60- Notice the string "DISABLED". THis means the device is not active. 60- Notice the string "DISABLED". This means the device is not active.
61 61
623.) check the device's possible configurations (optional) 623.) check the device's possible configurations (optional)
63# cat options 63# cat options
@@ -139,7 +139,7 @@ Plug and Play but it is planned to be in the near future.
139 139
140Requirements for a Linux PnP protocol: 140Requirements for a Linux PnP protocol:
1411.) the protocol must use EISA IDs 1411.) the protocol must use EISA IDs
1422.) the protocol must inform the PnP Layer of a devices current configuration 1422.) the protocol must inform the PnP Layer of a device's current configuration
143- the ability to set resources is optional but preferred. 143- the ability to set resources is optional but preferred.
144 144
145The following are PnP protocol related functions: 145The following are PnP protocol related functions:
@@ -158,7 +158,7 @@ pnp_remove_device
158- automatically will free mem used by the device and related structures 158- automatically will free mem used by the device and related structures
159 159
160pnp_add_id 160pnp_add_id
161- adds a EISA ID to the list of supported IDs for the specified device 161- adds an EISA ID to the list of supported IDs for the specified device
162 162
163For more information consult the source of a protocol such as 163For more information consult the source of a protocol such as
164/drivers/pnp/pnpbios/core.c. 164/drivers/pnp/pnpbios/core.c.
@@ -167,7 +167,7 @@ For more information consult the source of a protocol such as
167 167
168Linux Plug and Play Drivers 168Linux Plug and Play Drivers
169--------------------------- 169---------------------------
170 This section contains information for linux PnP driver developers. 170 This section contains information for Linux PnP driver developers.
171 171
172The New Way 172The New Way
173........... 173...........
@@ -235,11 +235,10 @@ static int __init serial8250_pnp_init(void)
235The Old Way 235The Old Way
236........... 236...........
237 237
238a series of compatibility functions have been created to make it easy to convert 238A series of compatibility functions have been created to make it easy to convert
239
240ISAPNP drivers. They should serve as a temporary solution only. 239ISAPNP drivers. They should serve as a temporary solution only.
241 240
242they are as follows: 241They are as follows:
243 242
244struct pnp_card *pnp_find_card(unsigned short vendor, 243struct pnp_card *pnp_find_card(unsigned short vendor,
245 unsigned short device, 244 unsigned short device,
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index c9abbd86bc18..57080cd74575 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -1,7 +1,13 @@
1Device Power Management
2
3Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
4Copyright (c) 2010 Alan Stern <stern@rowland.harvard.edu>
5
6
1Most of the code in Linux is device drivers, so most of the Linux power 7Most of the code in Linux is device drivers, so most of the Linux power
2management code is also driver-specific. Most drivers will do very little; 8management (PM) code is also driver-specific. Most drivers will do very
3others, especially for platforms with small batteries (like cell phones), 9little; others, especially for platforms with small batteries (like cell
4will do a lot. 10phones), will do a lot.
5 11
6This writeup gives an overview of how drivers interact with system-wide 12This writeup gives an overview of how drivers interact with system-wide
7power management goals, emphasizing the models and interfaces that are 13power management goals, emphasizing the models and interfaces that are
@@ -15,9 +21,10 @@ Drivers will use one or both of these models to put devices into low-power
15states: 21states:
16 22
17 System Sleep model: 23 System Sleep model:
18 Drivers can enter low power states as part of entering system-wide 24 Drivers can enter low-power states as part of entering system-wide
19 low-power states like "suspend-to-ram", or (mostly for systems with 25 low-power states like "suspend" (also known as "suspend-to-RAM"), or
20 disks) "hibernate" (suspend-to-disk). 26 (mostly for systems with disks) "hibernation" (also known as
27 "suspend-to-disk").
21 28
22 This is something that device, bus, and class drivers collaborate on 29 This is something that device, bus, and class drivers collaborate on
23 by implementing various role-specific suspend and resume methods to 30 by implementing various role-specific suspend and resume methods to
@@ -25,33 +32,41 @@ states:
25 them without loss of data. 32 them without loss of data.
26 33
27 Some drivers can manage hardware wakeup events, which make the system 34 Some drivers can manage hardware wakeup events, which make the system
28 leave that low-power state. This feature may be disabled using the 35 leave the low-power state. This feature may be enabled or disabled
29 relevant /sys/devices/.../power/wakeup file; enabling it may cost some 36 using the relevant /sys/devices/.../power/wakeup file (for Ethernet
30 power usage, but let the whole system enter low power states more often. 37 drivers the ioctl interface used by ethtool may also be used for this
38 purpose); enabling it may cost some power usage, but let the whole
39 system enter low-power states more often.
31 40
32 Runtime Power Management model: 41 Runtime Power Management model:
33 Drivers may also enter low power states while the system is running, 42 Devices may also be put into low-power states while the system is
34 independently of other power management activity. Upstream drivers 43 running, independently of other power management activity in principle.
35 will normally not know (or care) if the device is in some low power 44 However, devices are not generally independent of each other (for
36 state when issuing requests; the driver will auto-resume anything 45 example, a parent device cannot be suspended unless all of its child
37 that's needed when it gets a request. 46 devices have been suspended). Moreover, depending on the bus type the
38 47 device is on, it may be necessary to carry out some bus-specific
39 This doesn't have, or need much infrastructure; it's just something you 48 operations on the device for this purpose. Devices put into low power
40 should do when writing your drivers. For example, clk_disable() unused 49 states at run time may require special handling during system-wide power
41 clocks as part of minimizing power drain for currently-unused hardware. 50 transitions (suspend or hibernation).
42 Of course, sometimes clusters of drivers will collaborate with each 51
43 other, which could involve task-specific power management. 52 For these reasons not only the device driver itself, but also the
44 53 appropriate subsystem (bus type, device type or device class) driver and
45There's not a lot to be said about those low power states except that they 54 the PM core are involved in runtime power management. As in the system
46are very system-specific, and often device-specific. Also, that if enough 55 sleep power management case, they need to collaborate by implementing
47drivers put themselves into low power states (at "runtime"), the effect may be 56 various role-specific suspend and resume methods, so that the hardware
48the same as entering some system-wide low-power state (system sleep) ... and 57 is cleanly powered down and reactivated without data or service loss.
49that synergies exist, so that several drivers using runtime pm might put the 58
50system into a state where even deeper power saving options are available. 59There's not a lot to be said about those low-power states except that they are
51 60very system-specific, and often device-specific. Also, that if enough devices
52Most suspended devices will have quiesced all I/O: no more DMA or irqs, no 61have been put into low-power states (at runtime), the effect may be very similar
53more data read or written, and requests from upstream drivers are no longer 62to entering some system-wide low-power state (system sleep) ... and that
54accepted. A given bus or platform may have different requirements though. 63synergies exist, so that several drivers using runtime PM might put the system
64into a state where even deeper power saving options are available.
65
66Most suspended devices will have quiesced all I/O: no more DMA or IRQs (except
67for wakeup events), no more data read or written, and requests from upstream
68drivers are no longer accepted. A given bus or platform may have different
69requirements though.
55 70
56Examples of hardware wakeup events include an alarm from a real time clock, 71Examples of hardware wakeup events include an alarm from a real time clock,
57network wake-on-LAN packets, keyboard or mouse activity, and media insertion 72network wake-on-LAN packets, keyboard or mouse activity, and media insertion
@@ -60,129 +75,152 @@ or removal (for PCMCIA, MMC/SD, USB, and so on).
60 75
61Interfaces for Entering System Sleep States 76Interfaces for Entering System Sleep States
62=========================================== 77===========================================
63Most of the programming interfaces a device driver needs to know about 78There are programming interfaces provided for subsystems (bus type, device type,
64relate to that first model: entering a system-wide low power state, 79device class) and device drivers to allow them to participate in the power
65rather than just minimizing power consumption by one device. 80management of devices they are concerned with. These interfaces cover both
66 81system sleep and runtime power management.
67 82
68Bus Driver Methods 83
69------------------ 84Device Power Management Operations
70The core methods to suspend and resume devices reside in struct bus_type. 85----------------------------------
71These are mostly of interest to people writing infrastructure for busses 86Device power management operations, at the subsystem level as well as at the
72like PCI or USB, or because they define the primitives that device drivers 87device driver level, are implemented by defining and populating objects of type
73may need to apply in domain-specific ways to their devices: 88struct dev_pm_ops:
74 89
75struct bus_type { 90struct dev_pm_ops {
76 ... 91 int (*prepare)(struct device *dev);
77 int (*suspend)(struct device *dev, pm_message_t state); 92 void (*complete)(struct device *dev);
78 int (*resume)(struct device *dev); 93 int (*suspend)(struct device *dev);
94 int (*resume)(struct device *dev);
95 int (*freeze)(struct device *dev);
96 int (*thaw)(struct device *dev);
97 int (*poweroff)(struct device *dev);
98 int (*restore)(struct device *dev);
99 int (*suspend_noirq)(struct device *dev);
100 int (*resume_noirq)(struct device *dev);
101 int (*freeze_noirq)(struct device *dev);
102 int (*thaw_noirq)(struct device *dev);
103 int (*poweroff_noirq)(struct device *dev);
104 int (*restore_noirq)(struct device *dev);
105 int (*runtime_suspend)(struct device *dev);
106 int (*runtime_resume)(struct device *dev);
107 int (*runtime_idle)(struct device *dev);
79}; 108};
80 109
81Bus drivers implement those methods as appropriate for the hardware and 110This structure is defined in include/linux/pm.h and the methods included in it
82the drivers using it; PCI works differently from USB, and so on. Not many 111are also described in that file. Their roles will be explained in what follows.
83people write bus drivers; most driver code is a "device driver" that 112For now, it should be sufficient to remember that the last three methods are
84builds on top of bus-specific framework code. 113specific to runtime power management while the remaining ones are used during
114system-wide power transitions.
85 115
86For more information on these driver calls, see the description later; 116There also is a deprecated "old" or "legacy" interface for power management
87they are called in phases for every device, respecting the parent-child 117operations available at least for some subsystems. This approach does not use
88sequencing in the driver model tree. Note that as this is being written, 118struct dev_pm_ops objects and it is suitable only for implementing system sleep
89only the suspend() and resume() are widely available; not many bus drivers 119power management methods. Therefore it is not described in this document, so
90leverage all of those phases, or pass them down to lower driver levels. 120please refer directly to the source code for more information about it.
91 121
92 122
93/sys/devices/.../power/wakeup files 123Subsystem-Level Methods
94----------------------------------- 124-----------------------
95All devices in the driver model have two flags to control handling of 125The core methods to suspend and resume devices reside in struct dev_pm_ops
96wakeup events, which are hardware signals that can force the device and/or 126pointed to by the pm member of struct bus_type, struct device_type and
97system out of a low power state. These are initialized by bus or device 127struct class. They are mostly of interest to the people writing infrastructure
98driver code using device_init_wakeup(dev,can_wakeup). 128for buses, like PCI or USB, or device type and device class drivers.
99 129
100The "can_wakeup" flag just records whether the device (and its driver) can 130Bus drivers implement these methods as appropriate for the hardware and the
101physically support wakeup events. When that flag is clear, the sysfs 131drivers using it; PCI works differently from USB, and so on. Not many people
102"wakeup" file is empty, and device_may_wakeup() returns false. 132write subsystem-level drivers; most driver code is a "device driver" that builds
133on top of bus-specific framework code.
103 134
104For devices that can issue wakeup events, a separate flag controls whether 135For more information on these driver calls, see the description later;
105that device should try to use its wakeup mechanism. The initial value of 136they are called in phases for every device, respecting the parent-child
106device_may_wakeup() will be true, so that the device's "wakeup" file holds 137sequencing in the driver model tree.
107the value "enabled". Userspace can change that to "disabled" so that
108device_may_wakeup() returns false; or change it back to "enabled" (so that
109it returns true again).
110 138
111 139
112EXAMPLE: PCI Device Driver Methods 140/sys/devices/.../power/wakeup files
113----------------------------------- 141-----------------------------------
114PCI framework software calls these methods when the PCI device driver bound 142All devices in the driver model have two flags to control handling of wakeup
115to a device device has provided them: 143events (hardware signals that can force the device and/or system out of a low
116 144power state). These flags are initialized by bus or device driver code using
117struct pci_driver { 145device_set_wakeup_capable() and device_set_wakeup_enable(), defined in
118 ... 146include/linux/pm_wakeup.h.
119 int (*suspend)(struct pci_device *pdev, pm_message_t state);
120 int (*suspend_late)(struct pci_device *pdev, pm_message_t state);
121 147
122 int (*resume_early)(struct pci_device *pdev); 148The "can_wakeup" flag just records whether the device (and its driver) can
123 int (*resume)(struct pci_device *pdev); 149physically support wakeup events. The device_set_wakeup_capable() routine
124}; 150affects this flag. The "should_wakeup" flag controls whether the device should
125 151try to use its wakeup mechanism. device_set_wakeup_enable() affects this flag;
126Drivers will implement those methods, and call PCI-specific procedures 152for the most part drivers should not change its value. The initial value of
127like pci_set_power_state(), pci_enable_wake(), pci_save_state(), and 153should_wakeup is supposed to be false for the majority of devices; the major
128pci_restore_state() to manage PCI-specific mechanisms. (PCI config space 154exceptions are power buttons, keyboards, and Ethernet adapters whose WoL
129could be saved during driver probe, if it weren't for the fact that some 155(wake-on-LAN) feature has been set up with ethtool.
130systems rely on userspace tweaking using setpci.) Devices are suspended 156
131before their bridges enter low power states, and likewise bridges resume 157Whether or not a device is capable of issuing wakeup events is a hardware
132before their devices. 158matter, and the kernel is responsible for keeping track of it. By contrast,
133 159whether or not a wakeup-capable device should issue wakeup events is a policy
134 160decision, and it is managed by user space through a sysfs attribute: the
135Upper Layers of Driver Stacks 161power/wakeup file. User space can write the strings "enabled" or "disabled" to
136----------------------------- 162set or clear the should_wakeup flag, respectively. Reads from the file will
137Device drivers generally have at least two interfaces, and the methods 163return the corresponding string if can_wakeup is true, but if can_wakeup is
138sketched above are the ones which apply to the lower level (nearer PCI, USB, 164false then reads will return an empty string, to indicate that the device
139or other bus hardware). The network and block layers are examples of upper 165doesn't support wakeup events. (But even though the file appears empty, writes
140level interfaces, as is a character device talking to userspace. 166will still affect the should_wakeup flag.)
141 167
142Power management requests normally need to flow through those upper levels, 168The device_may_wakeup() routine returns true only if both flags are set.
143which often use domain-oriented requests like "blank that screen". In 169Drivers should check this routine when putting devices in a low-power state
144some cases those upper levels will have power management intelligence that 170during a system sleep transition, to see whether or not to enable the devices'
145relates to end-user activity, or other devices that work in cooperation. 171wakeup mechanisms. However for runtime power management, wakeup events should
146 172be enabled whenever the device and driver both support them, regardless of the
147When those interfaces are structured using class interfaces, there is a 173should_wakeup flag.
148standard way to have the upper layer stop issuing requests to a given 174
149class device (and restart later): 175
150 176/sys/devices/.../power/control files
151struct class { 177------------------------------------
152 ... 178Each device in the driver model has a flag to control whether it is subject to
153 int (*suspend)(struct device *dev, pm_message_t state); 179runtime power management. This flag, called runtime_auto, is initialized by the
154 int (*resume)(struct device *dev); 180bus type (or generally subsystem) code using pm_runtime_allow() or
155}; 181pm_runtime_forbid(); the default is to allow runtime power management.
156 182
157Those calls are issued in specific phases of the process by which the 183The setting can be adjusted by user space by writing either "on" or "auto" to
158system enters a low power "suspend" state, or resumes from it. 184the device's power/control sysfs file. Writing "auto" calls pm_runtime_allow(),
159 185setting the flag and allowing the device to be runtime power-managed by its
160 186driver. Writing "on" calls pm_runtime_forbid(), clearing the flag, returning
161Calling Drivers to Enter System Sleep States 187the device to full power if it was in a low-power state, and preventing the
162============================================ 188device from being runtime power-managed. User space can check the current value
163When the system enters a low power state, each device's driver is asked 189of the runtime_auto flag by reading the file.
164to suspend the device by putting it into state compatible with the target 190
191The device's runtime_auto flag has no effect on the handling of system-wide
192power transitions. In particular, the device can (and in the majority of cases
193should and will) be put into a low-power state during a system-wide transition
194to a sleep state even though its runtime_auto flag is clear.
195
196For more information about the runtime power management framework, refer to
197Documentation/power/runtime_pm.txt.
198
199
200Calling Drivers to Enter and Leave System Sleep States
201======================================================
202When the system goes into a sleep state, each device's driver is asked to
203suspend the device by putting it into a state compatible with the target
165system state. That's usually some version of "off", but the details are 204system state. That's usually some version of "off", but the details are
166system-specific. Also, wakeup-enabled devices will usually stay partly 205system-specific. Also, wakeup-enabled devices will usually stay partly
167functional in order to wake the system. 206functional in order to wake the system.
168 207
169When the system leaves that low power state, the device's driver is asked 208When the system leaves that low-power state, the device's driver is asked to
170to resume it. The suspend and resume operations always go together, and 209resume it by returning it to full power. The suspend and resume operations
171both are multi-phase operations. 210always go together, and both are multi-phase operations.
172 211
173For simple drivers, suspend might quiesce the device using the class code 212For simple drivers, suspend might quiesce the device using class code
174and then turn its hardware as "off" as possible with late_suspend. The 213and then turn its hardware as "off" as possible during suspend_noirq. The
175matching resume calls would then completely reinitialize the hardware 214matching resume calls would then completely reinitialize the hardware
176before reactivating its class I/O queues. 215before reactivating its class I/O queues.
177 216
178More power-aware drivers drivers will use more than one device low power 217More power-aware drivers might prepare the devices for triggering system wakeup
179state, either at runtime or during system sleep states, and might trigger 218events.
180system wakeup events.
181 219
182 220
183Call Sequence Guarantees 221Call Sequence Guarantees
184------------------------ 222------------------------
185To ensure that bridges and similar links needed to talk to a device are 223To ensure that bridges and similar links needing to talk to a device are
186available when the device is suspended or resumed, the device tree is 224available when the device is suspended or resumed, the device tree is
187walked in a bottom-up order to suspend devices. A top-down order is 225walked in a bottom-up order to suspend devices. A top-down order is
188used to resume those devices. 226used to resume those devices.
@@ -194,67 +232,310 @@ its parent; and can't be removed or suspended after that parent.
194The policy is that the device tree should match hardware bus topology. 232The policy is that the device tree should match hardware bus topology.
195(Or at least the control bus, for devices which use multiple busses.) 233(Or at least the control bus, for devices which use multiple busses.)
196In particular, this means that a device registration may fail if the parent of 234In particular, this means that a device registration may fail if the parent of
197the device is suspending (ie. has been chosen by the PM core as the next 235the device is suspending (i.e. has been chosen by the PM core as the next
198device to suspend) or has already suspended, as well as after all of the other 236device to suspend) or has already suspended, as well as after all of the other
199devices have been suspended. Device drivers must be prepared to cope with such 237devices have been suspended. Device drivers must be prepared to cope with such
200situations. 238situations.
201 239
202 240
203Suspending Devices 241System Power Management Phases
204------------------ 242------------------------------
205Suspending a given device is done in several phases. Suspending the 243Suspending or resuming the system is done in several phases. Different phases
206system always includes every phase, executing calls for every device 244are used for standby or memory sleep states ("suspend-to-RAM") and the
207before the next phase begins. Not all busses or classes support all 245hibernation state ("suspend-to-disk"). Each phase involves executing callbacks
208these callbacks; and not all drivers use all the callbacks. 246for every device before the next phase begins. Not all busses or classes
247support all these callbacks and not all drivers use all the callbacks. The
248various phases always run after tasks have been frozen and before they are
249unfrozen. Furthermore, the *_noirq phases run at a time when IRQ handlers have
250been disabled (except for those marked with the IRQ_WAKEUP flag).
209 251
210The phases are seen by driver notifications issued in this order: 252Most phases use bus, type, and class callbacks (that is, methods defined in
253dev->bus->pm, dev->type->pm, and dev->class->pm). The prepare and complete
254phases are exceptions; they use only bus callbacks. When multiple callbacks
255are used in a phase, they are invoked in the order: <class, type, bus> during
256power-down transitions and in the opposite order during power-up transitions.
257For example, during the suspend phase the PM core invokes
211 258
212 1 class.suspend(dev, message) is called after tasks are frozen, for 259 dev->class->pm.suspend(dev);
213 devices associated with a class that has such a method. This 260 dev->type->pm.suspend(dev);
214 method may sleep. 261 dev->bus->pm.suspend(dev);
215 262
216 Since I/O activity usually comes from such higher layers, this is 263before moving on to the next device, whereas during the resume phase the core
217 a good place to quiesce all drivers of a given type (and keep such 264invokes
218 code out of those drivers).
219 265
220 2 bus.suspend(dev, message) is called next. This method may sleep, 266 dev->bus->pm.resume(dev);
221 and is often morphed into a device driver call with bus-specific 267 dev->type->pm.resume(dev);
222 parameters and/or rules. 268 dev->class->pm.resume(dev);
223 269
224 This call should handle parts of device suspend logic that require 270These callbacks may in turn invoke device- or driver-specific methods stored in
225 sleeping. It probably does work to quiesce the device which hasn't 271dev->driver->pm, but they don't have to.
226 been abstracted into class.suspend().
227 272
228The pm_message_t parameter is currently used to refine those semantics
229(described later).
230 273
231At the end of those phases, drivers should normally have stopped all I/O 274Entering System Suspend
232transactions (DMA, IRQs), saved enough state that they can re-initialize 275-----------------------
233or restore previous state (as needed by the hardware), and placed the 276When the system goes into the standby or memory sleep state, the phases are:
234device into a low-power state. On many platforms they will also use 277
235clk_disable() to gate off one or more clock sources; sometimes they will 278 prepare, suspend, suspend_noirq.
236also switch off power supplies, or reduce voltages. Drivers which have 279
237runtime PM support may already have performed some or all of the steps 280 1. The prepare phase is meant to prevent races by preventing new devices
238needed to prepare for the upcoming system sleep state. 281 from being registered; the PM core would never know that all the
282 children of a device had been suspended if new children could be
283 registered at will. (By contrast, devices may be unregistered at any
284 time.) Unlike the other suspend-related phases, during the prepare
285 phase the device tree is traversed top-down.
286
287 The prepare phase uses only a bus callback. After the callback method
288 returns, no new children may be registered below the device. The method
289 may also prepare the device or driver in some way for the upcoming
290 system power transition, but it should not put the device into a
291 low-power state.
292
293 2. The suspend methods should quiesce the device to stop it from performing
294 I/O. They also may save the device registers and put it into the
295 appropriate low-power state, depending on the bus type the device is on,
296 and they may enable wakeup events.
297
298 3. The suspend_noirq phase occurs after IRQ handlers have been disabled,
299 which means that the driver's interrupt handler will not be called while
300 the callback method is running. The methods should save the values of
301 the device's registers that weren't saved previously and finally put the
302 device into the appropriate low-power state.
303
304 The majority of subsystems and device drivers need not implement this
305 callback. However, bus types allowing devices to share interrupt
306 vectors, like PCI, generally need it; otherwise a driver might encounter
307 an error during the suspend phase by fielding a shared interrupt
308 generated by some other device after its own device had been set to low
309 power.
310
311At the end of these phases, drivers should have stopped all I/O transactions
312(DMA, IRQs), saved enough state that they can re-initialize or restore previous
313state (as needed by the hardware), and placed the device into a low-power state.
314On many platforms they will gate off one or more clock sources; sometimes they
315will also switch off power supplies or reduce voltages. (Drivers supporting
316runtime PM may already have performed some or all of these steps.)
317
318If device_may_wakeup(dev) returns true, the device should be prepared for
319generating hardware wakeup signals to trigger a system wakeup event when the
320system is in the sleep state. For example, enable_irq_wake() might identify
321GPIO signals hooked up to a switch or other external hardware, and
322pci_enable_wake() does something similar for the PCI PME signal.
323
324If any of these callbacks returns an error, the system won't enter the desired
325low-power state. Instead the PM core will unwind its actions by resuming all
326the devices that were suspended.
327
328
329Leaving System Suspend
330----------------------
331When resuming from standby or memory sleep, the phases are:
332
333 resume_noirq, resume, complete.
334
335 1. The resume_noirq callback methods should perform any actions needed
336 before the driver's interrupt handlers are invoked. This generally
337 means undoing the actions of the suspend_noirq phase. If the bus type
338 permits devices to share interrupt vectors, like PCI, the method should
339 bring the device and its driver into a state in which the driver can
340 recognize if the device is the source of incoming interrupts, if any,
341 and handle them correctly.
342
343 For example, the PCI bus type's ->pm.resume_noirq() puts the device into
344 the full-power state (D0 in the PCI terminology) and restores the
345 standard configuration registers of the device. Then it calls the
346 device driver's ->pm.resume_noirq() method to perform device-specific
347 actions.
348
349 2. The resume methods should bring the the device back to its operating
350 state, so that it can perform normal I/O. This generally involves
351 undoing the actions of the suspend phase.
352
353 3. The complete phase uses only a bus callback. The method should undo the
354 actions of the prepare phase. Note, however, that new children may be
355 registered below the device as soon as the resume callbacks occur; it's
356 not necessary to wait until the complete phase.
357
358At the end of these phases, drivers should be as functional as they were before
359suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are
360gated on. Even if the device was in a low-power state before the system sleep
361because of runtime power management, afterwards it should be back in its
362full-power state. There are multiple reasons why it's best to do this; they are
363discussed in more detail in Documentation/power/runtime_pm.txt.
239 364
240When any driver sees that its device_can_wakeup(dev), it should make sure 365However, the details here may again be platform-specific. For example,
241to use the relevant hardware signals to trigger a system wakeup event. 366some systems support multiple "run" states, and the mode in effect at
242For example, enable_irq_wake() might identify GPIO signals hooked up to 367the end of resume might not be the one which preceded suspension.
243a switch or other external hardware, and pci_enable_wake() does something 368That means availability of certain clocks or power supplies changed,
244similar for PCI's PME# signal. 369which could easily affect how a driver works.
370
371Drivers need to be able to handle hardware which has been reset since the
372suspend methods were called, for example by complete reinitialization.
373This may be the hardest part, and the one most protected by NDA'd documents
374and chip errata. It's simplest if the hardware state hasn't changed since
375the suspend was carried out, but that can't be guaranteed (in fact, it ususally
376is not the case).
377
378Drivers must also be prepared to notice that the device has been removed
379while the system was powered down, whenever that's physically possible.
380PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses
381where common Linux platforms will see such removal. Details of how drivers
382will notice and handle such removals are currently bus-specific, and often
383involve a separate thread.
384
385These callbacks may return an error value, but the PM core will ignore such
386errors since there's nothing it can do about them other than printing them in
387the system log.
388
389
390Entering Hibernation
391--------------------
392Hibernating the system is more complicated than putting it into the standby or
393memory sleep state, because it involves creating and saving a system image.
394Therefore there are more phases for hibernation, with a different set of
395callbacks. These phases always run after tasks have been frozen and memory has
396been freed.
397
398The general procedure for hibernation is to quiesce all devices (freeze), create
399an image of the system memory while everything is stable, reactivate all
400devices (thaw), write the image to permanent storage, and finally shut down the
401system (poweroff). The phases used to accomplish this are:
402
403 prepare, freeze, freeze_noirq, thaw_noirq, thaw, complete,
404 prepare, poweroff, poweroff_noirq
405
406 1. The prepare phase is discussed in the "Entering System Suspend" section
407 above.
408
409 2. The freeze methods should quiesce the device so that it doesn't generate
410 IRQs or DMA, and they may need to save the values of device registers.
411 However the device does not have to be put in a low-power state, and to
412 save time it's best not to do so. Also, the device should not be
413 prepared to generate wakeup events.
414
415 3. The freeze_noirq phase is analogous to the suspend_noirq phase discussed
416 above, except again that the device should not be put in a low-power
417 state and should not be allowed to generate wakeup events.
418
419At this point the system image is created. All devices should be inactive and
420the contents of memory should remain undisturbed while this happens, so that the
421image forms an atomic snapshot of the system state.
422
423 4. The thaw_noirq phase is analogous to the resume_noirq phase discussed
424 above. The main difference is that its methods can assume the device is
425 in the same state as at the end of the freeze_noirq phase.
426
427 5. The thaw phase is analogous to the resume phase discussed above. Its
428 methods should bring the device back to an operating state, so that it
429 can be used for saving the image if necessary.
430
431 6. The complete phase is discussed in the "Leaving System Suspend" section
432 above.
433
434At this point the system image is saved, and the devices then need to be
435prepared for the upcoming system shutdown. This is much like suspending them
436before putting the system into the standby or memory sleep state, and the phases
437are similar.
438
439 7. The prepare phase is discussed above.
440
441 8. The poweroff phase is analogous to the suspend phase.
442
443 9. The poweroff_noirq phase is analogous to the suspend_noirq phase.
444
445The poweroff and poweroff_noirq callbacks should do essentially the same things
446as the suspend and suspend_noirq callbacks. The only notable difference is that
447they need not store the device register values, because the registers should
448already have been stored during the freeze or freeze_noirq phases.
449
450
451Leaving Hibernation
452-------------------
453Resuming from hibernation is, again, more complicated than resuming from a sleep
454state in which the contents of main memory are preserved, because it requires
455a system image to be loaded into memory and the pre-hibernation memory contents
456to be restored before control can be passed back to the image kernel.
457
458Although in principle, the image might be loaded into memory and the
459pre-hibernation memory contents restored by the boot loader, in practice this
460can't be done because boot loaders aren't smart enough and there is no
461established protocol for passing the necessary information. So instead, the
462boot loader loads a fresh instance of the kernel, called the boot kernel, into
463memory and passes control to it in the usual way. Then the boot kernel reads
464the system image, restores the pre-hibernation memory contents, and passes
465control to the image kernel. Thus two different kernels are involved in
466resuming from hibernation. In fact, the boot kernel may be completely different
467from the image kernel: a different configuration and even a different version.
468This has important consequences for device drivers and their subsystems.
469
470To be able to load the system image into memory, the boot kernel needs to
471include at least a subset of device drivers allowing it to access the storage
472medium containing the image, although it doesn't need to include all of the
473drivers present in the image kernel. After the image has been loaded, the
474devices managed by the boot kernel need to be prepared for passing control back
475to the image kernel. This is very similar to the initial steps involved in
476creating a system image, and it is accomplished in the same way, using prepare,
477freeze, and freeze_noirq phases. However the devices affected by these phases
478are only those having drivers in the boot kernel; other devices will still be in
479whatever state the boot loader left them.
480
481Should the restoration of the pre-hibernation memory contents fail, the boot
482kernel would go through the "thawing" procedure described above, using the
483thaw_noirq, thaw, and complete phases, and then continue running normally. This
484happens only rarely. Most often the pre-hibernation memory contents are
485restored successfully and control is passed to the image kernel, which then
486becomes responsible for bringing the system back to the working state.
487
488To achieve this, the image kernel must restore the devices' pre-hibernation
489functionality. The operation is much like waking up from the memory sleep
490state, although it involves different phases:
491
492 restore_noirq, restore, complete
493
494 1. The restore_noirq phase is analogous to the resume_noirq phase.
495
496 2. The restore phase is analogous to the resume phase.
497
498 3. The complete phase is discussed above.
499
500The main difference from resume[_noirq] is that restore[_noirq] must assume the
501device has been accessed and reconfigured by the boot loader or the boot kernel.
502Consequently the state of the device may be different from the state remembered
503from the freeze and freeze_noirq phases. The device may even need to be reset
504and completely re-initialized. In many cases this difference doesn't matter, so
505the resume[_noirq] and restore[_norq] method pointers can be set to the same
506routines. Nevertheless, different callback pointers are used in case there is a
507situation where it actually matters.
245 508
246If a driver (or bus, or class) fails it suspend method, the system won't
247enter the desired low power state; it will resume all the devices it's
248suspended so far.
249 509
250Note that drivers may need to perform different actions based on the target 510System Devices
251system lowpower/sleep state. At this writing, there are only platform 511--------------
252specific APIs through which drivers could determine those target states. 512System devices (sysdevs) follow a slightly different API, which can be found in
513
514 include/linux/sysdev.h
515 drivers/base/sys.c
516
517System devices will be suspended with interrupts disabled, and after all other
518devices have been suspended. On resume, they will be resumed before any other
519devices, and also with interrupts disabled. These things occur in special
520"sysdev_driver" phases, which affect only system devices.
521
522Thus, after the suspend_noirq (or freeze_noirq or poweroff_noirq) phase, when
523the non-boot CPUs are all offline and IRQs are disabled on the remaining online
524CPU, then a sysdev_driver.suspend phase is carried out, and the system enters a
525sleep state (or a system image is created). During resume (or after the image
526has been created or loaded) a sysdev_driver.resume phase is carried out, IRQs
527are enabled on the only online CPU, the non-boot CPUs are enabled, and the
528resume_noirq (or thaw_noirq or restore_noirq) phase begins.
529
530Code to actually enter and exit the system-wide low power state sometimes
531involves hardware details that are only known to the boot firmware, and
532may leave a CPU running software (from SRAM or flash memory) that monitors
533the system and manages its wakeup sequence.
253 534
254 535
255Device Low Power (suspend) States 536Device Low Power (suspend) States
256--------------------------------- 537---------------------------------
257Device low-power states aren't very standard. One device might only handle 538Device low-power states aren't standard. One device might only handle
258"on" and "off, while another might support a dozen different versions of 539"on" and "off, while another might support a dozen different versions of
259"on" (how many engines are active?), plus a state that gets back to "on" 540"on" (how many engines are active?), plus a state that gets back to "on"
260faster than from a full "off". 541faster than from a full "off".
@@ -265,7 +546,7 @@ PCI device may not perform DMA or issue IRQs, and any wakeup events it
265issues would be issued through the PME# bus signal. Plus, there are 546issues would be issued through the PME# bus signal. Plus, there are
266several PCI-standard device states, some of which are optional. 547several PCI-standard device states, some of which are optional.
267 548
268In contrast, integrated system-on-chip processors often use irqs as the 549In contrast, integrated system-on-chip processors often use IRQs as the
269wakeup event sources (so drivers would call enable_irq_wake) and might 550wakeup event sources (so drivers would call enable_irq_wake) and might
270be able to treat DMA completion as a wakeup event (sometimes DMA can stay 551be able to treat DMA completion as a wakeup event (sometimes DMA can stay
271active too, it'd only be the CPU and some peripherals that sleep). 552active too, it'd only be the CPU and some peripherals that sleep).
@@ -284,120 +565,17 @@ ways; the aforementioned LCD might be active in one product's "standby",
284but a different product using the same SOC might work differently. 565but a different product using the same SOC might work differently.
285 566
286 567
287Meaning of pm_message_t.event 568Power Management Notifiers
288----------------------------- 569--------------------------
289Parameters to suspend calls include the device affected and a message of 570There are some operations that cannot be carried out by the power management
290type pm_message_t, which has one field: the event. If driver does not 571callbacks discussed above, because the callbacks occur too late or too early.
291recognize the event code, suspend calls may abort the request and return 572To handle these cases, subsystems and device drivers may register power
292a negative errno. However, most drivers will be fine if they implement 573management notifiers that are called before tasks are frozen and after they have
293PM_EVENT_SUSPEND semantics for all messages. 574been thawed. Generally speaking, the PM notifiers are suitable for performing
575actions that either require user space to be available, or at least won't
576interfere with user space.
294 577
295The event codes are used to refine the goal of suspending the device, and 578For details refer to Documentation/power/notifiers.txt.
296mostly matter when creating or resuming system memory image snapshots, as
297used with suspend-to-disk:
298
299 PM_EVENT_SUSPEND -- quiesce the driver and put hardware into a low-power
300 state. When used with system sleep states like "suspend-to-RAM" or
301 "standby", the upcoming resume() call will often be able to rely on
302 state kept in hardware, or issue system wakeup events.
303
304 PM_EVENT_HIBERNATE -- Put hardware into a low-power state and enable wakeup
305 events as appropriate. It is only used with hibernation
306 (suspend-to-disk) and few devices are able to wake up the system from
307 this state; most are completely powered off.
308
309 PM_EVENT_FREEZE -- quiesce the driver, but don't necessarily change into
310 any low power mode. A system snapshot is about to be taken, often
311 followed by a call to the driver's resume() method. Neither wakeup
312 events nor DMA are allowed.
313
314 PM_EVENT_PRETHAW -- quiesce the driver, knowing that the upcoming resume()
315 will restore a suspend-to-disk snapshot from a different kernel image.
316 Drivers that are smart enough to look at their hardware state during
317 resume() processing need that state to be correct ... a PRETHAW could
318 be used to invalidate that state (by resetting the device), like a
319 shutdown() invocation would before a kexec() or system halt. Other
320 drivers might handle this the same way as PM_EVENT_FREEZE. Neither
321 wakeup events nor DMA are allowed.
322
323To enter "standby" (ACPI S1) or "Suspend to RAM" (STR, ACPI S3) states, or
324the similarly named APM states, only PM_EVENT_SUSPEND is used; the other event
325codes are used for hibernation ("Suspend to Disk", STD, ACPI S4).
326
327There's also PM_EVENT_ON, a value which never appears as a suspend event
328but is sometimes used to record the "not suspended" device state.
329
330
331Resuming Devices
332----------------
333Resuming is done in multiple phases, much like suspending, with all
334devices processing each phase's calls before the next phase begins.
335
336The phases are seen by driver notifications issued in this order:
337
338 1 bus.resume(dev) reverses the effects of bus.suspend(). This may
339 be morphed into a device driver call with bus-specific parameters;
340 implementations may sleep.
341
342 2 class.resume(dev) is called for devices associated with a class
343 that has such a method. Implementations may sleep.
344
345 This reverses the effects of class.suspend(), and would usually
346 reactivate the device's I/O queue.
347
348At the end of those phases, drivers should normally be as functional as
349they were before suspending: I/O can be performed using DMA and IRQs, and
350the relevant clocks are gated on. The device need not be "fully on"; it
351might be in a runtime lowpower/suspend state that acts as if it were.
352
353However, the details here may again be platform-specific. For example,
354some systems support multiple "run" states, and the mode in effect at
355the end of resume() might not be the one which preceded suspension.
356That means availability of certain clocks or power supplies changed,
357which could easily affect how a driver works.
358
359
360Drivers need to be able to handle hardware which has been reset since the
361suspend methods were called, for example by complete reinitialization.
362This may be the hardest part, and the one most protected by NDA'd documents
363and chip errata. It's simplest if the hardware state hasn't changed since
364the suspend() was called, but that can't always be guaranteed.
365
366Drivers must also be prepared to notice that the device has been removed
367while the system was powered off, whenever that's physically possible.
368PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses
369where common Linux platforms will see such removal. Details of how drivers
370will notice and handle such removals are currently bus-specific, and often
371involve a separate thread.
372
373
374Note that the bus-specific runtime PM wakeup mechanism can exist, and might
375be defined to share some of the same driver code as for system wakeup. For
376example, a bus-specific device driver's resume() method might be used there,
377so it wouldn't only be called from bus.resume() during system-wide wakeup.
378See bus-specific information about how runtime wakeup events are handled.
379
380
381System Devices
382--------------
383System devices follow a slightly different API, which can be found in
384
385 include/linux/sysdev.h
386 drivers/base/sys.c
387
388System devices will only be suspended with interrupts disabled, and after
389all other devices have been suspended. On resume, they will be resumed
390before any other devices, and also with interrupts disabled.
391
392That is, IRQs are disabled, the suspend_late() phase begins, then the
393sysdev_driver.suspend() phase, and the system enters a sleep state. Then
394the sysdev_driver.resume() phase begins, followed by the resume_early()
395phase, after which IRQs are enabled.
396
397Code to actually enter and exit the system-wide low power state sometimes
398involves hardware details that are only known to the boot firmware, and
399may leave a CPU running software (from SRAM or flash memory) that monitors
400the system and manages its wakeup sequence.
401 579
402 580
403Runtime Power Management 581Runtime Power Management
@@ -407,82 +585,23 @@ running. This feature is useful for devices that are not being used, and
407can offer significant power savings on a running system. These devices 585can offer significant power savings on a running system. These devices
408often support a range of runtime power states, which might use names such 586often support a range of runtime power states, which might use names such
409as "off", "sleep", "idle", "active", and so on. Those states will in some 587as "off", "sleep", "idle", "active", and so on. Those states will in some
410cases (like PCI) be partially constrained by a bus the device uses, and will 588cases (like PCI) be partially constrained by the bus the device uses, and will
411usually include hardware states that are also used in system sleep states. 589usually include hardware states that are also used in system sleep states.
412 590
413However, note that if a driver puts a device into a runtime low power state 591A system-wide power transition can be started while some devices are in low
414and the system then goes into a system-wide sleep state, it normally ought 592power states due to runtime power management. The system sleep PM callbacks
415to resume into that runtime low power state rather than "full on". Such 593should recognize such situations and react to them appropriately, but the
416distinctions would be part of the driver-internal state machine for that 594necessary actions are subsystem-specific.
417hardware; the whole point of runtime power management is to be sure that 595
418drivers are decoupled in that way from the state machine governing phases 596In some cases the decision may be made at the subsystem level while in other
419of the system-wide power/sleep state transitions. 597cases the device driver may be left to decide. In some cases it may be
420 598desirable to leave a suspended device in that state during a system-wide power
421 599transition, but in other cases the device must be put back into the full-power
422Power Saving Techniques 600state temporarily, for example so that its system wakeup capability can be
423----------------------- 601disabled. This all depends on the hardware and the design of the subsystem and
424Normally runtime power management is handled by the drivers without specific 602device driver in question.
425userspace or kernel intervention, by device-aware use of techniques like: 603
426 604During system-wide resume from a sleep state it's best to put devices into the
427 Using information provided by other system layers 605full-power state, as explained in Documentation/power/runtime_pm.txt. Refer to
428 - stay deeply "off" except between open() and close() 606that document for more information regarding this particular issue as well as
429 - if transceiver/PHY indicates "nobody connected", stay "off" 607for information on the device runtime power management framework in general.
430 - application protocols may include power commands or hints
431
432 Using fewer CPU cycles
433 - using DMA instead of PIO
434 - removing timers, or making them lower frequency
435 - shortening "hot" code paths
436 - eliminating cache misses
437 - (sometimes) offloading work to device firmware
438
439 Reducing other resource costs
440 - gating off unused clocks in software (or hardware)
441 - switching off unused power supplies
442 - eliminating (or delaying/merging) IRQs
443 - tuning DMA to use word and/or burst modes
444
445 Using device-specific low power states
446 - using lower voltages
447 - avoiding needless DMA transfers
448
449Read your hardware documentation carefully to see the opportunities that
450may be available. If you can, measure the actual power usage and check
451it against the budget established for your project.
452
453
454Examples: USB hosts, system timer, system CPU
455----------------------------------------------
456USB host controllers make interesting, if complex, examples. In many cases
457these have no work to do: no USB devices are connected, or all of them are
458in the USB "suspend" state. Linux host controller drivers can then disable
459periodic DMA transfers that would otherwise be a constant power drain on the
460memory subsystem, and enter a suspend state. In power-aware controllers,
461entering that suspend state may disable the clock used with USB signaling,
462saving a certain amount of power.
463
464The controller will be woken from that state (with an IRQ) by changes to the
465signal state on the data lines of a given port, for example by an existing
466peripheral requesting "remote wakeup" or by plugging a new peripheral. The
467same wakeup mechanism usually works from "standby" sleep states, and on some
468systems also from "suspend to RAM" (or even "suspend to disk") states.
469(Except that ACPI may be involved instead of normal IRQs, on some hardware.)
470
471System devices like timers and CPUs may have special roles in the platform
472power management scheme. For example, system timers using a "dynamic tick"
473approach don't just save CPU cycles (by eliminating needless timer IRQs),
474but they may also open the door to using lower power CPU "idle" states that
475cost more than a jiffie to enter and exit. On x86 systems these are states
476like "C3"; note that periodic DMA transfers from a USB host controller will
477also prevent entry to a C3 state, much like a periodic timer IRQ.
478
479That kind of runtime mechanism interaction is common. "System On Chip" (SOC)
480processors often have low power idle modes that can't be entered unless
481certain medium-speed clocks (often 12 or 48 MHz) are gated off. When the
482drivers gate those clocks effectively, then the system idle task may be able
483to use the lower power idle modes and thereby increase battery life.
484
485If the CPU can have a "cpufreq" driver, there also may be opportunities
486to shift to lower voltage settings and reduce the power cost of executing
487a given number of instructions. (Without voltage adjustment, it's rare
488for cpufreq to save much power; the cost-per-instruction must go down.)
diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt
index dd8fe43888d3..62328d76b55b 100644
--- a/Documentation/power/pci.txt
+++ b/Documentation/power/pci.txt
@@ -1,299 +1,1025 @@
1
2PCI Power Management 1PCI Power Management
3~~~~~~~~~~~~~~~~~~~~
4 2
5An overview of the concepts and the related functions in the Linux kernel 3Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
4
5An overview of concepts and the Linux kernel's interfaces related to PCI power
6management. Based on previous work by Patrick Mochel <mochel@transmeta.com>
7(and others).
6 8
7Patrick Mochel <mochel@transmeta.com> 9This document only covers the aspects of power management specific to PCI
8(and others) 10devices. For general description of the kernel's interfaces related to device
11power management refer to Documentation/power/devices.txt and
12Documentation/power/runtime_pm.txt.
9 13
10--------------------------------------------------------------------------- 14---------------------------------------------------------------------------
11 15
121. Overview 161. Hardware and Platform Support for PCI Power Management
132. How the PCI Subsystem Does Power Management 172. PCI Subsystem and Device Power Management
143. PCI Utility Functions 183. PCI Device Drivers and Power Management
154. PCI Device Drivers 194. Resources
165. Resources 20
17 21
181. Overview 221. Hardware and Platform Support for PCI Power Management
19~~~~~~~~~~~ 23=========================================================
20 24
21The PCI Power Management Specification was introduced between the PCI 2.1 and 251.1. Native and Platform-Based Power Management
22PCI 2.2 Specifications. It a standard interface for controlling various 26-----------------------------------------------
23power management operations. 27In general, power management is a feature allowing one to save energy by putting
24 28devices into states in which they draw less power (low-power states) at the
25Implementation of the PCI PM Spec is optional, as are several sub-components of 29price of reduced functionality or performance.
26it. If a device supports the PCI PM Spec, the device will have an 8 byte 30
27capability field in its PCI configuration space. This field is used to describe 31Usually, a device is put into a low-power state when it is underutilized or
28and control the standard PCI power management features. 32completely inactive. However, when it is necessary to use the device once
29 33again, it has to be put back into the "fully functional" state (full-power
30The PCI PM spec defines 4 operating states for devices (D0 - D3) and for buses 34state). This may happen when there are some data for the device to handle or
31(B0 - B3). The higher the number, the less power the device consumes. However, 35as a result of an external event requiring the device to be active, which may
32the higher the number, the longer the latency is for the device to return to 36be signaled by the device itself.
33an operational state (D0). 37
34 38PCI devices may be put into low-power states in two ways, by using the device
35There are actually two D3 states. When someone talks about D3, they usually 39capabilities introduced by the PCI Bus Power Management Interface Specification,
36mean D3hot, which corresponds to an ACPI D2 state (power is reduced, the 40or with the help of platform firmware, such as an ACPI BIOS. In the first
37device may lose some context). But they may also mean D3cold, which is an 41approach, that is referred to as the native PCI power management (native PCI PM)
38ACPI D3 state (power is fully off, all state was discarded); or both. 42in what follows, the device power state is changed as a result of writing a
39 43specific value into one of its standard configuration registers. The second
40Bus power management is not covered in this version of this document. 44approach requires the platform firmware to provide special methods that may be
41 45used by the kernel to change the device's power state.
42Note that all PCI devices support D0 and D3cold by default, regardless of 46
43whether or not they implement any of the PCI PM spec. 47Devices supporting the native PCI PM usually can generate wakeup signals called
44 48Power Management Events (PMEs) to let the kernel know about external events
45The possible state transitions that a device can undergo are: 49requiring the device to be active. After receiving a PME the kernel is supposed
46 50to put the device that sent it into the full-power state. However, the PCI Bus
47+---------------------------+ 51Power Management Interface Specification doesn't define any standard method of
48| Current State | New State | 52delivering the PME from the device to the CPU and the operating system kernel.
49+---------------------------+ 53It is assumed that the platform firmware will perform this task and therefore,
50| D0 | D1, D2, D3| 54even though a PCI device is set up to generate PMEs, it also may be necessary to
51+---------------------------+ 55prepare the platform firmware for notifying the CPU of the PMEs coming from the
52| D1 | D2, D3 | 56device (e.g. by generating interrupts).
53+---------------------------+ 57
54| D2 | D3 | 58In turn, if the methods provided by the platform firmware are used for changing
55+---------------------------+ 59the power state of a device, usually the platform also provides a method for
56| D1, D2, D3 | D0 | 60preparing the device to generate wakeup signals. In that case, however, it
57+---------------------------+ 61often also is necessary to prepare the device for generating PMEs using the
58 62native PCI PM mechanism, because the method provided by the platform depends on
59Note that when the system is entering a global suspend state, all devices will 63that.
60be placed into D3 and when resuming, all devices will be placed into D0. 64
61However, when the system is running, other state transitions are possible. 65Thus in many situations both the native and the platform-based power management
62 66mechanisms have to be used simultaneously to obtain the desired result.
632. How The PCI Subsystem Handles Power Management 67
64~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 681.2. Native PCI Power Management
65 69--------------------------------
66The PCI suspend/resume functionality is accessed indirectly via the Power 70The PCI Bus Power Management Interface Specification (PCI PM Spec) was
67Management subsystem. At boot, the PCI driver registers a power management 71introduced between the PCI 2.1 and PCI 2.2 Specifications. It defined a
68callback with that layer. Upon entering a suspend state, the PM layer iterates 72standard interface for performing various operations related to power
69through all of its registered callbacks. This currently takes place only during 73management.
70APM state transitions. 74
71 75The implementation of the PCI PM Spec is optional for conventional PCI devices,
72Upon going to sleep, the PCI subsystem walks its device tree twice. Both times, 76but it is mandatory for PCI Express devices. If a device supports the PCI PM
73it does a depth first walk of the device tree. The first walk saves each of the 77Spec, it has an 8 byte power management capability field in its PCI
74device's state and checks for devices that will prevent the system from entering 78configuration space. This field is used to describe and control the standard
75a global power state. The next walk then places the devices in a low power 79features related to the native PCI power management.
80
81The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses
82(B0-B3). The higher the number, the less power is drawn by the device or bus
83in that state. However, the higher the number, the longer the latency for
84the device or bus to return to the full-power state (D0 or B0, respectively).
85
86There are two variants of the D3 state defined by the specification. The first
87one is D3hot, referred to as the software accessible D3, because devices can be
88programmed to go into it. The second one, D3cold, is the state that PCI devices
89are in when the supply voltage (Vcc) is removed from them. It is not possible
90to program a PCI device to go into D3cold, although there may be a programmable
91interface for putting the bus the device is on into a state in which Vcc is
92removed from all devices on the bus.
93
94PCI bus power management, however, is not supported by the Linux kernel at the
95time of this writing and therefore it is not covered by this document.
96
97Note that every PCI device can be in the full-power state (D0) or in D3cold,
98regardless of whether or not it implements the PCI PM Spec. In addition to
99that, if the PCI PM Spec is implemented by the device, it must support D3hot
100as well as D0. The support for the D1 and D2 power states is optional.
101
102PCI devices supporting the PCI PM Spec can be programmed to go to any of the
103supported low-power states (except for D3cold). While in D1-D3hot the
104standard configuration registers of the device must be accessible to software
105(i.e. the device is required to respond to PCI configuration accesses), although
106its I/O and memory spaces are then disabled. This allows the device to be
107programmatically put into D0. Thus the kernel can switch the device back and
108forth between D0 and the supported low-power states (except for D3cold) and the
109possible power state transitions the device can undergo are the following:
110
111+----------------------------+
112| Current State | New State |
113+----------------------------+
114| D0 | D1, D2, D3 |
115+----------------------------+
116| D1 | D2, D3 |
117+----------------------------+
118| D2 | D3 |
119+----------------------------+
120| D1, D2, D3 | D0 |
121+----------------------------+
122
123The transition from D3cold to D0 occurs when the supply voltage is provided to
124the device (i.e. power is restored). In that case the device returns to D0 with
125a full power-on reset sequence and the power-on defaults are restored to the
126device by hardware just as at initial power up.
127
128PCI devices supporting the PCI PM Spec can be programmed to generate PMEs
129while in a low-power state (D1-D3), but they are not required to be capable
130of generating PMEs from all supported low-power states. In particular, the
131capability of generating PMEs from D3cold is optional and depends on the
132presence of additional voltage (3.3Vaux) allowing the device to remain
133sufficiently active to generate a wakeup signal.
134
1351.3. ACPI Device Power Management
136---------------------------------
137The platform firmware support for the power management of PCI devices is
138system-specific. However, if the system in question is compliant with the
139Advanced Configuration and Power Interface (ACPI) Specification, like the
140majority of x86-based systems, it is supposed to implement device power
141management interfaces defined by the ACPI standard.
142
143For this purpose the ACPI BIOS provides special functions called "control
144methods" that may be executed by the kernel to perform specific tasks, such as
145putting a device into a low-power state. These control methods are encoded
146using special byte-code language called the ACPI Machine Language (AML) and
147stored in the machine's BIOS. The kernel loads them from the BIOS and executes
148them as needed using an AML interpreter that translates the AML byte code into
149computations and memory or I/O space accesses. This way, in theory, a BIOS
150writer can provide the kernel with a means to perform actions depending
151on the system design in a system-specific fashion.
152
153ACPI control methods may be divided into global control methods, that are not
154associated with any particular devices, and device control methods, that have
155to be defined separately for each device supposed to be handled with the help of
156the platform. This means, in particular, that ACPI device control methods can
157only be used to handle devices that the BIOS writer knew about in advance. The
158ACPI methods used for device power management fall into that category.
159
160The ACPI specification assumes that devices can be in one of four power states
161labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM
162D0-D3 states (although the difference between D3hot and D3cold is not taken
163into account by ACPI). Moreover, for each power state of a device there is a
164set of power resources that have to be enabled for the device to be put into
165that state. These power resources are controlled (i.e. enabled or disabled)
166with the help of their own control methods, _ON and _OFF, that have to be
167defined individually for each of them.
168
169To put a device into the ACPI power state Dx (where x is a number between 0 and
1703 inclusive) the kernel is supposed to (1) enable the power resources required
171by the device in this state using their _ON control methods and (2) execute the
172_PSx control method defined for the device. In addition to that, if the device
173is going to be put into a low-power state (D1-D3) and is supposed to generate
174wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI
1753.0) control method defined for it has to be executed before _PSx. Power
176resources that are not required by the device in the target power state and are
177not required any more by any other device should be disabled (by executing their
178_OFF control methods). If the current power state of the device is D3, it can
179only be put into D0 this way.
180
181However, quite often the power states of devices are changed during a
182system-wide transition into a sleep state or back into the working state. ACPI
183defines four system sleep states, S1, S2, S3, and S4, and denotes the system
184working state as S0. In general, the target system sleep (or working) state
185determines the highest power (lowest number) state the device can be put
186into and the kernel is supposed to obtain this information by executing the
187device's _SxD control method (where x is a number between 0 and 4 inclusive).
188If the device is required to wake up the system from the target sleep state, the
189lowest power (highest number) state it can be put into is also determined by the
190target state of the system. The kernel is then supposed to use the device's
191_SxW control method to obtain the number of that state. It also is supposed to
192use the device's _PRW control method to learn which power resources need to be
193enabled for the device to be able to generate wakeup signals.
194
1951.4. Wakeup Signaling
196---------------------
197Wakeup signals generated by PCI devices, either as native PCI PMEs, or as
198a result of the execution of the _DSW (or _PSW) ACPI control method before
199putting the device into a low-power state, have to be caught and handled as
200appropriate. If they are sent while the system is in the working state
201(ACPI S0), they should be translated into interrupts so that the kernel can
202put the devices generating them into the full-power state and take care of the
203events that triggered them. In turn, if they are sent while the system is
204sleeping, they should cause the system's core logic to trigger wakeup.
205
206On ACPI-based systems wakeup signals sent by conventional PCI devices are
207converted into ACPI General-Purpose Events (GPEs) which are hardware signals
208from the system core logic generated in response to various events that need to
209be acted upon. Every GPE is associated with one or more sources of potentially
210interesting events. In particular, a GPE may be associated with a PCI device
211capable of signaling wakeup. The information on the connections between GPEs
212and event sources is recorded in the system's ACPI BIOS from where it can be
213read by the kernel.
214
215If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE
216associated with it (if there is one) is triggered. The GPEs associated with PCI
217bridges may also be triggered in response to a wakeup signal from one of the
218devices below the bridge (this also is the case for root bridges) and, for
219example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be
220handled this way.
221
222A GPE may be triggered when the system is sleeping (i.e. when it is in one of
223the ACPI S1-S4 states), in which case system wakeup is started by its core logic
224(the device that was the source of the signal causing the system wakeup to occur
225may be identified later). The GPEs used in such situations are referred to as
226wakeup GPEs.
227
228Usually, however, GPEs are also triggered when the system is in the working
229state (ACPI S0) and in that case the system's core logic generates a System
230Control Interrupt (SCI) to notify the kernel of the event. Then, the SCI
231handler identifies the GPE that caused the interrupt to be generated which,
232in turn, allows the kernel to identify the source of the event (that may be
233a PCI device signaling wakeup). The GPEs used for notifying the kernel of
234events occurring while the system is in the working state are referred to as
235runtime GPEs.
236
237Unfortunately, there is no standard way of handling wakeup signals sent by
238conventional PCI devices on systems that are not ACPI-based, but there is one
239for PCI Express devices. Namely, the PCI Express Base Specification introduced
240a native mechanism for converting native PCI PMEs into interrupts generated by
241root ports. For conventional PCI devices native PMEs are out-of-band, so they
242are routed separately and they need not pass through bridges (in principle they
243may be routed directly to the system's core logic), but for PCI Express devices
244they are in-band messages that have to pass through the PCI Express hierarchy,
245including the root port on the path from the device to the Root Complex. Thus
246it was possible to introduce a mechanism by which a root port generates an
247interrupt whenever it receives a PME message from one of the devices below it.
248The PCI Express Requester ID of the device that sent the PME message is then
249recorded in one of the root port's configuration registers from where it may be
250read by the interrupt handler allowing the device to be identified. [PME
251messages sent by PCI Express endpoints integrated with the Root Complex don't
252pass through root ports, but instead they cause a Root Complex Event Collector
253(if there is one) to generate interrupts.]
254
255In principle the native PCI Express PME signaling may also be used on ACPI-based
256systems along with the GPEs, but to use it the kernel has to ask the system's
257ACPI BIOS to release control of root port configuration registers. The ACPI
258BIOS, however, is not required to allow the kernel to control these registers
259and if it doesn't do that, the kernel must not modify their contents. Of course
260the native PCI Express PME signaling cannot be used by the kernel in that case.
261
262
2632. PCI Subsystem and Device Power Management
264============================================
265
2662.1. Device Power Management Callbacks
267--------------------------------------
268The PCI Subsystem participates in the power management of PCI devices in a
269number of ways. First of all, it provides an intermediate code layer between
270the device power management core (PM core) and PCI device drivers.
271Specifically, the pm field of the PCI subsystem's struct bus_type object,
272pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing
273pointers to several device power management callbacks:
274
275const struct dev_pm_ops pci_dev_pm_ops = {
276 .prepare = pci_pm_prepare,
277 .complete = pci_pm_complete,
278 .suspend = pci_pm_suspend,
279 .resume = pci_pm_resume,
280 .freeze = pci_pm_freeze,
281 .thaw = pci_pm_thaw,
282 .poweroff = pci_pm_poweroff,
283 .restore = pci_pm_restore,
284 .suspend_noirq = pci_pm_suspend_noirq,
285 .resume_noirq = pci_pm_resume_noirq,
286 .freeze_noirq = pci_pm_freeze_noirq,
287 .thaw_noirq = pci_pm_thaw_noirq,
288 .poweroff_noirq = pci_pm_poweroff_noirq,
289 .restore_noirq = pci_pm_restore_noirq,
290 .runtime_suspend = pci_pm_runtime_suspend,
291 .runtime_resume = pci_pm_runtime_resume,
292 .runtime_idle = pci_pm_runtime_idle,
293};
294
295These callbacks are executed by the PM core in various situations related to
296device power management and they, in turn, execute power management callbacks
297provided by PCI device drivers. They also perform power management operations
298involving some standard configuration registers of PCI devices that device
299drivers need not know or care about.
300
301The structure representing a PCI device, struct pci_dev, contains several fields
302that these callbacks operate on:
303
304struct pci_dev {
305 ...
306 pci_power_t current_state; /* Current operating state. */
307 int pm_cap; /* PM capability offset in the
308 configuration space */
309 unsigned int pme_support:5; /* Bitmask of states from which PME#
310 can be generated */
311 unsigned int pme_interrupt:1;/* Is native PCIe PME signaling used? */
312 unsigned int d1_support:1; /* Low power state D1 is supported */
313 unsigned int d2_support:1; /* Low power state D2 is supported */
314 unsigned int no_d1d2:1; /* D1 and D2 are forbidden */
315 unsigned int wakeup_prepared:1; /* Device prepared for wake up */
316 unsigned int d3_delay; /* D3->D0 transition time in ms */
317 ...
318};
319
320They also indirectly use some fields of the struct device that is embedded in
321struct pci_dev.
322
3232.2. Device Initialization
324--------------------------
325The PCI subsystem's first task related to device power management is to
326prepare the device for power management and initialize the fields of struct
327pci_dev used for this purpose. This happens in two functions defined in
328drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init().
329
330The first of these functions checks if the device supports native PCI PM
331and if that's the case the offset of its power management capability structure
332in the configuration space is stored in the pm_cap field of the device's struct
333pci_dev object. Next, the function checks which PCI low-power states are
334supported by the device and from which low-power states the device can generate
335native PCI PMEs. The power management fields of the device's struct pci_dev and
336the struct device embedded in it are updated accordingly and the generation of
337PMEs by the device is disabled.
338
339The second function checks if the device can be prepared to signal wakeup with
340the help of the platform firmware, such as the ACPI BIOS. If that is the case,
341the function updates the wakeup fields in struct device embedded in the
342device's struct pci_dev and uses the firmware-provided method to prevent the
343device from signaling wakeup.
344
345At this point the device is ready for power management. For driverless devices,
346however, this functionality is limited to a few basic operations carried out
347during system-wide transitions to a sleep state and back to the working state.
348
3492.3. Runtime Device Power Management
350------------------------------------
351The PCI subsystem plays a vital role in the runtime power management of PCI
352devices. For this purpose it uses the general runtime power management
353(runtime PM) framework described in Documentation/power/runtime_pm.txt.
354Namely, it provides subsystem-level callbacks:
355
356 pci_pm_runtime_suspend()
357 pci_pm_runtime_resume()
358 pci_pm_runtime_idle()
359
360that are executed by the core runtime PM routines. It also implements the
361entire mechanics necessary for handling runtime wakeup signals from PCI devices
362in low-power states, which at the time of this writing works for both the native
363PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in
364Section 1.
365
366First, a PCI device is put into a low-power state, or suspended, with the help
367of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call
368pci_pm_runtime_suspend() to do the actual job. For this to work, the device's
369driver has to provide a pm->runtime_suspend() callback (see below), which is
370run by pci_pm_runtime_suspend() as the first action. If the driver's callback
371returns successfully, the device's standard configuration registers are saved,
372the device is prepared to generate wakeup signals and, finally, it is put into
373the target low-power state.
374
375The low-power state to put the device into is the lowest-power (highest number)
376state from which it can signal wakeup. The exact method of signaling wakeup is
377system-dependent and is determined by the PCI subsystem on the basis of the
378reported capabilities of the device and the platform firmware. To prepare the
379device for signaling wakeup and put it into the selected low-power state, the
380PCI subsystem can use the platform firmware as well as the device's native PCI
381PM capabilities, if supported.
382
383It is expected that the device driver's pm->runtime_suspend() callback will
384not attempt to prepare the device for signaling wakeup or to put it into a
385low-power state. The driver ought to leave these tasks to the PCI subsystem
386that has all of the information necessary to perform them.
387
388A suspended device is brought back into the "active" state, or resumed,
389with the help of pm_request_resume() or pm_runtime_resume() which both call
390pci_pm_runtime_resume() for PCI devices. Again, this only works if the device's
391driver provides a pm->runtime_resume() callback (see below). However, before
392the driver's callback is executed, pci_pm_runtime_resume() brings the device
393back into the full-power state, prevents it from signaling wakeup while in that
394state and restores its standard configuration registers. Thus the driver's
395callback need not worry about the PCI-specific aspects of the device resume.
396
397Note that generally pci_pm_runtime_resume() may be called in two different
398situations. First, it may be called at the request of the device's driver, for
399example if there are some data for it to process. Second, it may be called
400as a result of a wakeup signal from the device itself (this sometimes is
401referred to as "remote wakeup"). Of course, for this purpose the wakeup signal
402is handled in one of the ways described in Section 1 and finally converted into
403a notification for the PCI subsystem after the source device has been
404identified.
405
406The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle()
407and pm_request_idle(), executes the device driver's pm->runtime_idle()
408callback, if defined, and if that callback doesn't return error code (or is not
409present at all), suspends the device with the help of pm_runtime_suspend().
410Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for
411example, it is called right after the device has just been resumed), in which
412cases it is expected to suspend the device if that makes sense. Usually,
413however, the PCI subsystem doesn't really know if the device really can be
414suspended, so it lets the device's driver decide by running its
415pm->runtime_idle() callback.
416
4172.4. System-Wide Power Transitions
418----------------------------------
419There are a few different types of system-wide power transitions, described in
420Documentation/power/devices.txt. Each of them requires devices to be handled
421in a specific way and the PM core executes subsystem-level power management
422callbacks for this purpose. They are executed in phases such that each phase
423involves executing the same subsystem-level callback for every device belonging
424to the given subsystem before the next phase begins. These phases always run
425after tasks have been frozen.
426
4272.4.1. System Suspend
428
429When the system is going into a sleep state in which the contents of memory will
430be preserved, such as one of the ACPI sleep states S1-S3, the phases are:
431
432 prepare, suspend, suspend_noirq.
433
434The following PCI bus type's callbacks, respectively, are used in these phases:
435
436 pci_pm_prepare()
437 pci_pm_suspend()
438 pci_pm_suspend_noirq()
439
440The pci_pm_prepare() routine first puts the device into the "fully functional"
441state with the help of pm_runtime_resume(). Then, it executes the device
442driver's pm->prepare() callback if defined (i.e. if the driver's struct
443dev_pm_ops object is present and the prepare pointer in that object is valid).
444
445The pci_pm_suspend() routine first checks if the device's driver implements
446legacy PCI suspend routines (see Section 3), in which case the driver's legacy
447suspend callback is executed, if present, and its result is returned. Next, if
448the device's driver doesn't provide a struct dev_pm_ops object (containing
449pointers to the driver's callbacks), pci_pm_default_suspend() is called, which
450simply turns off the device's bus master capability and runs
451pcibios_disable_device() to disable it, unless the device is a bridge (PCI
452bridges are ignored by this routine). Next, the device driver's pm->suspend()
453callback is executed, if defined, and its result is returned if it fails.
454Finally, pci_fixup_device() is called to apply hardware suspend quirks related
455to the device if necessary.
456
457Note that the suspend phase is carried out asynchronously for PCI devices, so
458the pci_pm_suspend() callback may be executed in parallel for any pair of PCI
459devices that don't depend on each other in a known way (i.e. none of the paths
460in the device tree from the root bridge to a leaf device contains both of them).
461
462The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has
463been called, which means that the device driver's interrupt handler won't be
464invoked while this routine is running. It first checks if the device's driver
465implements legacy PCI suspends routines (Section 3), in which case the legacy
466late suspend routine is called and its result is returned (the standard
467configuration registers of the device are saved if the driver's callback hasn't
468done that). Second, if the device driver's struct dev_pm_ops object is not
469present, the device's standard configuration registers are saved and the routine
470returns success. Otherwise the device driver's pm->suspend_noirq() callback is
471executed, if present, and its result is returned if it fails. Next, if the
472device's standard configuration registers haven't been saved yet (one of the
473device driver's callbacks executed before might do that), pci_pm_suspend_noirq()
474saves them, prepares the device to signal wakeup (if necessary) and puts it into
475a low-power state.
476
477The low-power state to put the device into is the lowest-power (highest number)
478state from which it can signal wakeup while the system is in the target sleep
479state. Just like in the runtime PM case described above, the mechanism of
480signaling wakeup is system-dependent and determined by the PCI subsystem, which
481is also responsible for preparing the device to signal wakeup from the system's
482target sleep state as appropriate.
483
484PCI device drivers (that don't implement legacy power management callbacks) are
485generally not expected to prepare devices for signaling wakeup or to put them
486into low-power states. However, if one of the driver's suspend callbacks
487(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration
488registers, pci_pm_suspend_noirq() will assume that the device has been prepared
489to signal wakeup and put into a low-power state by the driver (the driver is
490then assumed to have used the helper functions provided by the PCI subsystem for
491this purpose). PCI device drivers are not encouraged to do that, but in some
492rare cases doing that in the driver may be the optimum approach.
493
4942.4.2. System Resume
495
496When the system is undergoing a transition from a sleep state in which the
497contents of memory have been preserved, such as one of the ACPI sleep states
498S1-S3, into the working state (ACPI S0), the phases are:
499
500 resume_noirq, resume, complete.
501
502The following PCI bus type's callbacks, respectively, are executed in these
503phases:
504
505 pci_pm_resume_noirq()
506 pci_pm_resume()
507 pci_pm_complete()
508
509The pci_pm_resume_noirq() routine first puts the device into the full-power
510state, restores its standard configuration registers and applies early resume
511hardware quirks related to the device, if necessary. This is done
512unconditionally, regardless of whether or not the device's driver implements
513legacy PCI power management callbacks (this way all PCI devices are in the
514full-power state and their standard configuration registers have been restored
515when their interrupt handlers are invoked for the first time during resume,
516which allows the kernel to avoid problems with the handling of shared interrupts
517by drivers whose devices are still suspended). If legacy PCI power management
518callbacks (see Section 3) are implemented by the device's driver, the legacy
519early resume callback is executed and its result is returned. Otherwise, the
520device driver's pm->resume_noirq() callback is executed, if defined, and its
521result is returned.
522
523The pci_pm_resume() routine first checks if the device's standard configuration
524registers have been restored and restores them if that's not the case (this
525only is necessary in the error path during a failing suspend). Next, resume
526hardware quirks related to the device are applied, if necessary, and if the
527device's driver implements legacy PCI power management callbacks (see
528Section 3), the driver's legacy resume callback is executed and its result is
529returned. Otherwise, the device's wakeup signaling mechanisms are blocked and
530its driver's pm->resume() callback is executed, if defined (the callback's
531result is then returned).
532
533The resume phase is carried out asynchronously for PCI devices, like the
534suspend phase described above, which means that if two PCI devices don't depend
535on each other in a known way, the pci_pm_resume() routine may be executed for
536the both of them in parallel.
537
538The pci_pm_complete() routine only executes the device driver's pm->complete()
539callback, if defined.
540
5412.4.3. System Hibernation
542
543System hibernation is more complicated than system suspend, because it requires
544a system image to be created and written into a persistent storage medium. The
545image is created atomically and all devices are quiesced, or frozen, before that
546happens.
547
548The freezing of devices is carried out after enough memory has been freed (at
549the time of this writing the image creation requires at least 50% of system RAM
550to be free) in the following three phases:
551
552 prepare, freeze, freeze_noirq
553
554that correspond to the PCI bus type's callbacks:
555
556 pci_pm_prepare()
557 pci_pm_freeze()
558 pci_pm_freeze_noirq()
559
560This means that the prepare phase is exactly the same as for system suspend.
561The other two phases, however, are different.
562
563The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs
564the device driver's pm->freeze() callback, if defined, instead of pm->suspend(),
565and it doesn't apply the suspend-related hardware quirks. It is executed
566asynchronously for different PCI devices that don't depend on each other in a
567known way.
568
569The pci_pm_freeze_noirq() routine, in turn, is similar to
570pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq()
571routine instead of pm->suspend_noirq(). It also doesn't attempt to prepare the
572device for signaling wakeup and put it into a low-power state. Still, it saves
573the device's standard configuration registers if they haven't been saved by one
574of the driver's callbacks.
575
576Once the image has been created, it has to be saved. However, at this point all
577devices are frozen and they cannot handle I/O, while their ability to handle
578I/O is obviously necessary for the image saving. Thus they have to be brought
579back to the fully functional state and this is done in the following phases:
580
581 thaw_noirq, thaw, complete
582
583using the following PCI bus type's callbacks:
584
585 pci_pm_thaw_noirq()
586 pci_pm_thaw()
587 pci_pm_complete()
588
589respectively.
590
591The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(),
592but it doesn't put the device into the full power state and doesn't attempt to
593restore its standard configuration registers. It also executes the device
594driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq().
595
596The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device
597driver's pm->thaw() callback instead of pm->resume(). It is executed
598asynchronously for different PCI devices that don't depend on each other in a
599known way.
600
601The complete phase it the same as for system resume.
602
603After saving the image, devices need to be powered down before the system can
604enter the target sleep state (ACPI S4 for ACPI-based systems). This is done in
605three phases:
606
607 prepare, poweroff, poweroff_noirq
608
609where the prepare phase is exactly the same as for system suspend. The other
610two phases are analogous to the suspend and suspend_noirq phases, respectively.
611The PCI subsystem-level callbacks they correspond to
612
613 pci_pm_poweroff()
614 pci_pm_poweroff_noirq()
615
616work in analogy with pci_pm_suspend() and pci_pm_poweroff_noirq(), respectively,
617although they don't attempt to save the device's standard configuration
618registers.
619
6202.4.4. System Restore
621
622System restore requires a hibernation image to be loaded into memory and the
623pre-hibernation memory contents to be restored before the pre-hibernation system
624activity can be resumed.
625
626As described in Documentation/power/devices.txt, the hibernation image is loaded
627into memory by a fresh instance of the kernel, called the boot kernel, which in
628turn is loaded and run by a boot loader in the usual way. After the boot kernel
629has loaded the image, it needs to replace its own code and data with the code
630and data of the "hibernated" kernel stored within the image, called the image
631kernel. For this purpose all devices are frozen just like before creating
632the image during hibernation, in the
633
634 prepare, freeze, freeze_noirq
635
636phases described above. However, the devices affected by these phases are only
637those having drivers in the boot kernel; other devices will still be in whatever
638state the boot loader left them.
639
640Should the restoration of the pre-hibernation memory contents fail, the boot
641kernel would go through the "thawing" procedure described above, using the
642thaw_noirq, thaw, and complete phases (that will only affect the devices having
643drivers in the boot kernel), and then continue running normally.
644
645If the pre-hibernation memory contents are restored successfully, which is the
646usual situation, control is passed to the image kernel, which then becomes
647responsible for bringing the system back to the working state. To achieve this,
648it must restore the devices' pre-hibernation functionality, which is done much
649like waking up from the memory sleep state, although it involves different
650phases:
651
652 restore_noirq, restore, complete
653
654The first two of these are analogous to the resume_noirq and resume phases
655described above, respectively, and correspond to the following PCI subsystem
656callbacks:
657
658 pci_pm_restore_noirq()
659 pci_pm_restore()
660
661These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(),
662respectively, but they execute the device driver's pm->restore_noirq() and
663pm->restore() callbacks, if available.
664
665The complete phase is carried out in exactly the same way as during system
666resume.
667
668
6693. PCI Device Drivers and Power Management
670==========================================
671
6723.1. Power Management Callbacks
673-------------------------------
674PCI device drivers participate in power management by providing callbacks to be
675executed by the PCI subsystem's power management routines described above and by
676controlling the runtime power management of their devices.
677
678At the time of this writing there are two ways to define power management
679callbacks for a PCI device driver, the recommended one, based on using a
680dev_pm_ops structure described in Documentation/power/devices.txt, and the
681"legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and
682.resume() callbacks from struct pci_driver are used. The legacy approach,
683however, doesn't allow one to define runtime power management callbacks and is
684not really suitable for any new drivers. Therefore it is not covered by this
685document (refer to the source code to learn more about it).
686
687It is recommended that all PCI device drivers define a struct dev_pm_ops object
688containing pointers to power management (PM) callbacks that will be executed by
689the PCI subsystem's PM routines in various circumstances. A pointer to the
690driver's struct dev_pm_ops object has to be assigned to the driver.pm field in
691its struct pci_driver object. Once that has happened, the "legacy" PM callbacks
692in struct pci_driver are ignored (even if they are not NULL).
693
694The PM callbacks in struct dev_pm_ops are not mandatory and if they are not
695defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI
696subsystem will handle the device in a simplified default manner. If they are
697defined, though, they are expected to behave as described in the following
698subsections.
699
7003.1.1. prepare()
701
702The prepare() callback is executed during system suspend, during hibernation
703(when a hibernation image is about to be created), during power-off after
704saving a hibernation image and during system restore, when a hibernation image
705has just been loaded into memory.
706
707This callback is only necessary if the driver's device has children that in
708general may be registered at any time. In that case the role of the prepare()
709callback is to prevent new children of the device from being registered until
710one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run.
711
712In addition to that the prepare() callback may carry out some operations
713preparing the device to be suspended, although it should not allocate memory
714(if additional memory is required to suspend the device, it has to be
715preallocated earlier, for example in a suspend/hibernate notifier as described
716in Documentation/power/notifiers.txt).
717
7183.1.2. suspend()
719
720The suspend() callback is only executed during system suspend, after prepare()
721callbacks have been executed for all devices in the system.
722
723This callback is expected to quiesce the device and prepare it to be put into a
724low-power state by the PCI subsystem. It is not required (in fact it even is
725not recommended) that a PCI driver's suspend() callback save the standard
726configuration registers of the device, prepare it for waking up the system, or
727put it into a low-power state. All of these operations can very well be taken
728care of by the PCI subsystem, without the driver's participation.
729
730However, in some rare case it is convenient to carry out these operations in
731a PCI driver. Then, pci_save_state(), pci_prepare_to_sleep(), and
732pci_set_power_state() should be used to save the device's standard configuration
733registers, to prepare it for system wakeup (if necessary), and to put it into a
734low-power state, respectively. Moreover, if the driver calls pci_save_state(),
735the PCI subsystem will not execute either pci_prepare_to_sleep(), or
736pci_set_power_state() for its device, so the driver is then responsible for
737handling the device as appropriate.
738
739While the suspend() callback is being executed, the driver's interrupt handler
740can be invoked to handle an interrupt from the device, so all suspend-related
741operations relying on the driver's ability to handle interrupts should be
742carried out in this callback.
743
7443.1.3. suspend_noirq()
745
746The suspend_noirq() callback is only executed during system suspend, after
747suspend() callbacks have been executed for all devices in the system and
748after device interrupts have been disabled by the PM core.
749
750The difference between suspend_noirq() and suspend() is that the driver's
751interrupt handler will not be invoked while suspend_noirq() is running. Thus
752suspend_noirq() can carry out operations that would cause race conditions to
753arise if they were performed in suspend().
754
7553.1.4. freeze()
756
757The freeze() callback is hibernation-specific and is executed in two situations,
758during hibernation, after prepare() callbacks have been executed for all devices
759in preparation for the creation of a system image, and during restore,
760after a system image has been loaded into memory from persistent storage and the
761prepare() callbacks have been executed for all devices.
762
763The role of this callback is analogous to the role of the suspend() callback
764described above. In fact, they only need to be different in the rare cases when
765the driver takes the responsibility for putting the device into a low-power
76state. 766state.
77 767
78The first walk allows a graceful recovery in the event of a failure, since none 768In that cases the freeze() callback should not prepare the device system wakeup
79of the devices have actually been powered down. 769or put it into a low-power state. Still, either it or freeze_noirq() should
80 770save the device's standard configuration registers using pci_save_state().
81In both walks, in particular the second, all children of a bridge are touched
82before the actual bridge itself. This allows the bridge to retain power while
83its children are being accessed.
84
85Upon resuming from sleep, just the opposite must be true: all bridges must be
86powered on and restored before their children are powered on. This is easily
87accomplished with a breadth-first walk of the PCI device tree.
88
89
903. PCI Utility Functions
91~~~~~~~~~~~~~~~~~~~~~~~~
92
93These are helper functions designed to be called by individual device drivers.
94Assuming that a device behaves as advertised, these should be applicable in most
95cases. However, results may vary.
96
97Note that these functions are never implicitly called for the driver. The driver
98is always responsible for deciding when and if to call these.
99
100
101pci_save_state
102--------------
103
104Usage:
105 pci_save_state(struct pci_dev *dev);
106
107Description:
108 Save first 64 bytes of PCI config space, along with any additional
109 PCI-Express or PCI-X information.
110
111
112pci_restore_state
113-----------------
114
115Usage:
116 pci_restore_state(struct pci_dev *dev);
117
118Description:
119 Restore previously saved config space.
120
121
122pci_set_power_state
123-------------------
124
125Usage:
126 pci_set_power_state(struct pci_dev *dev, pci_power_t state);
127
128Description:
129 Transition device to low power state using PCI PM Capabilities
130 registers.
131
132 Will fail under one of the following conditions:
133 - If state is less than current state, but not D0 (illegal transition)
134 - Device doesn't support PM Capabilities
135 - Device does not support requested state
136
137
138pci_enable_wake
139---------------
140
141Usage:
142 pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable);
143
144Description:
145 Enable device to generate PME# during low power state using PCI PM
146 Capabilities.
147
148 Checks whether if device supports generating PME# from requested state
149 and fail if it does not, unless enable == 0 (request is to disable wake
150 events, which is implicit if it doesn't even support it in the first
151 place).
152
153 Note that the PMC Register in the device's PM Capabilities has a bitmask
154 of the states it supports generating PME# from. D3hot is bit 3 and
155 D3cold is bit 4. So, while a value of 4 as the state may not seem
156 semantically correct, it is.
157
158
1594. PCI Device Drivers
160~~~~~~~~~~~~~~~~~~~~~
161
162These functions are intended for use by individual drivers, and are defined in
163struct pci_driver:
164
165 int (*suspend) (struct pci_dev *dev, pm_message_t state);
166 int (*resume) (struct pci_dev *dev);
167
168
169suspend
170-------
171
172Usage:
173
174if (dev->driver && dev->driver->suspend)
175 dev->driver->suspend(dev,state);
176
177A driver uses this function to actually transition the device into a low power
178state. This should include disabling I/O, IRQs, and bus-mastering, as well as
179physically transitioning the device to a lower power state; it may also include
180calls to pci_enable_wake().
181
182Bus mastering may be disabled by doing:
183
184pci_disable_device(dev);
185
186For devices that support the PCI PM Spec, this may be used to set the device's
187power state to match the suspend() parameter:
188
189pci_set_power_state(dev,state);
190
191The driver is also responsible for disabling any other device-specific features
192(e.g blanking screen, turning off on-card memory, etc).
193
194The driver should be sure to track the current state of the device, as it may
195obviate the need for some operations.
196
197The driver should update the current_state field in its pci_dev structure in
198this function, except for PM-capable devices when pci_set_power_state is used.
199
200resume
201------
202
203Usage:
204
205if (dev->driver && dev->driver->resume)
206 dev->driver->resume(dev)
207 771
208The resume callback may be called from any power state, and is always meant to 7723.1.5. freeze_noirq()
209transition the device to the D0 state.
210 773
211The driver is responsible for reenabling any features of the device that had 774The freeze_noirq() callback is hibernation-specific. It is executed during
212been disabled during previous suspend calls, such as IRQs and bus mastering, 775hibernation, after prepare() and freeze() callbacks have been executed for all
213as well as calling pci_restore_state(). 776devices in preparation for the creation of a system image, and during restore,
777after a system image has been loaded into memory and after prepare() and
778freeze() callbacks have been executed for all devices. It is always executed
779after device interrupts have been disabled by the PM core.
214 780
215If the device is currently in D3, it may need to be reinitialized in resume(). 781The role of this callback is analogous to the role of the suspend_noirq()
782callback described above and it very rarely is necessary to define
783freeze_noirq().
216 784
217 * Some types of devices, like bus controllers, will preserve context in D3hot 785The difference between freeze_noirq() and freeze() is analogous to the
218 (using Vcc power). Their drivers will often want to avoid re-initializing 786difference between suspend_noirq() and suspend().
219 them after re-entering D0 (perhaps to avoid resetting downstream devices).
220 787
221 * Other kinds of devices in D3hot will discard device context as part of a 7883.1.6. poweroff()
222 soft reset when re-entering the D0 state.
223
224 * Devices resuming from D3cold always go through a power-on reset. Some
225 device context can also be preserved using Vaux power.
226 789
227 * Some systems hide D3cold resume paths from drivers. For example, on PCs 790The poweroff() callback is hibernation-specific. It is executed when the system
228 the resume path for suspend-to-disk often runs BIOS powerup code, which 791is about to be powered off after saving a hibernation image to a persistent
229 will sometimes re-initialize the device. 792storage. prepare() callbacks are executed for all devices before poweroff() is
793called.
230 794
231To handle resets during D3 to D0 transitions, it may be convenient to share 795The role of this callback is analogous to the role of the suspend() and freeze()
232device initialization code between probe() and resume(). Device parameters 796callbacks described above, although it does not need to save the contents of
233can also be saved before the driver suspends into D3, avoiding re-probe. 797the device's registers. In particular, if the driver wants to put the device
798into a low-power state itself instead of allowing the PCI subsystem to do that,
799the poweroff() callback should use pci_prepare_to_sleep() and
800pci_set_power_state() to prepare the device for system wakeup and to put it
801into a low-power state, respectively, but it need not save the device's standard
802configuration registers.
234 803
235If the device supports the PCI PM Spec, it can use this to physically transition 8043.1.7. poweroff_noirq()
236the device to D0:
237 805
238pci_set_power_state(dev,0); 806The poweroff_noirq() callback is hibernation-specific. It is executed after
807poweroff() callbacks have been executed for all devices in the system.
239 808
240Note that if the entire system is transitioning out of a global sleep state, all 809The role of this callback is analogous to the role of the suspend_noirq() and
241devices will be placed in the D0 state, so this is not necessary. However, in 810freeze_noirq() callbacks described above, but it does not need to save the
242the event that the device is placed in the D3 state during normal operation, 811contents of the device's registers.
243this call is necessary. It is impossible to determine which of the two events is
244taking place in the driver, so it is always a good idea to make that call.
245 812
246The driver should take note of the state that it is resuming from in order to 813The difference between poweroff_noirq() and poweroff() is analogous to the
247ensure correct (and speedy) operation. 814difference between suspend_noirq() and suspend().
248 815
249The driver should update the current_state field in its pci_dev structure in 8163.1.8. resume_noirq()
250this function, except for PM-capable devices when pci_set_power_state is used.
251 817
818The resume_noirq() callback is only executed during system resume, after the
819PM core has enabled the non-boot CPUs. The driver's interrupt handler will not
820be invoked while resume_noirq() is running, so this callback can carry out
821operations that might race with the interrupt handler.
252 822
823Since the PCI subsystem unconditionally puts all devices into the full power
824state in the resume_noirq phase of system resume and restores their standard
825configuration registers, resume_noirq() is usually not necessary. In general
826it should only be used for performing operations that would lead to race
827conditions if carried out by resume().
253 828
254A reference implementation 8293.1.9. resume()
255-------------------------
256.suspend()
257{
258 /* driver specific operations */
259 830
260 /* Disable IRQ */ 831The resume() callback is only executed during system resume, after
261 free_irq(); 832resume_noirq() callbacks have been executed for all devices in the system and
262 /* If using MSI */ 833device interrupts have been enabled by the PM core.
263 pci_disable_msi();
264 834
265 pci_save_state(); 835This callback is responsible for restoring the pre-suspend configuration of the
266 pci_enable_wake(); 836device and bringing it back to the fully functional state. The device should be
267 /* Disable IO/bus master/irq router */ 837able to process I/O in a usual way after resume() has returned.
268 pci_disable_device();
269 pci_set_power_state(pci_choose_state());
270}
271 838
272.resume() 8393.1.10. thaw_noirq()
273{
274 pci_set_power_state(PCI_D0);
275 pci_restore_state();
276 /* device's irq possibly is changed, driver should take care */
277 pci_enable_device();
278 pci_set_master();
279 840
280 /* if using MSI, device's vector possibly is changed */ 841The thaw_noirq() callback is hibernation-specific. It is executed after a
281 pci_enable_msi(); 842system image has been created and the non-boot CPUs have been enabled by the PM
843core, in the thaw_noirq phase of hibernation. It also may be executed if the
844loading of a hibernation image fails during system restore (it is then executed
845after enabling the non-boot CPUs). The driver's interrupt handler will not be
846invoked while thaw_noirq() is running.
282 847
283 request_irq(); 848The role of this callback is analogous to the role of resume_noirq(). The
284 /* driver specific operations; */ 849difference between these two callbacks is that thaw_noirq() is executed after
285} 850freeze() and freeze_noirq(), so in general it does not need to modify the
851contents of the device's registers.
286 852
287This is a typical implementation. Drivers can slightly change the order 8533.1.11. thaw()
288of the operations in the implementation, ignore some operations or add
289more driver specific operations in it, but drivers should do something like
290this on the whole.
291 854
2925. Resources 855The thaw() callback is hibernation-specific. It is executed after thaw_noirq()
293~~~~~~~~~~~~ 856callbacks have been executed for all devices in the system and after device
857interrupts have been enabled by the PM core.
294 858
295PCI Local Bus Specification 859This callback is responsible for restoring the pre-freeze configuration of
296PCI Bus Power Management Interface Specification 860the device, so that it will work in a usual way after thaw() has returned.
297 861
298 http://www.pcisig.com 8623.1.12. restore_noirq()
299 863
864The restore_noirq() callback is hibernation-specific. It is executed in the
865restore_noirq phase of hibernation, when the boot kernel has passed control to
866the image kernel and the non-boot CPUs have been enabled by the image kernel's
867PM core.
868
869This callback is analogous to resume_noirq() with the exception that it cannot
870make any assumption on the previous state of the device, even if the BIOS (or
871generally the platform firmware) is known to preserve that state over a
872suspend-resume cycle.
873
874For the vast majority of PCI device drivers there is no difference between
875resume_noirq() and restore_noirq().
876
8773.1.13. restore()
878
879The restore() callback is hibernation-specific. It is executed after
880restore_noirq() callbacks have been executed for all devices in the system and
881after the PM core has enabled device drivers' interrupt handlers to be invoked.
882
883This callback is analogous to resume(), just like restore_noirq() is analogous
884to resume_noirq(). Consequently, the difference between restore_noirq() and
885restore() is analogous to the difference between resume_noirq() and resume().
886
887For the vast majority of PCI device drivers there is no difference between
888resume() and restore().
889
8903.1.14. complete()
891
892The complete() callback is executed in the following situations:
893 - during system resume, after resume() callbacks have been executed for all
894 devices,
895 - during hibernation, before saving the system image, after thaw() callbacks
896 have been executed for all devices,
897 - during system restore, when the system is going back to its pre-hibernation
898 state, after restore() callbacks have been executed for all devices.
899It also may be executed if the loading of a hibernation image into memory fails
900(in that case it is run after thaw() callbacks have been executed for all
901devices that have drivers in the boot kernel).
902
903This callback is entirely optional, although it may be necessary if the
904prepare() callback performs operations that need to be reversed.
905
9063.1.15. runtime_suspend()
907
908The runtime_suspend() callback is specific to device runtime power management
909(runtime PM). It is executed by the PM core's runtime PM framework when the
910device is about to be suspended (i.e. quiesced and put into a low-power state)
911at run time.
912
913This callback is responsible for freezing the device and preparing it to be
914put into a low-power state, but it must allow the PCI subsystem to perform all
915of the PCI-specific actions necessary for suspending the device.
916
9173.1.16. runtime_resume()
918
919The runtime_resume() callback is specific to device runtime PM. It is executed
920by the PM core's runtime PM framework when the device is about to be resumed
921(i.e. put into the full-power state and programmed to process I/O normally) at
922run time.
923
924This callback is responsible for restoring the normal functionality of the
925device after it has been put into the full-power state by the PCI subsystem.
926The device is expected to be able to process I/O in the usual way after
927runtime_resume() has returned.
928
9293.1.17. runtime_idle()
930
931The runtime_idle() callback is specific to device runtime PM. It is executed
932by the PM core's runtime PM framework whenever it may be desirable to suspend
933the device according to the PM core's information. In particular, it is
934automatically executed right after runtime_resume() has returned in case the
935resume of the device has happened as a result of a spurious event.
936
937This callback is optional, but if it is not implemented or if it returns 0, the
938PCI subsystem will call pm_runtime_suspend() for the device, which in turn will
939cause the driver's runtime_suspend() callback to be executed.
940
9413.1.18. Pointing Multiple Callback Pointers to One Routine
942
943Although in principle each of the callbacks described in the previous
944subsections can be defined as a separate function, it often is convenient to
945point two or more members of struct dev_pm_ops to the same routine. There are
946a few convenience macros that can be used for this purpose.
947
948The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one
949suspend routine pointed to by the .suspend(), .freeze(), and .poweroff()
950members and one resume routine pointed to by the .resume(), .thaw(), and
951.restore() members. The other function pointers in this struct dev_pm_ops are
952unset.
953
954The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it
955additionally sets the .runtime_resume() pointer to the same value as
956.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to
957the same value as .suspend() (and .freeze() and .poweroff()).
958
959The SET_SYSTEM_SLEEP_PM_OPS can be used inside of a declaration of struct
960dev_pm_ops to indicate that one suspend routine is to be pointed to by the
961.suspend(), .freeze(), and .poweroff() members and one resume routine is to
962be pointed to by the .resume(), .thaw(), and .restore() members.
963
9643.2. Device Runtime Power Management
965------------------------------------
966In addition to providing device power management callbacks PCI device drivers
967are responsible for controlling the runtime power management (runtime PM) of
968their devices.
969
970The PCI device runtime PM is optional, but it is recommended that PCI device
971drivers implement it at least in the cases where there is a reliable way of
972verifying that the device is not used (like when the network cable is detached
973from an Ethernet adapter or there are no devices attached to a USB controller).
974
975To support the PCI runtime PM the driver first needs to implement the
976runtime_suspend() and runtime_resume() callbacks. It also may need to implement
977the runtime_idle() callback to prevent the device from being suspended again
978every time right after the runtime_resume() callback has returned
979(alternatively, the runtime_suspend() callback will have to check if the
980device should really be suspended and return -EAGAIN if that is not the case).
981
982The runtime PM of PCI devices is disabled by default. It is also blocked by
983pci_pm_init() that runs the pm_runtime_forbid() helper function. If a PCI
984driver implements the runtime PM callbacks and intends to use the runtime PM
985framework provided by the PM core and the PCI subsystem, it should enable this
986feature by executing the pm_runtime_enable() helper function. However, the
987driver should not call the pm_runtime_allow() helper function unblocking
988the runtime PM of the device. Instead, it should allow user space or some
989platform-specific code to do that (user space can do it via sysfs), although
990once it has called pm_runtime_enable(), it must be prepared to handle the
991runtime PM of the device correctly as soon as pm_runtime_allow() is called
992(which may happen at any time). [It also is possible that user space causes
993pm_runtime_allow() to be called via sysfs before the driver is loaded, so in
994fact the driver has to be prepared to handle the runtime PM of the device as
995soon as it calls pm_runtime_enable().]
996
997The runtime PM framework works by processing requests to suspend or resume
998devices, or to check if they are idle (in which cases it is reasonable to
999subsequently request that they be suspended). These requests are represented
1000by work items put into the power management workqueue, pm_wq. Although there
1001are a few situations in which power management requests are automatically
1002queued by the PM core (for example, after processing a request to resume a
1003device the PM core automatically queues a request to check if the device is
1004idle), device drivers are generally responsible for queuing power management
1005requests for their devices. For this purpose they should use the runtime PM
1006helper functions provided by the PM core, discussed in
1007Documentation/power/runtime_pm.txt.
1008
1009Devices can also be suspended and resumed synchronously, without placing a
1010request into pm_wq. In the majority of cases this also is done by their
1011drivers that use helper functions provided by the PM core for this purpose.
1012
1013For more information on the runtime PM of devices refer to
1014Documentation/power/runtime_pm.txt.
1015
1016
10174. Resources
1018============
1019
1020PCI Local Bus Specification, Rev. 3.0
1021PCI Bus Power Management Interface Specification, Rev. 1.2
1022Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b
1023PCI Express Base Specification, Rev. 2.0
1024Documentation/power/devices.txt
1025Documentation/power/runtime_pm.txt
diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
index c40866e8b957..bfed898a03fc 100644
--- a/Documentation/power/pm_qos_interface.txt
+++ b/Documentation/power/pm_qos_interface.txt
@@ -18,44 +18,46 @@ and pm_qos_params.h. This is done because having the available parameters
18being runtime configurable or changeable from a driver was seen as too easy to 18being runtime configurable or changeable from a driver was seen as too easy to
19abuse. 19abuse.
20 20
21For each parameter a list of performance requirements is maintained along with 21For each parameter a list of performance requests is maintained along with
22an aggregated target value. The aggregated target value is updated with 22an aggregated target value. The aggregated target value is updated with
23changes to the requirement list or elements of the list. Typically the 23changes to the request list or elements of the list. Typically the
24aggregated target value is simply the max or min of the requirement values held 24aggregated target value is simply the max or min of the request values held
25in the parameter list elements. 25in the parameter list elements.
26 26
27From kernel mode the use of this interface is simple: 27From kernel mode the use of this interface is simple:
28pm_qos_add_requirement(param_id, name, target_value):
29Will insert a named element in the list for that identified PM_QOS parameter
30with the target value. Upon change to this list the new target is recomputed
31and any registered notifiers are called only if the target value is now
32different.
33 28
34pm_qos_update_requirement(param_id, name, new_target_value): 29handle = pm_qos_add_request(param_class, target_value):
35Will search the list identified by the param_id for the named list element and 30Will insert an element into the list for that identified PM_QOS class with the
36then update its target value, calling the notification tree if the aggregated 31target value. Upon change to this list the new target is recomputed and any
37target is changed. with that name is already registered. 32registered notifiers are called only if the target value is now different.
33Clients of pm_qos need to save the returned handle.
38 34
39pm_qos_remove_requirement(param_id, name): 35void pm_qos_update_request(handle, new_target_value):
40Will search the identified list for the named element and remove it, after 36Will update the list element pointed to by the handle with the new target value
41removal it will update the aggregate target and call the notification tree if 37and recompute the new aggregated target, calling the notification tree if the
42the target was changed as a result of removing the named requirement. 38target is changed.
39
40void pm_qos_remove_request(handle):
41Will remove the element. After removal it will update the aggregate target and
42call the notification tree if the target was changed as a result of removing
43the request.
43 44
44 45
45From user mode: 46From user mode:
46Only processes can register a pm_qos requirement. To provide for automatic 47Only processes can register a pm_qos request. To provide for automatic
47cleanup for process the interface requires the process to register its 48cleanup of a process, the interface requires the process to register its
48parameter requirements in the following way: 49parameter requests in the following way:
49 50
50To register the default pm_qos target for the specific parameter, the process 51To register the default pm_qos target for the specific parameter, the process
51must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] 52must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
52 53
53As long as the device node is held open that process has a registered 54As long as the device node is held open that process has a registered
54requirement on the parameter. The name of the requirement is "process_<PID>" 55request on the parameter.
55derived from the current->pid from within the open system call.
56 56
57To change the requested target value the process needs to write a s32 value to 57To change the requested target value the process needs to write an s32 value to
58the open device node. This translates to a pm_qos_update_requirement call. 58the open device node. Alternatively the user mode program could write a hex
59string for the value using 10 char long format e.g. "0x12345678". This
60translates to a pm_qos_update_request call.
59 61
60To remove the user mode request for a target value simply close the device 62To remove the user mode request for a target value simply close the device
61node. 63node.
diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt
index cdebb5145c25..55c4175d8099 100644
--- a/Documentation/power/regulator/consumer.txt
+++ b/Documentation/power/regulator/consumer.txt
@@ -8,11 +8,11 @@ Please see overview.txt for a description of the terms used in this text.
81. Consumer Regulator Access (static & dynamic drivers) 81. Consumer Regulator Access (static & dynamic drivers)
9======================================================= 9=======================================================
10 10
11A consumer driver can get access to it's supply regulator by calling :- 11A consumer driver can get access to its supply regulator by calling :-
12 12
13regulator = regulator_get(dev, "Vcc"); 13regulator = regulator_get(dev, "Vcc");
14 14
15The consumer passes in it's struct device pointer and power supply ID. The core 15The consumer passes in its struct device pointer and power supply ID. The core
16then finds the correct regulator by consulting a machine specific lookup table. 16then finds the correct regulator by consulting a machine specific lookup table.
17If the lookup is successful then this call will return a pointer to the struct 17If the lookup is successful then this call will return a pointer to the struct
18regulator that supplies this consumer. 18regulator that supplies this consumer.
@@ -34,7 +34,7 @@ usually be called in your device drivers probe() and remove() respectively.
342. Regulator Output Enable & Disable (static & dynamic drivers) 342. Regulator Output Enable & Disable (static & dynamic drivers)
35==================================================================== 35====================================================================
36 36
37A consumer can enable it's power supply by calling:- 37A consumer can enable its power supply by calling:-
38 38
39int regulator_enable(regulator); 39int regulator_enable(regulator);
40 40
@@ -49,7 +49,7 @@ int regulator_is_enabled(regulator);
49This will return > zero when the regulator is enabled. 49This will return > zero when the regulator is enabled.
50 50
51 51
52A consumer can disable it's supply when no longer needed by calling :- 52A consumer can disable its supply when no longer needed by calling :-
53 53
54int regulator_disable(regulator); 54int regulator_disable(regulator);
55 55
@@ -140,7 +140,7 @@ by calling :-
140int regulator_set_optimum_mode(struct regulator *regulator, int load_uA); 140int regulator_set_optimum_mode(struct regulator *regulator, int load_uA);
141 141
142This will cause the core to recalculate the total load on the regulator (based 142This will cause the core to recalculate the total load on the regulator (based
143on all it's consumers) and change operating mode (if necessary and permitted) 143on all its consumers) and change operating mode (if necessary and permitted)
144to best match the current operating load. 144to best match the current operating load.
145 145
146The load_uA value can be determined from the consumers datasheet. e.g.most 146The load_uA value can be determined from the consumers datasheet. e.g.most
diff --git a/Documentation/power/regulator/machine.txt b/Documentation/power/regulator/machine.txt
index 63728fed620b..bdec39b9bd75 100644
--- a/Documentation/power/regulator/machine.txt
+++ b/Documentation/power/regulator/machine.txt
@@ -52,7 +52,7 @@ static struct regulator_init_data regulator1_data = {
52}; 52};
53 53
54Regulator-1 supplies power to Regulator-2. This relationship must be registered 54Regulator-1 supplies power to Regulator-2. This relationship must be registered
55with the core so that Regulator-1 is also enabled when Consumer A enables it's 55with the core so that Regulator-1 is also enabled when Consumer A enables its
56supply (Regulator-2). The supply regulator is set by the supply_regulator_dev 56supply (Regulator-2). The supply regulator is set by the supply_regulator_dev
57field below:- 57field below:-
58 58
diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt
index ffd185bb6054..9363e056188a 100644
--- a/Documentation/power/regulator/overview.txt
+++ b/Documentation/power/regulator/overview.txt
@@ -35,16 +35,16 @@ Some terms used in this document:-
35 o Consumer - Electronic device that is supplied power by a regulator. 35 o Consumer - Electronic device that is supplied power by a regulator.
36 Consumers can be classified into two types:- 36 Consumers can be classified into two types:-
37 37
38 Static: consumer does not change it's supply voltage or 38 Static: consumer does not change its supply voltage or
39 current limit. It only needs to enable or disable it's 39 current limit. It only needs to enable or disable it's
40 power supply. It's supply voltage is set by the hardware, 40 power supply. Its supply voltage is set by the hardware,
41 bootloader, firmware or kernel board initialisation code. 41 bootloader, firmware or kernel board initialisation code.
42 42
43 Dynamic: consumer needs to change it's supply voltage or 43 Dynamic: consumer needs to change it's supply voltage or
44 current limit to meet operation demands. 44 current limit to meet operation demands.
45 45
46 46
47 o Power Domain - Electronic circuit that is supplied it's input power by the 47 o Power Domain - Electronic circuit that is supplied its input power by the
48 output power of a regulator, switch or by another power 48 output power of a regulator, switch or by another power
49 domain. 49 domain.
50 50
diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
index 356fd86f4ea8..55b859b3bc72 100644
--- a/Documentation/power/runtime_pm.txt
+++ b/Documentation/power/runtime_pm.txt
@@ -224,6 +224,12 @@ defined in include/linux/pm.h:
224 RPM_SUSPENDED, which means that each device is initially regarded by the 224 RPM_SUSPENDED, which means that each device is initially regarded by the
225 PM core as 'suspended', regardless of its real hardware status 225 PM core as 'suspended', regardless of its real hardware status
226 226
227 unsigned int runtime_auto;
228 - if set, indicates that the user space has allowed the device driver to
229 power manage the device at run time via the /sys/devices/.../power/control
230 interface; it may only be modified with the help of the pm_runtime_allow()
231 and pm_runtime_forbid() helper functions
232
227All of the above fields are members of the 'power' member of 'struct device'. 233All of the above fields are members of the 'power' member of 'struct device'.
228 234
2294. Run-time PM Device Helper Functions 2354. Run-time PM Device Helper Functions
@@ -250,7 +256,7 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
250 to suspend the device again in future 256 to suspend the device again in future
251 257
252 int pm_runtime_resume(struct device *dev); 258 int pm_runtime_resume(struct device *dev);
253 - execute the subsystem-leve resume callback for the device; returns 0 on 259 - execute the subsystem-level resume callback for the device; returns 0 on
254 success, 1 if the device's run-time PM status was already 'active' or 260 success, 1 if the device's run-time PM status was already 'active' or
255 error code on failure, where -EAGAIN means it may be safe to attempt to 261 error code on failure, where -EAGAIN means it may be safe to attempt to
256 resume the device again in future, but 'power.runtime_error' should be 262 resume the device again in future, but 'power.runtime_error' should be
@@ -329,6 +335,20 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
329 'power.runtime_error' is set or 'power.disable_depth' is greater than 335 'power.runtime_error' is set or 'power.disable_depth' is greater than
330 zero) 336 zero)
331 337
338 bool pm_runtime_suspended(struct device *dev);
339 - return true if the device's runtime PM status is 'suspended', or false
340 otherwise
341
342 void pm_runtime_allow(struct device *dev);
343 - set the power.runtime_auto flag for the device and decrease its usage
344 counter (used by the /sys/devices/.../power/control interface to
345 effectively allow the device to be power managed at run time)
346
347 void pm_runtime_forbid(struct device *dev);
348 - unset the power.runtime_auto flag for the device and increase its usage
349 counter (used by the /sys/devices/.../power/control interface to
350 effectively prevent the device from being power managed at run time)
351
332It is safe to execute the following helper functions from interrupt context: 352It is safe to execute the following helper functions from interrupt context:
333 353
334pm_request_idle() 354pm_request_idle()
@@ -382,6 +402,18 @@ may be desirable to suspend the device as soon as ->probe() or ->remove() has
382finished, so the PM core uses pm_runtime_idle_sync() to invoke the 402finished, so the PM core uses pm_runtime_idle_sync() to invoke the
383subsystem-level idle callback for the device at that time. 403subsystem-level idle callback for the device at that time.
384 404
405The user space can effectively disallow the driver of the device to power manage
406it at run time by changing the value of its /sys/devices/.../power/control
407attribute to "on", which causes pm_runtime_forbid() to be called. In principle,
408this mechanism may also be used by the driver to effectively turn off the
409run-time power management of the device until the user space turns it on.
410Namely, during the initialization the driver can make sure that the run-time PM
411status of the device is 'active' and call pm_runtime_forbid(). It should be
412noted, however, that if the user space has already intentionally changed the
413value of /sys/devices/.../power/control to "auto" to allow the driver to power
414manage the device at run time, the driver may confuse it by using
415pm_runtime_forbid() this way.
416
3856. Run-time PM and System Sleep 4176. Run-time PM and System Sleep
386 418
387Run-time PM and system sleep (i.e., system suspend and hibernation, also known 419Run-time PM and system sleep (i.e., system suspend and hibernation, also known
@@ -431,3 +463,64 @@ The PM core always increments the run-time usage counter before calling the
431->prepare() callback and decrements it after calling the ->complete() callback. 463->prepare() callback and decrements it after calling the ->complete() callback.
432Hence disabling run-time PM temporarily like this will not cause any run-time 464Hence disabling run-time PM temporarily like this will not cause any run-time
433suspend callbacks to be lost. 465suspend callbacks to be lost.
466
4677. Generic subsystem callbacks
468
469Subsystems may wish to conserve code space by using the set of generic power
470management callbacks provided by the PM core, defined in
471driver/base/power/generic_ops.c:
472
473 int pm_generic_runtime_idle(struct device *dev);
474 - invoke the ->runtime_idle() callback provided by the driver of this
475 device, if defined, and call pm_runtime_suspend() for this device if the
476 return value is 0 or the callback is not defined
477
478 int pm_generic_runtime_suspend(struct device *dev);
479 - invoke the ->runtime_suspend() callback provided by the driver of this
480 device and return its result, or return -EINVAL if not defined
481
482 int pm_generic_runtime_resume(struct device *dev);
483 - invoke the ->runtime_resume() callback provided by the driver of this
484 device and return its result, or return -EINVAL if not defined
485
486 int pm_generic_suspend(struct device *dev);
487 - if the device has not been suspended at run time, invoke the ->suspend()
488 callback provided by its driver and return its result, or return 0 if not
489 defined
490
491 int pm_generic_resume(struct device *dev);
492 - invoke the ->resume() callback provided by the driver of this device and,
493 if successful, change the device's runtime PM status to 'active'
494
495 int pm_generic_freeze(struct device *dev);
496 - if the device has not been suspended at run time, invoke the ->freeze()
497 callback provided by its driver and return its result, or return 0 if not
498 defined
499
500 int pm_generic_thaw(struct device *dev);
501 - if the device has not been suspended at run time, invoke the ->thaw()
502 callback provided by its driver and return its result, or return 0 if not
503 defined
504
505 int pm_generic_poweroff(struct device *dev);
506 - if the device has not been suspended at run time, invoke the ->poweroff()
507 callback provided by its driver and return its result, or return 0 if not
508 defined
509
510 int pm_generic_restore(struct device *dev);
511 - invoke the ->restore() callback provided by the driver of this device and,
512 if successful, change the device's runtime PM status to 'active'
513
514These functions can be assigned to the ->runtime_idle(), ->runtime_suspend(),
515->runtime_resume(), ->suspend(), ->resume(), ->freeze(), ->thaw(), ->poweroff(),
516or ->restore() callback pointers in the subsystem-level dev_pm_ops structures.
517
518If a subsystem wishes to use all of them at the same time, it can simply assign
519the GENERIC_SUBSYS_PM_OPS macro, defined in include/linux/pm.h, to its
520dev_pm_ops structure pointer.
521
522Device drivers that wish to use the same function as a system suspend, freeze,
523poweroff and run-time suspend callback, and similarly for system resume, thaw,
524restore, and run-time resume, can achieve this with the help of the
525UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its
526last argument to NULL).
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
index b967cd9137d6..81680f9f5909 100644
--- a/Documentation/power/userland-swsusp.txt
+++ b/Documentation/power/userland-swsusp.txt
@@ -24,6 +24,10 @@ assumed to be in the resume mode. The device cannot be open for simultaneous
24reading and writing. It is also impossible to have the device open more than 24reading and writing. It is also impossible to have the device open more than
25once at a time. 25once at a time.
26 26
27Even opening the device has side effects. Data structures are
28allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are
29called.
30
27The ioctl() commands recognized by the device are: 31The ioctl() commands recognized by the device are:
28 32
29SNAPSHOT_FREEZE - freeze user space processes (the current process is 33SNAPSHOT_FREEZE - freeze user space processes (the current process is
diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt
index 79f533f38c61..46d22105aa07 100644
--- a/Documentation/powerpc/booting-without-of.txt
+++ b/Documentation/powerpc/booting-without-of.txt
@@ -1289,7 +1289,7 @@ link between a device node and its interrupt parent in
1289the interrupt tree. The value of interrupt-parent is the 1289the interrupt tree. The value of interrupt-parent is the
1290phandle of the parent node. 1290phandle of the parent node.
1291 1291
1292If the interrupt-parent property is not defined for a node, it's 1292If the interrupt-parent property is not defined for a node, its
1293interrupt parent is assumed to be an ancestor in the node's 1293interrupt parent is assumed to be an ancestor in the node's
1294_device tree_ hierarchy. 1294_device tree_ hierarchy.
1295 1295
diff --git a/Documentation/powerpc/dts-bindings/4xx/reboot.txt b/Documentation/powerpc/dts-bindings/4xx/reboot.txt
new file mode 100644
index 000000000000..d7217260589c
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/4xx/reboot.txt
@@ -0,0 +1,18 @@
1Reboot property to control system reboot on PPC4xx systems:
2
3By setting "reset_type" to one of the following values, the default
4software reset mechanism may be overidden. Here the possible values of
5"reset_type":
6
7 1 - PPC4xx core reset
8 2 - PPC4xx chip reset
9 3 - PPC4xx system reset (default)
10
11Example:
12
13 cpu@0 {
14 device_type = "cpu";
15 model = "PowerPC,440SPe";
16 ...
17 reset-type = <2>; /* Use chip-reset */
18 };
diff --git a/Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt b/Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt
index d015dcec4011..b0019eb5330e 100644
--- a/Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/8xxx_gpio.txt
@@ -11,7 +11,7 @@ Required properties:
11 83xx, "fsl,mpc8572-gpio" for 85xx and "fsl,mpc8610-gpio" for 86xx. 11 83xx, "fsl,mpc8572-gpio" for 85xx and "fsl,mpc8610-gpio" for 86xx.
12- #gpio-cells : Should be two. The first cell is the pin number and the 12- #gpio-cells : Should be two. The first cell is the pin number and the
13 second cell is used to specify optional parameters (currently unused). 13 second cell is used to specify optional parameters (currently unused).
14 - interrupts : Interrupt mapping for GPIO IRQ (currently unused). 14 - interrupts : Interrupt mapping for GPIO IRQ.
15 - interrupt-parent : Phandle for the interrupt controller that 15 - interrupt-parent : Phandle for the interrupt controller that
16 services interrupts for this device. 16 services interrupts for this device.
17- gpio-controller : Marks the port as GPIO controller. 17- gpio-controller : Marks the port as GPIO controller.
@@ -38,3 +38,23 @@ Example of gpio-controller nodes for a MPC8347 SoC:
38 38
39See booting-without-of.txt for details of how to specify GPIO 39See booting-without-of.txt for details of how to specify GPIO
40information for devices. 40information for devices.
41
42To use GPIO pins as interrupt sources for peripherals, specify the
43GPIO controller as the interrupt parent and define GPIO number +
44trigger mode using the interrupts property, which is defined like
45this:
46
47interrupts = <number trigger>, where:
48 - number: GPIO pin (0..31)
49 - trigger: trigger mode:
50 2 = trigger on falling edge
51 3 = trigger on both edges
52
53Example of device using this is:
54
55 funkyfpga@0 {
56 compatible = "funky-fpga";
57 ...
58 interrupts = <4 3>;
59 interrupt-parent = <&gpio1>;
60 };
diff --git a/Documentation/powerpc/dts-bindings/fsl/can.txt b/Documentation/powerpc/dts-bindings/fsl/can.txt
new file mode 100644
index 000000000000..2fa4fcd38fd6
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/can.txt
@@ -0,0 +1,53 @@
1CAN Device Tree Bindings
2------------------------
3
4(c) 2006-2009 Secret Lab Technologies Ltd
5Grant Likely <grant.likely@secretlab.ca>
6
7fsl,mpc5200-mscan nodes
8-----------------------
9In addition to the required compatible-, reg- and interrupt-properties, you can
10also specify which clock source shall be used for the controller:
11
12- fsl,mscan-clock-source : a string describing the clock source. Valid values
13 are: "ip" for ip bus clock
14 "ref" for reference clock (XTAL)
15 "ref" is default in case this property is not
16 present.
17
18fsl,mpc5121-mscan nodes
19-----------------------
20In addition to the required compatible-, reg- and interrupt-properties, you can
21also specify which clock source and divider shall be used for the controller:
22
23- fsl,mscan-clock-source : a string describing the clock source. Valid values
24 are: "ip" for ip bus clock
25 "ref" for reference clock
26 "sys" for system clock
27 If this property is not present, an optimal CAN
28 clock source and frequency based on the system
29 clock will be selected. If this is not possible,
30 the reference clock will be used.
31
32- fsl,mscan-clock-divider: for the reference and system clock, an additional
33 clock divider can be specified. By default, a
34 value of 1 is used.
35
36Note that the MPC5121 Rev. 1 processor is not supported.
37
38Examples:
39 can@1300 {
40 compatible = "fsl,mpc5121-mscan";
41 interrupts = <12 0x8>;
42 interrupt-parent = <&ipic>;
43 reg = <0x1300 0x80>;
44 };
45
46 can@1380 {
47 compatible = "fsl,mpc5121-mscan";
48 interrupts = <13 0x8>;
49 interrupt-parent = <&ipic>;
50 reg = <0x1380 0x80>;
51 fsl,mscan-clock-source = "ref";
52 fsl,mscan-clock-divider = <3>;
53 };
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt
index 6e37be1eeb2d..4f8930263dd9 100644
--- a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt
@@ -21,6 +21,15 @@ Required properties:
21- fsl,qe-num-snums: define how many serial number(SNUM) the QE can use for the 21- fsl,qe-num-snums: define how many serial number(SNUM) the QE can use for the
22 threads. 22 threads.
23 23
24Optional properties:
25- fsl,firmware-phandle:
26 Usage: required only if there is no fsl,qe-firmware child node
27 Value type: <phandle>
28 Definition: Points to a firmware node (see "QE Firmware Node" below)
29 that contains the firmware that should be uploaded for this QE.
30 The compatible property for the firmware node should say,
31 "fsl,qe-firmware".
32
24Recommended properties 33Recommended properties
25- brg-frequency : the internal clock source frequency for baud-rate 34- brg-frequency : the internal clock source frequency for baud-rate
26 generators in Hz. 35 generators in Hz.
@@ -59,3 +68,48 @@ Example:
59 reg = <0 c000>; 68 reg = <0 c000>;
60 }; 69 };
61 }; 70 };
71
72* QE Firmware Node
73
74This node defines a firmware binary that is embedded in the device tree, for
75the purpose of passing the firmware from bootloader to the kernel, or from
76the hypervisor to the guest.
77
78The firmware node itself contains the firmware binary contents, a compatible
79property, and any firmware-specific properties. The node should be placed
80inside a QE node that needs it. Doing so eliminates the need for a
81fsl,firmware-phandle property. Other QE nodes that need the same firmware
82should define an fsl,firmware-phandle property that points to the firmware node
83in the first QE node.
84
85The fsl,firmware property can be specified in the DTS (possibly using incbin)
86or can be inserted by the boot loader at boot time.
87
88Required properties:
89 - compatible
90 Usage: required
91 Value type: <string>
92 Definition: A standard property. Specify a string that indicates what
93 kind of firmware it is. For QE, this should be "fsl,qe-firmware".
94
95 - fsl,firmware
96 Usage: required
97 Value type: <prop-encoded-array>, encoded as an array of bytes
98 Definition: A standard property. This property contains the firmware
99 binary "blob".
100
101Example:
102 qe1@e0080000 {
103 compatible = "fsl,qe";
104 qe_firmware:qe-firmware {
105 compatible = "fsl,qe-firmware";
106 fsl,firmware = [0x70 0xcd 0x00 0x00 0x01 0x46 0x45 ...];
107 };
108 ...
109 };
110
111 qe2@e0090000 {
112 compatible = "fsl,qe";
113 fsl,firmware-phandle = <&qe_firmware>;
114 ...
115 };
diff --git a/Documentation/powerpc/dts-bindings/fsl/dma.txt b/Documentation/powerpc/dts-bindings/fsl/dma.txt
index 0732cdd05ba1..2a4b4bce6110 100644
--- a/Documentation/powerpc/dts-bindings/fsl/dma.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/dma.txt
@@ -44,21 +44,29 @@ Example:
44 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; 44 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
45 cell-index = <0>; 45 cell-index = <0>;
46 reg = <0 0x80>; 46 reg = <0 0x80>;
47 interrupt-parent = <&ipic>;
48 interrupts = <71 8>;
47 }; 49 };
48 dma-channel@80 { 50 dma-channel@80 {
49 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; 51 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
50 cell-index = <1>; 52 cell-index = <1>;
51 reg = <0x80 0x80>; 53 reg = <0x80 0x80>;
54 interrupt-parent = <&ipic>;
55 interrupts = <71 8>;
52 }; 56 };
53 dma-channel@100 { 57 dma-channel@100 {
54 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; 58 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
55 cell-index = <2>; 59 cell-index = <2>;
56 reg = <0x100 0x80>; 60 reg = <0x100 0x80>;
61 interrupt-parent = <&ipic>;
62 interrupts = <71 8>;
57 }; 63 };
58 dma-channel@180 { 64 dma-channel@180 {
59 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; 65 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
60 cell-index = <3>; 66 cell-index = <3>;
61 reg = <0x180 0x80>; 67 reg = <0x180 0x80>;
68 interrupt-parent = <&ipic>;
69 interrupts = <71 8>;
62 }; 70 };
63 }; 71 };
64 72
diff --git a/Documentation/powerpc/dts-bindings/fsl/i2c.txt b/Documentation/powerpc/dts-bindings/fsl/i2c.txt
index b6d2e21474f9..50da20310585 100644
--- a/Documentation/powerpc/dts-bindings/fsl/i2c.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/i2c.txt
@@ -2,15 +2,14 @@
2 2
3Required properties : 3Required properties :
4 4
5 - device_type : Should be "i2c"
6 - reg : Offset and length of the register set for the device 5 - reg : Offset and length of the register set for the device
6 - compatible : should be "fsl,CHIP-i2c" where CHIP is the name of a
7 compatible processor, e.g. mpc8313, mpc8543, mpc8544, mpc5121,
8 mpc5200 or mpc5200b. For the mpc5121, an additional node
9 "fsl,mpc5121-i2c-ctrl" is required as shown in the example below.
7 10
8Recommended properties : 11Recommended properties :
9 12
10 - compatible : compatibility list with 2 entries, the first should
11 be "fsl,CHIP-i2c" where CHIP is the name of a compatible processor,
12 e.g. mpc8313, mpc8543, mpc8544, mpc5200 or mpc5200b. The second one
13 should be "fsl-i2c".
14 - interrupts : <a b> where a is the interrupt number and b is a 13 - interrupts : <a b> where a is the interrupt number and b is a
15 field that represents an encoding of the sense and level 14 field that represents an encoding of the sense and level
16 information for the interrupt. This should be encoded based on 15 information for the interrupt. This should be encoded based on
@@ -24,25 +23,40 @@ Recommended properties :
24 23
25Examples : 24Examples :
26 25
26 /* MPC5121 based board */
27 i2c@1740 {
28 #address-cells = <1>;
29 #size-cells = <0>;
30 compatible = "fsl,mpc5121-i2c", "fsl-i2c";
31 reg = <0x1740 0x20>;
32 interrupts = <11 0x8>;
33 interrupt-parent = <&ipic>;
34 clock-frequency = <100000>;
35 };
36
37 i2ccontrol@1760 {
38 compatible = "fsl,mpc5121-i2c-ctrl";
39 reg = <0x1760 0x8>;
40 };
41
42 /* MPC5200B based board */
27 i2c@3d00 { 43 i2c@3d00 {
28 #address-cells = <1>; 44 #address-cells = <1>;
29 #size-cells = <0>; 45 #size-cells = <0>;
30 compatible = "fsl,mpc5200b-i2c","fsl,mpc5200-i2c","fsl-i2c"; 46 compatible = "fsl,mpc5200b-i2c","fsl,mpc5200-i2c","fsl-i2c";
31 cell-index = <0>;
32 reg = <0x3d00 0x40>; 47 reg = <0x3d00 0x40>;
33 interrupts = <2 15 0>; 48 interrupts = <2 15 0>;
34 interrupt-parent = <&mpc5200_pic>; 49 interrupt-parent = <&mpc5200_pic>;
35 fsl,preserve-clocking; 50 fsl,preserve-clocking;
36 }; 51 };
37 52
53 /* MPC8544 base board */
38 i2c@3100 { 54 i2c@3100 {
39 #address-cells = <1>; 55 #address-cells = <1>;
40 #size-cells = <0>; 56 #size-cells = <0>;
41 cell-index = <1>;
42 compatible = "fsl,mpc8544-i2c", "fsl-i2c"; 57 compatible = "fsl,mpc8544-i2c", "fsl-i2c";
43 reg = <0x3100 0x100>; 58 reg = <0x3100 0x100>;
44 interrupts = <43 2>; 59 interrupts = <43 2>;
45 interrupt-parent = <&mpic>; 60 interrupt-parent = <&mpic>;
46 clock-frequency = <400000>; 61 clock-frequency = <400000>;
47 }; 62 };
48
diff --git a/Documentation/powerpc/dts-bindings/fsl/mpc5121-psc.txt b/Documentation/powerpc/dts-bindings/fsl/mpc5121-psc.txt
new file mode 100644
index 000000000000..8832e8798912
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/mpc5121-psc.txt
@@ -0,0 +1,70 @@
1MPC5121 PSC Device Tree Bindings
2
3PSC in UART mode
4----------------
5
6For PSC in UART mode the needed PSC serial devices
7are specified by fsl,mpc5121-psc-uart nodes in the
8fsl,mpc5121-immr SoC node. Additionally the PSC FIFO
9Controller node fsl,mpc5121-psc-fifo is requered there:
10
11fsl,mpc5121-psc-uart nodes
12--------------------------
13
14Required properties :
15 - compatible : Should contain "fsl,mpc5121-psc-uart" and "fsl,mpc5121-psc"
16 - cell-index : Index of the PSC in hardware
17 - reg : Offset and length of the register set for the PSC device
18 - interrupts : <a b> where a is the interrupt number of the
19 PSC FIFO Controller and b is a field that represents an
20 encoding of the sense and level information for the interrupt.
21 - interrupt-parent : the phandle for the interrupt controller that
22 services interrupts for this device.
23
24Recommended properties :
25 - fsl,rx-fifo-size : the size of the RX fifo slice (a multiple of 4)
26 - fsl,tx-fifo-size : the size of the TX fifo slice (a multiple of 4)
27
28
29fsl,mpc5121-psc-fifo node
30-------------------------
31
32Required properties :
33 - compatible : Should be "fsl,mpc5121-psc-fifo"
34 - reg : Offset and length of the register set for the PSC
35 FIFO Controller
36 - interrupts : <a b> where a is the interrupt number of the
37 PSC FIFO Controller and b is a field that represents an
38 encoding of the sense and level information for the interrupt.
39 - interrupt-parent : the phandle for the interrupt controller that
40 services interrupts for this device.
41
42
43Example for a board using PSC0 and PSC1 devices in serial mode:
44
45serial@11000 {
46 compatible = "fsl,mpc5121-psc-uart", "fsl,mpc5121-psc";
47 cell-index = <0>;
48 reg = <0x11000 0x100>;
49 interrupts = <40 0x8>;
50 interrupt-parent = < &ipic >;
51 fsl,rx-fifo-size = <16>;
52 fsl,tx-fifo-size = <16>;
53};
54
55serial@11100 {
56 compatible = "fsl,mpc5121-psc-uart", "fsl,mpc5121-psc";
57 cell-index = <1>;
58 reg = <0x11100 0x100>;
59 interrupts = <40 0x8>;
60 interrupt-parent = < &ipic >;
61 fsl,rx-fifo-size = <16>;
62 fsl,tx-fifo-size = <16>;
63};
64
65pscfifo@11f00 {
66 compatible = "fsl,mpc5121-psc-fifo";
67 reg = <0x11f00 0x100>;
68 interrupts = <40 0x8>;
69 interrupt-parent = < &ipic >;
70};
diff --git a/Documentation/powerpc/dts-bindings/fsl/mpc5200.txt b/Documentation/powerpc/dts-bindings/fsl/mpc5200.txt
index 5c6602dbfdc2..4ccb2cd5df94 100644
--- a/Documentation/powerpc/dts-bindings/fsl/mpc5200.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/mpc5200.txt
@@ -195,11 +195,4 @@ External interrupts:
195 195
196fsl,mpc5200-mscan nodes 196fsl,mpc5200-mscan nodes
197----------------------- 197-----------------------
198In addition to the required compatible-, reg- and interrupt-properites, you can 198See file can.txt in this directory.
199also specify which clock source shall be used for the controller:
200
201- fsl,mscan-clock-source- a string describing the clock source. Valid values
202 are: "ip" for ip bus clock
203 "ref" for reference clock (XTAL)
204 "ref" is default in case this property is not
205 present.
diff --git a/Documentation/powerpc/dts-bindings/fsl/spi.txt b/Documentation/powerpc/dts-bindings/fsl/spi.txt
index e7d9a344c4f4..80510c018eea 100644
--- a/Documentation/powerpc/dts-bindings/fsl/spi.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/spi.txt
@@ -13,6 +13,11 @@ Required properties:
13- interrupt-parent : the phandle for the interrupt controller that 13- interrupt-parent : the phandle for the interrupt controller that
14 services interrupts for this device. 14 services interrupts for this device.
15 15
16Optional properties:
17- gpios : specifies the gpio pins to be used for chipselects.
18 The gpios will be referred to as reg = <index> in the SPI child nodes.
19 If unspecified, a single SPI device without a chip select can be used.
20
16Example: 21Example:
17 spi@4c0 { 22 spi@4c0 {
18 cell-index = <0>; 23 cell-index = <0>;
@@ -21,4 +26,6 @@ Example:
21 interrupts = <82 0>; 26 interrupts = <82 0>;
22 interrupt-parent = <700>; 27 interrupt-parent = <700>;
23 mode = "cpu"; 28 mode = "cpu";
29 gpios = <&gpio 18 1 // device reg=<0>
30 &gpio 19 1>; // device reg=<1>
24 }; 31 };
diff --git a/Documentation/powerpc/dts-bindings/xilinx.txt b/Documentation/powerpc/dts-bindings/xilinx.txt
index ea68046bb9cb..299d0923537b 100644
--- a/Documentation/powerpc/dts-bindings/xilinx.txt
+++ b/Documentation/powerpc/dts-bindings/xilinx.txt
@@ -11,7 +11,7 @@
11 control how the core is synthesized. Historically, the EDK tool would 11 control how the core is synthesized. Historically, the EDK tool would
12 extract the device parameters relevant to device drivers and copy them 12 extract the device parameters relevant to device drivers and copy them
13 into an 'xparameters.h' in the form of #define symbols. This tells the 13 into an 'xparameters.h' in the form of #define symbols. This tells the
14 device drivers how the IP cores are configured, but it requres the kernel 14 device drivers how the IP cores are configured, but it requires the kernel
15 to be recompiled every time the FPGA bitstream is resynthesized. 15 to be recompiled every time the FPGA bitstream is resynthesized.
16 16
17 The new approach is to export the parameters into the device tree and 17 The new approach is to export the parameters into the device tree and
diff --git a/Documentation/powerpc/phyp-assisted-dump.txt b/Documentation/powerpc/phyp-assisted-dump.txt
index c4682b982a2e..ad340205d96a 100644
--- a/Documentation/powerpc/phyp-assisted-dump.txt
+++ b/Documentation/powerpc/phyp-assisted-dump.txt
@@ -19,7 +19,7 @@ dump offers several strong, practical advantages:
19 immediately available to the system for normal use. 19 immediately available to the system for normal use.
20-- After the dump is completed, no further reboots are 20-- After the dump is completed, no further reboots are
21 required; the system will be fully usable, and running 21 required; the system will be fully usable, and running
22 in it's normal, production mode on it normal kernel. 22 in its normal, production mode on its normal kernel.
23 23
24The above can only be accomplished by coordination with, 24The above can only be accomplished by coordination with,
25and assistance from the hypervisor. The procedure is 25and assistance from the hypervisor. The procedure is
diff --git a/Documentation/powerpc/ptrace.txt b/Documentation/powerpc/ptrace.txt
new file mode 100644
index 000000000000..f4a5499b7bc6
--- /dev/null
+++ b/Documentation/powerpc/ptrace.txt
@@ -0,0 +1,134 @@
1GDB intends to support the following hardware debug features of BookE
2processors:
3
44 hardware breakpoints (IAC)
52 hardware watchpoints (read, write and read-write) (DAC)
62 value conditions for the hardware watchpoints (DVC)
7
8For that, we need to extend ptrace so that GDB can query and set these
9resources. Since we're extending, we're trying to create an interface
10that's extendable and that covers both BookE and server processors, so
11that GDB doesn't need to special-case each of them. We added the
12following 3 new ptrace requests.
13
141. PTRACE_PPC_GETHWDEBUGINFO
15
16Query for GDB to discover the hardware debug features. The main info to
17be returned here is the minimum alignment for the hardware watchpoints.
18BookE processors don't have restrictions here, but server processors have
19an 8-byte alignment restriction for hardware watchpoints. We'd like to avoid
20adding special cases to GDB based on what it sees in AUXV.
21
22Since we're at it, we added other useful info that the kernel can return to
23GDB: this query will return the number of hardware breakpoints, hardware
24watchpoints and whether it supports a range of addresses and a condition.
25The query will fill the following structure provided by the requesting process:
26
27struct ppc_debug_info {
28 unit32_t version;
29 unit32_t num_instruction_bps;
30 unit32_t num_data_bps;
31 unit32_t num_condition_regs;
32 unit32_t data_bp_alignment;
33 unit32_t sizeof_condition; /* size of the DVC register */
34 uint64_t features; /* bitmask of the individual flags */
35};
36
37features will have bits indicating whether there is support for:
38
39#define PPC_DEBUG_FEATURE_INSN_BP_RANGE 0x1
40#define PPC_DEBUG_FEATURE_INSN_BP_MASK 0x2
41#define PPC_DEBUG_FEATURE_DATA_BP_RANGE 0x4
42#define PPC_DEBUG_FEATURE_DATA_BP_MASK 0x8
43
442. PTRACE_SETHWDEBUG
45
46Sets a hardware breakpoint or watchpoint, according to the provided structure:
47
48struct ppc_hw_breakpoint {
49 uint32_t version;
50#define PPC_BREAKPOINT_TRIGGER_EXECUTE 0x1
51#define PPC_BREAKPOINT_TRIGGER_READ 0x2
52#define PPC_BREAKPOINT_TRIGGER_WRITE 0x4
53 uint32_t trigger_type; /* only some combinations allowed */
54#define PPC_BREAKPOINT_MODE_EXACT 0x0
55#define PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE 0x1
56#define PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE 0x2
57#define PPC_BREAKPOINT_MODE_MASK 0x3
58 uint32_t addr_mode; /* address match mode */
59
60#define PPC_BREAKPOINT_CONDITION_MODE 0x3
61#define PPC_BREAKPOINT_CONDITION_NONE 0x0
62#define PPC_BREAKPOINT_CONDITION_AND 0x1
63#define PPC_BREAKPOINT_CONDITION_EXACT 0x1 /* different name for the same thing as above */
64#define PPC_BREAKPOINT_CONDITION_OR 0x2
65#define PPC_BREAKPOINT_CONDITION_AND_OR 0x3
66#define PPC_BREAKPOINT_CONDITION_BE_ALL 0x00ff0000 /* byte enable bits */
67#define PPC_BREAKPOINT_CONDITION_BE(n) (1<<((n)+16))
68 uint32_t condition_mode; /* break/watchpoint condition flags */
69
70 uint64_t addr;
71 uint64_t addr2;
72 uint64_t condition_value;
73};
74
75A request specifies one event, not necessarily just one register to be set.
76For instance, if the request is for a watchpoint with a condition, both the
77DAC and DVC registers will be set in the same request.
78
79With this GDB can ask for all kinds of hardware breakpoints and watchpoints
80that the BookE supports. COMEFROM breakpoints available in server processors
81are not contemplated, but that is out of the scope of this work.
82
83ptrace will return an integer (handle) uniquely identifying the breakpoint or
84watchpoint just created. This integer will be used in the PTRACE_DELHWDEBUG
85request to ask for its removal. Return -ENOSPC if the requested breakpoint
86can't be allocated on the registers.
87
88Some examples of using the structure to:
89
90- set a breakpoint in the first breakpoint register
91
92 p.version = PPC_DEBUG_CURRENT_VERSION;
93 p.trigger_type = PPC_BREAKPOINT_TRIGGER_EXECUTE;
94 p.addr_mode = PPC_BREAKPOINT_MODE_EXACT;
95 p.condition_mode = PPC_BREAKPOINT_CONDITION_NONE;
96 p.addr = (uint64_t) address;
97 p.addr2 = 0;
98 p.condition_value = 0;
99
100- set a watchpoint which triggers on reads in the second watchpoint register
101
102 p.version = PPC_DEBUG_CURRENT_VERSION;
103 p.trigger_type = PPC_BREAKPOINT_TRIGGER_READ;
104 p.addr_mode = PPC_BREAKPOINT_MODE_EXACT;
105 p.condition_mode = PPC_BREAKPOINT_CONDITION_NONE;
106 p.addr = (uint64_t) address;
107 p.addr2 = 0;
108 p.condition_value = 0;
109
110- set a watchpoint which triggers only with a specific value
111
112 p.version = PPC_DEBUG_CURRENT_VERSION;
113 p.trigger_type = PPC_BREAKPOINT_TRIGGER_READ;
114 p.addr_mode = PPC_BREAKPOINT_MODE_EXACT;
115 p.condition_mode = PPC_BREAKPOINT_CONDITION_AND | PPC_BREAKPOINT_CONDITION_BE_ALL;
116 p.addr = (uint64_t) address;
117 p.addr2 = 0;
118 p.condition_value = (uint64_t) condition;
119
120- set a ranged hardware breakpoint
121
122 p.version = PPC_DEBUG_CURRENT_VERSION;
123 p.trigger_type = PPC_BREAKPOINT_TRIGGER_EXECUTE;
124 p.addr_mode = PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE;
125 p.condition_mode = PPC_BREAKPOINT_CONDITION_NONE;
126 p.addr = (uint64_t) begin_range;
127 p.addr2 = (uint64_t) end_range;
128 p.condition_value = 0;
129
1303. PTRACE_DELHWDEBUG
131
132Takes an integer which identifies an existing breakpoint or watchpoint
133(i.e., the value returned from PTRACE_SETHWDEBUG), and deletes the
134corresponding breakpoint or watchpoint..
diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt
index aae8355d3166..221f38be98f4 100644
--- a/Documentation/rbtree.txt
+++ b/Documentation/rbtree.txt
@@ -190,3 +190,61 @@ Example:
190 for (node = rb_first(&mytree); node; node = rb_next(node)) 190 for (node = rb_first(&mytree); node; node = rb_next(node))
191 printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); 191 printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring);
192 192
193Support for Augmented rbtrees
194-----------------------------
195
196Augmented rbtree is an rbtree with "some" additional data stored in each node.
197This data can be used to augment some new functionality to rbtree.
198Augmented rbtree is an optional feature built on top of basic rbtree
199infrastructure. rbtree user who wants this feature will have an augment
200callback function in rb_root initialized.
201
202This callback function will be called from rbtree core routines whenever
203a node has a change in one or both of its children. It is the responsibility
204of the callback function to recalculate the additional data that is in the
205rb node using new children information. Note that if this new additional
206data affects the parent node's additional data, then callback function has
207to handle it and do the recursive updates.
208
209
210Interval tree is an example of augmented rb tree. Reference -
211"Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein.
212More details about interval trees:
213
214Classical rbtree has a single key and it cannot be directly used to store
215interval ranges like [lo:hi] and do a quick lookup for any overlap with a new
216lo:hi or to find whether there is an exact match for a new lo:hi.
217
218However, rbtree can be augmented to store such interval ranges in a structured
219way making it possible to do efficient lookup and exact match.
220
221This "extra information" stored in each node is the maximum hi
222(max_hi) value among all the nodes that are its descendents. This
223information can be maintained at each node just be looking at the node
224and its immediate children. And this will be used in O(log n) lookup
225for lowest match (lowest start address among all possible matches)
226with something like:
227
228find_lowest_match(lo, hi, node)
229{
230 lowest_match = NULL;
231 while (node) {
232 if (max_hi(node->left) > lo) {
233 // Lowest overlap if any must be on left side
234 node = node->left;
235 } else if (overlap(lo, hi, node)) {
236 lowest_match = node;
237 break;
238 } else if (lo > node->lo) {
239 // Lowest overlap if any must be on right side
240 node = node->right;
241 } else {
242 break;
243 }
244 }
245 return lowest_match;
246}
247
248Finding exact match will be to first find lowest match and then to follow
249successor nodes looking for exact match, until the start of a node is beyond
250the hi value we are looking for.
diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt
index b4860509c319..83668e5dd17f 100644
--- a/Documentation/rfkill.txt
+++ b/Documentation/rfkill.txt
@@ -99,37 +99,15 @@ system. Also, it is possible to switch all rfkill drivers (or all drivers of
99a specified type) into a state which also updates the default state for 99a specified type) into a state which also updates the default state for
100hotplugged devices. 100hotplugged devices.
101 101
102After an application opens /dev/rfkill, it can read the current state of 102After an application opens /dev/rfkill, it can read the current state of all
103all devices, and afterwards can poll the descriptor for hotplug or state 103devices. Changes can be either obtained by either polling the descriptor for
104change events. 104hotplug or state change events or by listening for uevents emitted by the
105 105rfkill core framework.
106Applications must ignore operations (the "op" field) they do not handle, 106
107this allows the API to be extended in the future. 107Additionally, each rfkill device is registered in sysfs and emits uevents.
108 108
109Additionally, each rfkill device is registered in sysfs and there has the 109rfkill devices issue uevents (with an action of "change"), with the following
110following attributes: 110environment variables set:
111
112 name: Name assigned by driver to this key (interface or driver name).
113 type: Driver type string ("wlan", "bluetooth", etc).
114 persistent: Whether the soft blocked state is initialised from
115 non-volatile storage at startup.
116 state: Current state of the transmitter
117 0: RFKILL_STATE_SOFT_BLOCKED
118 transmitter is turned off by software
119 1: RFKILL_STATE_UNBLOCKED
120 transmitter is (potentially) active
121 2: RFKILL_STATE_HARD_BLOCKED
122 transmitter is forced off by something outside of
123 the driver's control.
124 This file is deprecated because it can only properly show
125 three of the four possible states, soft-and-hard-blocked is
126 missing.
127 claim: 0: Kernel handles events
128 This file is deprecated because there no longer is a way to
129 claim just control over a single rfkill instance.
130
131rfkill devices also issue uevents (with an action of "change"), with the
132following environment variables set:
133 111
134RFKILL_NAME 112RFKILL_NAME
135RFKILL_STATE 113RFKILL_STATE
@@ -137,3 +115,7 @@ RFKILL_TYPE
137 115
138The contents of these variables corresponds to the "name", "state" and 116The contents of these variables corresponds to the "name", "state" and
139"type" sysfs files explained above. 117"type" sysfs files explained above.
118
119
120For further details consult Documentation/ABI/stable/dev-rfkill and
121Documentation/ABI/stable/sysfs-class-rfkill.
diff --git a/Documentation/rt-mutex-design.txt b/Documentation/rt-mutex-design.txt
index 4b736d24da7a..8df0b782c4d7 100644
--- a/Documentation/rt-mutex-design.txt
+++ b/Documentation/rt-mutex-design.txt
@@ -657,7 +657,7 @@ here.
657 657
658The waiter structure has a "task" field that points to the task that is blocked 658The waiter structure has a "task" field that points to the task that is blocked
659on the mutex. This field can be NULL the first time it goes through the loop 659on the mutex. This field can be NULL the first time it goes through the loop
660or if the task is a pending owner and had it's mutex stolen. If the "task" 660or if the task is a pending owner and had its mutex stolen. If the "task"
661field is NULL then we need to set up the accounting for it. 661field is NULL then we need to set up the accounting for it.
662 662
663Task blocks on mutex 663Task blocks on mutex
diff --git a/Documentation/s390/CommonIO b/Documentation/s390/CommonIO
index 339207d11d95..d378cba66456 100644
--- a/Documentation/s390/CommonIO
+++ b/Documentation/s390/CommonIO
@@ -87,6 +87,12 @@ Command line parameters
87 compatibility, by the device number in hexadecimal (0xabcd or abcd). Device 87 compatibility, by the device number in hexadecimal (0xabcd or abcd). Device
88 numbers given as 0xabcd will be interpreted as 0.0.abcd. 88 numbers given as 0xabcd will be interpreted as 0.0.abcd.
89 89
90* /proc/cio_settle
91
92 A write request to this file is blocked until all queued cio actions are
93 handled. This will allow userspace to wait for pending work affecting
94 device availability after changing cio_ignore or the hardware configuration.
95
90* For some of the information present in the /proc filesystem in 2.4 (namely, 96* For some of the information present in the /proc filesystem in 2.4 (namely,
91 /proc/subchannels and /proc/chpids), see driver-model.txt. 97 /proc/subchannels and /proc/chpids), see driver-model.txt.
92 Information formerly in /proc/irq_count is now in /proc/interrupts. 98 Information formerly in /proc/irq_count is now in /proc/interrupts.
diff --git a/Documentation/s390/driver-model.txt b/Documentation/s390/driver-model.txt
index bde473df748d..ed265cf54cde 100644
--- a/Documentation/s390/driver-model.txt
+++ b/Documentation/s390/driver-model.txt
@@ -223,8 +223,8 @@ touched by the driver - it should use the ccwgroup device's driver_data for its
223private data. 223private data.
224 224
225To implement a ccwgroup driver, please refer to include/asm/ccwgroup.h. Keep in 225To implement a ccwgroup driver, please refer to include/asm/ccwgroup.h. Keep in
226mind that most drivers will need to implement both a ccwgroup and a ccw driver 226mind that most drivers will need to implement both a ccwgroup and a ccw
227(unless you have a meta ccw driver, like cu3088 for lcs and ctc). 227driver.
228 228
229 229
2302. Channel paths 2302. Channel paths
diff --git a/Documentation/s390/kvm.txt b/Documentation/s390/kvm.txt
index 6f5ceb0f09fc..85f3280d7ef6 100644
--- a/Documentation/s390/kvm.txt
+++ b/Documentation/s390/kvm.txt
@@ -102,7 +102,7 @@ args: unsigned long
102see also: include/linux/kvm.h 102see also: include/linux/kvm.h
103This ioctl stores the state of the cpu at the guest real address given as 103This ioctl stores the state of the cpu at the guest real address given as
104argument, unless one of the following values defined in include/linux/kvm.h 104argument, unless one of the following values defined in include/linux/kvm.h
105is given as arguement: 105is given as argument:
106KVM_S390_STORE_STATUS_NOADDR - the CPU stores its status to the save area in 106KVM_S390_STORE_STATUS_NOADDR - the CPU stores its status to the save area in
107absolute lowcore as defined by the principles of operation 107absolute lowcore as defined by the principles of operation
108KVM_S390_STORE_STATUS_PREFIXED - the CPU stores its status to the save area in 108KVM_S390_STORE_STATUS_PREFIXED - the CPU stores its status to the save area in
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index 6f33593e59e2..8239ebbcddce 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -211,7 +211,7 @@ provide fair CPU time to each such task group. For example, it may be
211desirable to first provide fair CPU time to each user on the system and then to 211desirable to first provide fair CPU time to each user on the system and then to
212each task belonging to a user. 212each task belonging to a user.
213 213
214CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks to be 214CONFIG_CGROUP_SCHED strives to achieve exactly that. It lets tasks to be
215grouped and divides CPU time fairly among such groups. 215grouped and divides CPU time fairly among such groups.
216 216
217CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and 217CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and
@@ -220,38 +220,11 @@ SCHED_RR) tasks.
220CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and 220CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and
221SCHED_BATCH) tasks. 221SCHED_BATCH) tasks.
222 222
223At present, there are two (mutually exclusive) mechanisms to group tasks for 223 These options need CONFIG_CGROUPS to be defined, and let the administrator
224CPU bandwidth control purposes:
225
226 - Based on user id (CONFIG_USER_SCHED)
227
228 With this option, tasks are grouped according to their user id.
229
230 - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED)
231
232 This options needs CONFIG_CGROUPS to be defined, and lets the administrator
233 create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See 224 create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See
234 Documentation/cgroups/cgroups.txt for more information about this filesystem. 225 Documentation/cgroups/cgroups.txt for more information about this filesystem.
235 226
236Only one of these options to group tasks can be chosen and not both. 227When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
237
238When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new
239user and a "cpu_share" file is added in that directory.
240
241 # cd /sys/kernel/uids
242 # cat 512/cpu_share # Display user 512's CPU share
243 1024
244 # echo 2048 > 512/cpu_share # Modify user 512's CPU share
245 # cat 512/cpu_share # Display user 512's CPU share
246 2048
247 #
248
249CPU bandwidth between two users is divided in the ratio of their CPU shares.
250For example: if you would like user "root" to get twice the bandwidth of user
251"guest," then set the cpu_share for both the users such that "root"'s cpu_share
252is twice "guest"'s cpu_share.
253
254When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each
255group created using the pseudo filesystem. See example steps below to create 228group created using the pseudo filesystem. See example steps below to create
256task groups and modify their CPU share using the "cgroups" pseudo filesystem. 229task groups and modify their CPU share using the "cgroups" pseudo filesystem.
257 230
@@ -273,24 +246,3 @@ task groups and modify their CPU share using the "cgroups" pseudo filesystem.
273 246
274 # #Launch gmplayer (or your favourite movie player) 247 # #Launch gmplayer (or your favourite movie player)
275 # echo <movie_player_pid> > multimedia/tasks 248 # echo <movie_player_pid> > multimedia/tasks
276
2778. Implementation note: user namespaces
278
279User namespaces are intended to be hierarchical. But they are currently
280only partially implemented. Each of those has ramifications for CFS.
281
282First, since user namespaces are hierarchical, the /sys/kernel/uids
283presentation is inadequate. Eventually we will likely want to use sysfs
284tagging to provide private views of /sys/kernel/uids within each user
285namespace.
286
287Second, the hierarchical nature is intended to support completely
288unprivileged use of user namespaces. So if using user groups, then
289we want the users in a user namespace to be children of the user
290who created it.
291
292That is currently unimplemented. So instead, every user in a new
293user namespace will receive 1024 shares just like any user in the
294initial user namespace. Note that at the moment creation of a new
295user namespace requires each of CAP_SYS_ADMIN, CAP_SETUID, and
296CAP_SETGID.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 86eabe6c3419..605b0d40329d 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -126,23 +126,12 @@ priority!
1262.3 Basis for grouping tasks 1262.3 Basis for grouping tasks
127---------------------------- 127----------------------------
128 128
129There are two compile-time settings for allocating CPU bandwidth. These are 129Enabling CONFIG_RT_GROUP_SCHED lets you explicitly allocate real
130configured using the "Basis for grouping tasks" multiple choice menu under 130CPU bandwidth to task groups.
131General setup > Group CPU Scheduler:
132
133a. CONFIG_USER_SCHED (aka "Basis for grouping tasks" = "user id")
134
135This lets you use the virtual files under
136"/sys/kernel/uids/<uid>/cpu_rt_runtime_us" to control he CPU time reserved for
137each user .
138
139The other option is:
140
141.o CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups")
142 131
143This uses the /cgroup virtual file system and 132This uses the /cgroup virtual file system and
144"/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each 133"/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each
145control group instead. 134control group.
146 135
147For more information on working with control groups, you should read 136For more information on working with control groups, you should read
148Documentation/cgroups/cgroups.txt as well. 137Documentation/cgroups/cgroups.txt as well.
@@ -161,8 +150,7 @@ For now, this can be simplified to just the following (but see Future plans):
161=============== 150===============
162 151
163There is work in progress to make the scheduling period for each group 152There is work in progress to make the scheduling period for each group
164("/sys/kernel/uids/<uid>/cpu_rt_period_us" or 153("/cgroup/<cgroup>/cpu.rt_period_us") configurable as well.
165"/cgroup/<cgroup>/cpu.rt_period_us" respectively) configurable as well.
166 154
167The constraint on the period is that a subgroup must have a smaller or 155The constraint on the period is that a subgroup must have a smaller or
168equal period to its parent. But realistically its not very useful _yet_ 156equal period to its parent. But realistically its not very useful _yet_
diff --git a/Documentation/scsi/ChangeLog.lpfc b/Documentation/scsi/ChangeLog.lpfc
index ff19a52fe004..e759e92e286d 100644
--- a/Documentation/scsi/ChangeLog.lpfc
+++ b/Documentation/scsi/ChangeLog.lpfc
@@ -707,7 +707,7 @@ Changes from 20040920 to 20041018
707 * Integrate patches from Christoph Hellwig: two new helpers common 707 * Integrate patches from Christoph Hellwig: two new helpers common
708 to lpfc_sli_resume_iocb and lpfc_sli_issue_iocb - singificant 708 to lpfc_sli_resume_iocb and lpfc_sli_issue_iocb - singificant
709 cleanup of those two functions - the unused SLI_IOCB_USE_TXQ is 709 cleanup of those two functions - the unused SLI_IOCB_USE_TXQ is
710 gone - lpfc_sli_issue_iocb_wait loses it's flags argument 710 gone - lpfc_sli_issue_iocb_wait loses its flags argument
711 totally. 711 totally.
712 * Fix in lpfc_sli.c: we can not store a 5 bit value in a 4-bit 712 * Fix in lpfc_sli.c: we can not store a 5 bit value in a 4-bit
713 field. 713 field.
@@ -989,8 +989,8 @@ Changes from 20040709 to 20040716
989 * Remove redundant port_cmp != 2 check in if 989 * Remove redundant port_cmp != 2 check in if
990 (!port_cmp) { .... if (port_cmp != 2).... } 990 (!port_cmp) { .... if (port_cmp != 2).... }
991 * Clock changes: removed struct clk_data and timerList. 991 * Clock changes: removed struct clk_data and timerList.
992 * Clock changes: seperate nodev_tmo and els_retry_delay into 2 992 * Clock changes: separate nodev_tmo and els_retry_delay into 2
993 seperate timers and convert to 1 argument changed 993 separate timers and convert to 1 argument changed
994 LPFC_NODE_FARP_PEND_t to struct lpfc_node_farp_pend convert 994 LPFC_NODE_FARP_PEND_t to struct lpfc_node_farp_pend convert
995 ipfarp_tmo to 1 argument convert target struct tmofunc and 995 ipfarp_tmo to 1 argument convert target struct tmofunc and
996 rtplunfunc to 1 argument * cr_count, cr_delay and 996 rtplunfunc to 1 argument * cr_count, cr_delay and
@@ -1028,7 +1028,7 @@ Changes from 20040614 to 20040709
1028 * Remove the need for buf_tmo. 1028 * Remove the need for buf_tmo.
1029 * Changed ULP_BDE64 to struct ulp_bde64. 1029 * Changed ULP_BDE64 to struct ulp_bde64.
1030 * Changed ULP_BDE to struct ulp_bde. 1030 * Changed ULP_BDE to struct ulp_bde.
1031 * Cleanup lpfc_os_return_scsi_cmd() and it's call path. 1031 * Cleanup lpfc_os_return_scsi_cmd() and its call path.
1032 * Removed lpfc_no_device_delay. 1032 * Removed lpfc_no_device_delay.
1033 * Consolidating lpfc_hba_put_event() into lpfc_put_event(). 1033 * Consolidating lpfc_hba_put_event() into lpfc_put_event().
1034 * Removed following attributes and their functionality: 1034 * Removed following attributes and their functionality:
@@ -1514,7 +1514,7 @@ Changes from 20040402 to 20040409
1514 * Remove unused elxclock declaration in elx_sli.h. 1514 * Remove unused elxclock declaration in elx_sli.h.
1515 * Since everywhere IOCB_ENTRY is used, the return value is cast, 1515 * Since everywhere IOCB_ENTRY is used, the return value is cast,
1516 move the cast into the macro. 1516 move the cast into the macro.
1517 * Split ioctls out into seperate files 1517 * Split ioctls out into separate files
1518 1518
1519Changes from 20040326 to 20040402 1519Changes from 20040326 to 20040402
1520 1520
@@ -1534,7 +1534,7 @@ Changes from 20040326 to 20040402
1534 * Unused variable cleanup 1534 * Unused variable cleanup
1535 * Use Linux list macros for DMABUF_t 1535 * Use Linux list macros for DMABUF_t
1536 * Break up ioctls into 3 sections, dfc, util, hbaapi 1536 * Break up ioctls into 3 sections, dfc, util, hbaapi
1537 rearranged code so this could be easily seperated into a 1537 rearranged code so this could be easily separated into a
1538 differnet module later All 3 are currently turned on by 1538 differnet module later All 3 are currently turned on by
1539 defines in lpfc_ioctl.c LPFC_DFC_IOCTL, LPFC_UTIL_IOCTL, 1539 defines in lpfc_ioctl.c LPFC_DFC_IOCTL, LPFC_UTIL_IOCTL,
1540 LPFC_HBAAPI_IOCTL 1540 LPFC_HBAAPI_IOCTL
@@ -1551,7 +1551,7 @@ Changes from 20040326 to 20040402
1551 started by lpfc_online(). lpfc_offline() only stopped 1551 started by lpfc_online(). lpfc_offline() only stopped
1552 els_timeout routine. It now stops all timeout routines 1552 els_timeout routine. It now stops all timeout routines
1553 associated with that hba. 1553 associated with that hba.
1554 * Replace seperate next and prev pointers in struct 1554 * Replace separate next and prev pointers in struct
1555 lpfc_bindlist with list_head type. In elxHBA_t, replace 1555 lpfc_bindlist with list_head type. In elxHBA_t, replace
1556 fc_nlpbind_start and _end with fc_nlpbind_list and use 1556 fc_nlpbind_start and _end with fc_nlpbind_list and use
1557 list_head macros to access it. 1557 list_head macros to access it.
diff --git a/Documentation/scsi/ChangeLog.megaraid_sas b/Documentation/scsi/ChangeLog.megaraid_sas
index 17ffa0607712..30023568805e 100644
--- a/Documentation/scsi/ChangeLog.megaraid_sas
+++ b/Documentation/scsi/ChangeLog.megaraid_sas
@@ -1,3 +1,19 @@
11 Release Date : Thur. Oct 29, 2009 09:12:45 PST 2009 -
2 (emaild-id:megaraidlinux@lsi.com)
3 Bo Yang
4
52 Current Version : 00.00.04.17.1-rc1
63 Older Version : 00.00.04.12
7
81. Add the pad_0 in mfi frame structure to 0 to fix the
9 context value larger than 32bit value issue.
10
112. Add the logic drive list to the driver. Driver will
12 keep the logic drive list internal after driver load.
13
143. driver fixed the device update issue after get the AEN
15 PD delete/ADD, LD add/delete from FW.
16
11 Release Date : Tues. July 28, 2009 10:12:45 PST 2009 - 171 Release Date : Tues. July 28, 2009 10:12:45 PST 2009 -
2 (emaild-id:megaraidlinux@lsi.com) 18 (emaild-id:megaraidlinux@lsi.com)
3 Bo Yang 19 Bo Yang
diff --git a/Documentation/scsi/FlashPoint.txt b/Documentation/scsi/FlashPoint.txt
index d5acaa300a46..1540a92f6d2b 100644
--- a/Documentation/scsi/FlashPoint.txt
+++ b/Documentation/scsi/FlashPoint.txt
@@ -71,7 +71,7 @@ peters@mylex.com
71 71
72Ever since its introduction last October, the BusLogic FlashPoint LT has 72Ever since its introduction last October, the BusLogic FlashPoint LT has
73been problematic for members of the Linux community, in that no Linux 73been problematic for members of the Linux community, in that no Linux
74drivers have been available for this new Ultra SCSI product. Despite it's 74drivers have been available for this new Ultra SCSI product. Despite its
75officially being positioned as a desktop workstation product, and not being 75officially being positioned as a desktop workstation product, and not being
76particularly well suited for a high performance multitasking operating 76particularly well suited for a high performance multitasking operating
77system like Linux, the FlashPoint LT has been touted by computer system 77system like Linux, the FlashPoint LT has been touted by computer system
diff --git a/Documentation/scsi/dtc3x80.txt b/Documentation/scsi/dtc3x80.txt
index e8ae6230ab3e..1d7af9f9a8ed 100644
--- a/Documentation/scsi/dtc3x80.txt
+++ b/Documentation/scsi/dtc3x80.txt
@@ -12,7 +12,7 @@ The 3180 does not. Otherwise, they are identical.
12The DTC3x80 does not support DMA but it does have Pseudo-DMA which is 12The DTC3x80 does not support DMA but it does have Pseudo-DMA which is
13supported by the driver. 13supported by the driver.
14 14
15It's DTC406 scsi chip is supposedly compatible with the NCR 53C400. 15Its DTC406 scsi chip is supposedly compatible with the NCR 53C400.
16It is memory mapped, uses an IRQ, but no dma or io-port. There is 16It is memory mapped, uses an IRQ, but no dma or io-port. There is
17internal DMA, between SCSI bus and an on-chip 128-byte buffer. Double 17internal DMA, between SCSI bus and an on-chip 128-byte buffer. Double
18buffering is done automagically by the chip. Data is transferred 18buffering is done automagically by the chip. Data is transferred
diff --git a/Documentation/scsi/ncr53c8xx.txt b/Documentation/scsi/ncr53c8xx.txt
index 08e2b4d04aab..cda5f8fa2c66 100644
--- a/Documentation/scsi/ncr53c8xx.txt
+++ b/Documentation/scsi/ncr53c8xx.txt
@@ -1479,7 +1479,7 @@ Wide16 SCSI.
1479Enabling serial NVRAM support enables detection of the serial NVRAM included 1479Enabling serial NVRAM support enables detection of the serial NVRAM included
1480on Symbios and some Symbios compatible host adaptors, and Tekram boards. The 1480on Symbios and some Symbios compatible host adaptors, and Tekram boards. The
1481serial NVRAM is used by Symbios and Tekram to hold set up parameters for the 1481serial NVRAM is used by Symbios and Tekram to hold set up parameters for the
1482host adaptor and it's attached drives. 1482host adaptor and its attached drives.
1483 1483
1484The Symbios NVRAM also holds data on the boot order of host adaptors in a 1484The Symbios NVRAM also holds data on the boot order of host adaptors in a
1485system with more than one host adaptor. This enables the order of scanning 1485system with more than one host adaptor. This enables the order of scanning
diff --git a/Documentation/scsi/osst.txt b/Documentation/scsi/osst.txt
index f536907e241d..2b21890bc983 100644
--- a/Documentation/scsi/osst.txt
+++ b/Documentation/scsi/osst.txt
@@ -40,7 +40,7 @@ behavior looks very much the same as st to the userspace applications.
40 40
41History 41History
42------- 42-------
43In the first place, osst shared it's identity very much with st. That meant 43In the first place, osst shared its identity very much with st. That meant
44that it used the same kernel structures and the same device node as st. 44that it used the same kernel structures and the same device node as st.
45So you could only have either of them being present in the kernel. This has 45So you could only have either of them being present in the kernel. This has
46been fixed by registering an own device, now. 46been fixed by registering an own device, now.
diff --git a/Documentation/scsi/scsi_fc_transport.txt b/Documentation/scsi/scsi_fc_transport.txt
index aec6549ab097..e00192de4d1c 100644
--- a/Documentation/scsi/scsi_fc_transport.txt
+++ b/Documentation/scsi/scsi_fc_transport.txt
@@ -70,7 +70,7 @@ Overview:
70 up to an administrative entity controlling the vport. For example, 70 up to an administrative entity controlling the vport. For example,
71 if vports are to be associated with virtual machines, a XEN mgmt 71 if vports are to be associated with virtual machines, a XEN mgmt
72 utility would be responsible for creating wwpn/wwnn's for the vport, 72 utility would be responsible for creating wwpn/wwnn's for the vport,
73 using it's own naming authority and OUI. (Note: it already does this 73 using its own naming authority and OUI. (Note: it already does this
74 for virtual MAC addresses). 74 for virtual MAC addresses).
75 75
76 76
@@ -81,7 +81,7 @@ Device Trees and Vport Objects:
81 with rports and scsi target objects underneath it. Currently the FC 81 with rports and scsi target objects underneath it. Currently the FC
82 transport creates the vport object and places it under the scsi_host 82 transport creates the vport object and places it under the scsi_host
83 object corresponding to the physical adapter. The LLDD will allocate 83 object corresponding to the physical adapter. The LLDD will allocate
84 a new scsi_host for the vport and link it's object under the vport. 84 a new scsi_host for the vport and link its object under the vport.
85 The remainder of the tree under the vports scsi_host is the same 85 The remainder of the tree under the vports scsi_host is the same
86 as the non-NPIV case. The transport is written currently to easily 86 as the non-NPIV case. The transport is written currently to easily
87 allow the parent of the vport to be something other than the scsi_host. 87 allow the parent of the vport to be something other than the scsi_host.
diff --git a/Documentation/scsi/sym53c8xx_2.txt b/Documentation/scsi/sym53c8xx_2.txt
index eb9a7b905b64..6f63b7989679 100644
--- a/Documentation/scsi/sym53c8xx_2.txt
+++ b/Documentation/scsi/sym53c8xx_2.txt
@@ -687,7 +687,7 @@ maintain the driver code.
687Enabling serial NVRAM support enables detection of the serial NVRAM included 687Enabling serial NVRAM support enables detection of the serial NVRAM included
688on Symbios and some Symbios compatible host adaptors, and Tekram boards. The 688on Symbios and some Symbios compatible host adaptors, and Tekram boards. The
689serial NVRAM is used by Symbios and Tekram to hold set up parameters for the 689serial NVRAM is used by Symbios and Tekram to hold set up parameters for the
690host adaptor and it's attached drives. 690host adaptor and its attached drives.
691 691
692The Symbios NVRAM also holds data on the boot order of host adaptors in a 692The Symbios NVRAM also holds data on the boot order of host adaptors in a
693system with more than one host adaptor. This information is no longer used 693system with more than one host adaptor. This information is no longer used
diff --git a/Documentation/serial/tty.txt b/Documentation/serial/tty.txt
index 5e5349a4fcd2..7c900507279f 100644
--- a/Documentation/serial/tty.txt
+++ b/Documentation/serial/tty.txt
@@ -105,6 +105,10 @@ write_wakeup() - May be called at any point between open and close.
105 is permitted to call the driver write method from 105 is permitted to call the driver write method from
106 this function. In such a situation defer it. 106 this function. In such a situation defer it.
107 107
108dcd_change() - Report to the tty line the current DCD pin status
109 changes and the relative timestamp. The timestamp
110 can be NULL.
111
108 112
109Driver Access 113Driver Access
110 114
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
index 8923597bd2bd..2075bbb8b3e2 100644
--- a/Documentation/sound/alsa/ALSA-Configuration.txt
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -227,6 +227,16 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
227 227
228 The power-management is supported. 228 The power-management is supported.
229 229
230 Module snd-asihpi
231 -----------------
232
233 Module for AudioScience ASI soundcards
234
235 enable_hpi_hwdep - enable HPI hwdep for AudioScience soundcard
236
237 This module supports multiple cards.
238 The driver requires the firmware loader support on kernel.
239
230 Module snd-atiixp 240 Module snd-atiixp
231 ----------------- 241 -----------------
232 242
@@ -482,6 +492,9 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
482 492
483 reference_rate - reference sample rate, 44100 or 48000 (default) 493 reference_rate - reference sample rate, 44100 or 48000 (default)
484 multiple - multiple to ref. sample rate, 1 or 2 (default) 494 multiple - multiple to ref. sample rate, 1 or 2 (default)
495 subsystem - override the PCI SSID for probing; the value
496 consists of SSVID << 16 | SSDID. The default is
497 zero, which means no override.
485 498
486 This module supports multiple cards. 499 This module supports multiple cards.
487 500
@@ -619,28 +632,23 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
619 632
620 The power-management is supported. 633 The power-management is supported.
621 634
622 Module snd-es968
623 ----------------
624
625 Module for sound cards based on ESS ES968 chip (PnP only).
626
627 This module supports multiple cards, PnP and autoprobe.
628
629 The power-management is supported.
630
631 Module snd-es1688 635 Module snd-es1688
632 ----------------- 636 -----------------
633 637
634 Module for ESS AudioDrive ES-1688 and ES-688 sound cards. 638 Module for ESS AudioDrive ES-1688 and ES-688 sound cards.
635 639
636 port - port # for ES-1688 chip (0x220,0x240,0x260) 640 isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
637 fm_port - port # for OPL3 (option; share the same port as default)
638 mpu_port - port # for MPU-401 port (0x300,0x310,0x320,0x330), -1 = disable (default) 641 mpu_port - port # for MPU-401 port (0x300,0x310,0x320,0x330), -1 = disable (default)
639 irq - IRQ # for ES-1688 chip (5,7,9,10)
640 mpu_irq - IRQ # for MPU-401 port (5,7,9,10) 642 mpu_irq - IRQ # for MPU-401 port (5,7,9,10)
643 fm_port - port # for OPL3 (option; share the same port as default)
644
645 with isapnp=0, the following additional options are available:
646 port - port # for ES-1688 chip (0x220,0x240,0x260)
647 irq - IRQ # for ES-1688 chip (5,7,9,10)
641 dma8 - DMA # for ES-1688 chip (0,1,3) 648 dma8 - DMA # for ES-1688 chip (0,1,3)
642 649
643 This module supports multiple cards and autoprobe (without MPU-401 port). 650 This module supports multiple cards and autoprobe (without MPU-401 port)
651 and PnP with the ES968 chip.
644 652
645 Module snd-es18xx 653 Module snd-es18xx
646 ----------------- 654 -----------------
@@ -1123,6 +1131,21 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
1123 1131
1124 This module supports multiple cards, autoprobe and ISA PnP. 1132 This module supports multiple cards, autoprobe and ISA PnP.
1125 1133
1134 Module snd-jazz16
1135 -------------------
1136
1137 Module for Media Vision Jazz16 chipset. The chipset consists of 3 chips:
1138 MVD1216 + MVA416 + MVA514.
1139
1140 port - port # for SB DSP chip (0x210,0x220,0x230,0x240,0x250,0x260)
1141 irq - IRQ # for SB DSP chip (3,5,7,9,10,15)
1142 dma8 - DMA # for SB DSP chip (1,3)
1143 dma16 - DMA # for SB DSP chip (5,7)
1144 mpu_port - MPU-401 port # (0x300,0x310,0x320,0x330)
1145 mpu_irq - MPU-401 irq # (2,3,5,7)
1146
1147 This module supports multiple cards.
1148
1126 Module snd-korg1212 1149 Module snd-korg1212
1127 ------------------- 1150 -------------------
1128 1151
@@ -1791,6 +1814,13 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
1791 1814
1792 The power-management is supported. 1815 The power-management is supported.
1793 1816
1817 Module snd-ua101
1818 ----------------
1819
1820 Module for the Edirol UA-101/UA-1000 audio/MIDI interfaces.
1821
1822 This module supports multiple devices, autoprobe and hotplugging.
1823
1794 Module snd-usb-audio 1824 Module snd-usb-audio
1795 -------------------- 1825 --------------------
1796 1826
@@ -1923,7 +1953,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
1923 ------------------- 1953 -------------------
1924 1954
1925 Module for sound cards based on the Asus AV100/AV200 chips, 1955 Module for sound cards based on the Asus AV100/AV200 chips,
1926 i.e., Xonar D1, DX, D2, D2X, HDAV1.3 (Deluxe), Essence ST 1956 i.e., Xonar D1, DX, D2, D2X, DS, HDAV1.3 (Deluxe), Essence ST
1927 (Deluxe) and Essence STX. 1957 (Deluxe) and Essence STX.
1928 1958
1929 This module supports autoprobe and multiple cards. 1959 This module supports autoprobe and multiple cards.
diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt
index e72cee9e2a71..1d38b0dfba95 100644
--- a/Documentation/sound/alsa/HD-Audio-Models.txt
+++ b/Documentation/sound/alsa/HD-Audio-Models.txt
@@ -124,6 +124,8 @@ ALC882/883/885/888/889
124 asus-a7m ASUS A7M 124 asus-a7m ASUS A7M
125 macpro MacPro support 125 macpro MacPro support
126 mb5 Macbook 5,1 126 mb5 Macbook 5,1
127 macmini3 Macmini 3,1
128 mba21 Macbook Air 2,1
127 mbp3 Macbook Pro rev3 129 mbp3 Macbook Pro rev3
128 imac24 iMac 24'' with jack detection 130 imac24 iMac 24'' with jack detection
129 imac91 iMac 9,1 131 imac91 iMac 9,1
@@ -279,13 +281,16 @@ Conexant 5051
279 laptop Basic Laptop config (default) 281 laptop Basic Laptop config (default)
280 hp HP Spartan laptop 282 hp HP Spartan laptop
281 hp-dv6736 HP dv6736 283 hp-dv6736 HP dv6736
284 hp-f700 HP Compaq Presario F700
282 lenovo-x200 Lenovo X200 laptop 285 lenovo-x200 Lenovo X200 laptop
286 toshiba Toshiba Satellite M300
283 287
284Conexant 5066 288Conexant 5066
285============= 289=============
286 laptop Basic Laptop config (default) 290 laptop Basic Laptop config (default)
287 dell-laptop Dell laptops 291 dell-laptop Dell laptops
288 olpc-xo-1_5 OLPC XO 1.5 292 olpc-xo-1_5 OLPC XO 1.5
293 ideapad Lenovo IdeaPad U150
289 294
290STAC9200 295STAC9200
291======== 296========
diff --git a/Documentation/sound/alsa/HD-Audio.txt b/Documentation/sound/alsa/HD-Audio.txt
index 6325bec06a72..bdafdbd32561 100644
--- a/Documentation/sound/alsa/HD-Audio.txt
+++ b/Documentation/sound/alsa/HD-Audio.txt
@@ -119,10 +119,18 @@ the codec slots 0 and 1 no matter what the hardware reports.
119 119
120Interrupt Handling 120Interrupt Handling
121~~~~~~~~~~~~~~~~~~ 121~~~~~~~~~~~~~~~~~~
122In rare but some cases, the interrupt isn't properly handled as 122HD-audio driver uses MSI as default (if available) since 2.6.33
123default. You would notice this by the DMA transfer error reported by 123kernel as MSI works better on some machines, and in general, it's
124ALSA PCM core, for example. Using MSI might help in such a case. 124better for performance. However, Nvidia controllers showed bad
125Pass `enable_msi=1` option for enabling MSI. 125regressions with MSI (especially in a combination with AMD chipset),
126thus we disabled MSI for them.
127
128There seem also still other devices that don't work with MSI. If you
129see a regression wrt the sound quality (stuttering, etc) or a lock-up
130in the recent kernel, try to pass `enable_msi=0` option to disable
131MSI. If it works, you can add the known bad device to the blacklist
132defined in hda_intel.c. In such a case, please report and give the
133patch back to the upstream developer.
126 134
127 135
128HD-AUDIO CODEC 136HD-AUDIO CODEC
@@ -196,7 +204,6 @@ generic parser regardless of the codec. Usually the codec-specific
196parser is much better than the generic parser (as now). Thus this 204parser is much better than the generic parser (as now). Thus this
197option is more about the debugging purpose. 205option is more about the debugging purpose.
198 206
199
200Speaker and Headphone Output 207Speaker and Headphone Output
201~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 208~~~~~~~~~~~~~~~~~~~~~~~~~~~~
202One of the most frequent (and obvious) bugs with HD-audio is the 209One of the most frequent (and obvious) bugs with HD-audio is the
@@ -452,6 +459,33 @@ Similarly, the lines after `[verb]` are parsed as `init_verbs`
452sysfs entries, and the lines after `[hint]` are parsed as `hints` 459sysfs entries, and the lines after `[hint]` are parsed as `hints`
453sysfs entries, respectively. 460sysfs entries, respectively.
454 461
462Another example to override the codec vendor id from 0x12345678 to
4630xdeadbeef is like below:
464------------------------------------------------------------------------
465 [codec]
466 0x12345678 0xabcd1234 2
467
468 [vendor_id]
469 0xdeadbeef
470------------------------------------------------------------------------
471
472In the similar way, you can override the codec subsystem_id via
473`[subsystem_id]`, the revision id via `[revision_id]` line.
474Also, the codec chip name can be rewritten via `[chip_name]` line.
475------------------------------------------------------------------------
476 [codec]
477 0x12345678 0xabcd1234 2
478
479 [subsystem_id]
480 0xffff1111
481
482 [revision_id]
483 0x10
484
485 [chip_name]
486 My-own NEWS-0002
487------------------------------------------------------------------------
488
455The hd-audio driver reads the file via request_firmware(). Thus, 489The hd-audio driver reads the file via request_firmware(). Thus,
456a patch file has to be located on the appropriate firmware path, 490a patch file has to be located on the appropriate firmware path,
457typically, /lib/firmware. For example, when you pass the option 491typically, /lib/firmware. For example, when you pass the option
@@ -565,6 +599,9 @@ probing, the proc file is available, so you can get the raw codec
565information before modified by the driver. Of course, the driver 599information before modified by the driver. Of course, the driver
566isn't usable with `probe_only=1`. But you can continue the 600isn't usable with `probe_only=1`. But you can continue the
567configuration via hwdep sysfs file if hda-reconfig option is enabled. 601configuration via hwdep sysfs file if hda-reconfig option is enabled.
602Using `probe_only` mask 2 skips the reset of HDA codecs (use
603`probe_only=3` as module option). The hwdep interface can be used
604to determine the BIOS codec initialization.
568 605
569 606
570hda-verb 607hda-verb
diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt
index 9ac842be9b4f..05bf5a0eee41 100644
--- a/Documentation/sound/alsa/soc/dapm.txt
+++ b/Documentation/sound/alsa/soc/dapm.txt
@@ -188,8 +188,8 @@ The WM8731 output mixer has 3 inputs (sources)
188 3. Mic Sidetone Input 188 3. Mic Sidetone Input
189 189
190Each input in this example has a kcontrol associated with it (defined in example 190Each input in this example has a kcontrol associated with it (defined in example
191above) and is connected to the output mixer via it's kcontrol name. We can now 191above) and is connected to the output mixer via its kcontrol name. We can now
192connect the destination widget (wrt audio signal) with it's source widgets. 192connect the destination widget (wrt audio signal) with its source widgets.
193 193
194 /* output mixer */ 194 /* output mixer */
195 {"Output Mixer", "Line Bypass Switch", "Line Input"}, 195 {"Output Mixer", "Line Bypass Switch", "Line Input"},
diff --git a/Documentation/sound/alsa/soc/machine.txt b/Documentation/sound/alsa/soc/machine.txt
index bab7711ce963..2524c75557df 100644
--- a/Documentation/sound/alsa/soc/machine.txt
+++ b/Documentation/sound/alsa/soc/machine.txt
@@ -67,7 +67,7 @@ static struct snd_soc_dai_link corgi_dai = {
67 .ops = &corgi_ops, 67 .ops = &corgi_ops,
68}; 68};
69 69
70struct snd_soc_card then sets up the machine with it's DAIs. e.g. 70struct snd_soc_card then sets up the machine with its DAIs. e.g.
71 71
72/* corgi audio machine driver */ 72/* corgi audio machine driver */
73static struct snd_soc_card snd_soc_corgi = { 73static struct snd_soc_card snd_soc_corgi = {
diff --git a/Documentation/sound/alsa/soc/overview.txt b/Documentation/sound/alsa/soc/overview.txt
index 1e4c6d3655f2..138ac88c1461 100644
--- a/Documentation/sound/alsa/soc/overview.txt
+++ b/Documentation/sound/alsa/soc/overview.txt
@@ -33,7 +33,7 @@ features :-
33 and machines. 33 and machines.
34 34
35 * Easy I2S/PCM audio interface setup between codec and SoC. Each SoC 35 * Easy I2S/PCM audio interface setup between codec and SoC. Each SoC
36 interface and codec registers it's audio interface capabilities with the 36 interface and codec registers its audio interface capabilities with the
37 core and are subsequently matched and configured when the application 37 core and are subsequently matched and configured when the application
38 hardware parameters are known. 38 hardware parameters are known.
39 39
diff --git a/Documentation/sparse.txt b/Documentation/sparse.txt
index 34c76a55bc04..9b659c79a547 100644
--- a/Documentation/sparse.txt
+++ b/Documentation/sparse.txt
@@ -54,12 +54,12 @@ Getting sparse
54~~~~~~~~~~~~~~ 54~~~~~~~~~~~~~~
55 55
56You can get latest released versions from the Sparse homepage at 56You can get latest released versions from the Sparse homepage at
57http://www.kernel.org/pub/linux/kernel/people/josh/sparse/ 57https://sparse.wiki.kernel.org/index.php/Main_Page
58 58
59Alternatively, you can get snapshots of the latest development version 59Alternatively, you can get snapshots of the latest development version
60of sparse using git to clone.. 60of sparse using git to clone..
61 61
62 git://git.kernel.org/pub/scm/linux/kernel/git/josh/sparse.git 62 git://git.kernel.org/pub/scm/devel/sparse/sparse.git
63 63
64DaveJ has hourly generated tarballs of the git tree available at.. 64DaveJ has hourly generated tarballs of the git tree available at..
65 65
diff --git a/Documentation/spi/ep93xx_spi b/Documentation/spi/ep93xx_spi
new file mode 100644
index 000000000000..6325f5b48635
--- /dev/null
+++ b/Documentation/spi/ep93xx_spi
@@ -0,0 +1,95 @@
1Cirrus EP93xx SPI controller driver HOWTO
2=========================================
3
4ep93xx_spi driver brings SPI master support for EP93xx SPI controller. Chip
5selects are implemented with GPIO lines.
6
7NOTE: If possible, don't use SFRMOUT (SFRM1) signal as a chip select. It will
8not work correctly (it cannot be controlled by software). Use GPIO lines
9instead.
10
11Sample configuration
12====================
13
14Typically driver configuration is done in platform board files (the files under
15arch/arm/mach-ep93xx/*.c). In this example we configure MMC over SPI through
16this driver on TS-7260 board. You can adapt the code to suit your needs.
17
18This example uses EGPIO9 as SD/MMC card chip select (this is wired in DIO1
19header on the board).
20
21You need to select CONFIG_MMC_SPI to use mmc_spi driver.
22
23arch/arm/mach-ep93xx/ts72xx.c:
24
25...
26#include <linux/gpio.h>
27#include <linux/spi/spi.h>
28
29#include <mach/ep93xx_spi.h>
30
31/* this is our GPIO line used for chip select */
32#define MMC_CHIP_SELECT_GPIO EP93XX_GPIO_LINE_EGPIO9
33
34static int ts72xx_mmc_spi_setup(struct spi_device *spi)
35{
36 int err;
37
38 err = gpio_request(MMC_CHIP_SELECT_GPIO, spi->modalias);
39 if (err)
40 return err;
41
42 gpio_direction_output(MMC_CHIP_SELECT_GPIO, 1);
43
44 return 0;
45}
46
47static void ts72xx_mmc_spi_cleanup(struct spi_device *spi)
48{
49 gpio_set_value(MMC_CHIP_SELECT_GPIO, 1);
50 gpio_direction_input(MMC_CHIP_SELECT_GPIO);
51 gpio_free(MMC_CHIP_SELECT_GPIO);
52}
53
54static void ts72xx_mmc_spi_cs_control(struct spi_device *spi, int value)
55{
56 gpio_set_value(MMC_CHIP_SELECT_GPIO, value);
57}
58
59static struct ep93xx_spi_chip_ops ts72xx_mmc_spi_ops = {
60 .setup = ts72xx_mmc_spi_setup,
61 .cleanup = ts72xx_mmc_spi_cleanup,
62 .cs_control = ts72xx_mmc_spi_cs_control,
63};
64
65static struct spi_board_info ts72xx_spi_devices[] __initdata = {
66 {
67 .modalias = "mmc_spi",
68 .controller_data = &ts72xx_mmc_spi_ops,
69 /*
70 * We use 10 MHz even though the maximum is 7.4 MHz. The driver
71 * will limit it automatically to max. frequency.
72 */
73 .max_speed_hz = 10 * 1000 * 1000,
74 .bus_num = 0,
75 .chip_select = 0,
76 .mode = SPI_MODE_0,
77 },
78};
79
80static struct ep93xx_spi_info ts72xx_spi_info = {
81 .num_chipselect = ARRAY_SIZE(ts72xx_spi_devices),
82};
83
84static void __init ts72xx_init_machine(void)
85{
86 ...
87 ep93xx_register_spi(&ts72xx_spi_info, ts72xx_spi_devices,
88 ARRAY_SIZE(ts72xx_spi_devices));
89}
90
91Thanks to
92=========
93Martin Guy, H. Hartley Sweeten and others who helped me during development of
94the driver. Simplemachines.it donated me a Sim.One board which I used testing
95the driver on EP9307.
diff --git a/Documentation/spi/spidev_fdx.c b/Documentation/spi/spidev_fdx.c
index fc354f760384..36ec0774ca0b 100644
--- a/Documentation/spi/spidev_fdx.c
+++ b/Documentation/spi/spidev_fdx.c
@@ -58,10 +58,10 @@ static void do_msg(int fd, int len)
58 len = sizeof buf; 58 len = sizeof buf;
59 59
60 buf[0] = 0xaa; 60 buf[0] = 0xaa;
61 xfer[0].tx_buf = (__u64) buf; 61 xfer[0].tx_buf = (unsigned long)buf;
62 xfer[0].len = 1; 62 xfer[0].len = 1;
63 63
64 xfer[1].rx_buf = (__u64) buf; 64 xfer[1].rx_buf = (unsigned long) buf;
65 xfer[1].len = len; 65 xfer[1].len = len;
66 66
67 status = ioctl(fd, SPI_IOC_MESSAGE(2), xfer); 67 status = ioctl(fd, SPI_IOC_MESSAGE(2), xfer);
diff --git a/Documentation/spi/spidev_test.c b/Documentation/spi/spidev_test.c
index 10abd3773e49..16feda901469 100644
--- a/Documentation/spi/spidev_test.c
+++ b/Documentation/spi/spidev_test.c
@@ -58,7 +58,7 @@ static void transfer(int fd)
58 }; 58 };
59 59
60 ret = ioctl(fd, SPI_IOC_MESSAGE(1), &tr); 60 ret = ioctl(fd, SPI_IOC_MESSAGE(1), &tr);
61 if (ret == 1) 61 if (ret < 1)
62 pabort("can't send spi message"); 62 pabort("can't send spi message");
63 63
64 for (ret = 0; ret < ARRAY_SIZE(tx); ret++) { 64 for (ret = 0; ret < ARRAY_SIZE(tx); ret++) {
diff --git a/Documentation/stable_kernel_rules.txt b/Documentation/stable_kernel_rules.txt
index 5effa5bd993b..e213f45cf9d7 100644
--- a/Documentation/stable_kernel_rules.txt
+++ b/Documentation/stable_kernel_rules.txt
@@ -18,16 +18,15 @@ Rules on what kind of patches are accepted, and which ones are not, into the
18 - It cannot contain any "trivial" fixes in it (spelling changes, 18 - It cannot contain any "trivial" fixes in it (spelling changes,
19 whitespace cleanups, etc). 19 whitespace cleanups, etc).
20 - It must follow the Documentation/SubmittingPatches rules. 20 - It must follow the Documentation/SubmittingPatches rules.
21 - It or an equivalent fix must already exist in Linus' tree. Quote the 21 - It or an equivalent fix must already exist in Linus' tree (upstream).
22 respective commit ID in Linus' tree in your patch submission to -stable.
23 22
24 23
25Procedure for submitting patches to the -stable tree: 24Procedure for submitting patches to the -stable tree:
26 25
27 - Send the patch, after verifying that it follows the above rules, to 26 - Send the patch, after verifying that it follows the above rules, to
28 stable@kernel.org. 27 stable@kernel.org. You must note the upstream commit ID in the changelog
29 - To have the patch automatically included in the stable tree, add the 28 of your submission.
30 the tag 29 - To have the patch automatically included in the stable tree, add the tag
31 Cc: stable@kernel.org 30 Cc: stable@kernel.org
32 in the sign-off area. Once the patch is merged it will be applied to 31 in the sign-off area. Once the patch is merged it will be applied to
33 the stable tree without anything else needing to be done by the author 32 the stable tree without anything else needing to be done by the author
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index df38ef046f8d..cbd05ffc606b 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -84,6 +84,16 @@ netdev_max_backlog
84Maximum number of packets, queued on the INPUT side, when the interface 84Maximum number of packets, queued on the INPUT side, when the interface
85receives packets faster than kernel can process them. 85receives packets faster than kernel can process them.
86 86
87netdev_tstamp_prequeue
88----------------------
89
90If set to 0, RX packet timestamps can be sampled after RPS processing, when
91the target CPU processes packets. It might give some delay on timestamps, but
92permit to distribute the load on several cpus.
93
94If set to 1 (default), timestamps are sampled as soon as possible, before
95queueing.
96
87optmem_max 97optmem_max
88---------- 98----------
89 99
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index fc5790d36cd9..5fdbb612aeb8 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -19,6 +19,7 @@ files can be found in mm/swap.c.
19Currently, these files are in /proc/sys/vm: 19Currently, these files are in /proc/sys/vm:
20 20
21- block_dump 21- block_dump
22- compact_memory
22- dirty_background_bytes 23- dirty_background_bytes
23- dirty_background_ratio 24- dirty_background_ratio
24- dirty_bytes 25- dirty_bytes
@@ -26,6 +27,7 @@ Currently, these files are in /proc/sys/vm:
26- dirty_ratio 27- dirty_ratio
27- dirty_writeback_centisecs 28- dirty_writeback_centisecs
28- drop_caches 29- drop_caches
30- extfrag_threshold
29- hugepages_treat_as_movable 31- hugepages_treat_as_movable
30- hugetlb_shm_group 32- hugetlb_shm_group
31- laptop_mode 33- laptop_mode
@@ -64,6 +66,15 @@ information on block I/O debugging is in Documentation/laptops/laptop-mode.txt.
64 66
65============================================================== 67==============================================================
66 68
69compact_memory
70
71Available only when CONFIG_COMPACTION is set. When 1 is written to the file,
72all zones are compacted such that free memory is available in contiguous
73blocks where possible. This can be important for example in the allocation of
74huge pages although processes will also directly compact memory as required.
75
76==============================================================
77
67dirty_background_bytes 78dirty_background_bytes
68 79
69Contains the amount of dirty memory at which the pdflush background writeback 80Contains the amount of dirty memory at which the pdflush background writeback
@@ -139,6 +150,20 @@ user should run `sync' first.
139 150
140============================================================== 151==============================================================
141 152
153extfrag_threshold
154
155This parameter affects whether the kernel will compact memory or direct
156reclaim to satisfy a high-order allocation. /proc/extfrag_index shows what
157the fragmentation index for each order is in each zone in the system. Values
158tending towards 0 imply allocations would fail due to lack of memory,
159values towards 1000 imply failures are due to fragmentation and -1 implies
160that the allocation will succeed as long as watermarks are met.
161
162The kernel will not compact memory in a zone if the
163fragmentation index is <= extfrag_threshold. The default value is 500.
164
165==============================================================
166
142hugepages_treat_as_movable 167hugepages_treat_as_movable
143 168
144This parameter is only useful when kernelcore= is specified at boot time to 169This parameter is only useful when kernelcore= is specified at boot time to
@@ -573,11 +598,14 @@ Because other nodes' memory may be free. This means system total status
573may be not fatal yet. 598may be not fatal yet.
574 599
575If this is set to 2, the kernel panics compulsorily even on the 600If this is set to 2, the kernel panics compulsorily even on the
576above-mentioned. 601above-mentioned. Even oom happens under memory cgroup, the whole
602system panics.
577 603
578The default value is 0. 604The default value is 0.
5791 and 2 are for failover of clustering. Please select either 6051 and 2 are for failover of clustering. Please select either
580according to your policy of failover. 606according to your policy of failover.
607panic_on_oom=2+kdump gives you very strong tool to investigate
608why oom happens. You can get snapshot.
581 609
582============================================================= 610=============================================================
583 611
diff --git a/Documentation/sysfs-rules.txt b/Documentation/sysfs-rules.txt
index 5d8bc2cd250c..c1a1fd636bf9 100644
--- a/Documentation/sysfs-rules.txt
+++ b/Documentation/sysfs-rules.txt
@@ -125,7 +125,7 @@ versions of the sysfs interface.
125- Block 125- Block
126 The converted block subsystem at /sys/class/block or 126 The converted block subsystem at /sys/class/block or
127 /sys/subsystem/block will contain the links for disks and partitions 127 /sys/subsystem/block will contain the links for disks and partitions
128 at the same level, never in a hierarchy. Assuming the block subsytem to 128 at the same level, never in a hierarchy. Assuming the block subsystem to
129 contain only disks and not partition devices in the same flat list is 129 contain only disks and not partition devices in the same flat list is
130 a bug in the application. 130 a bug in the application.
131 131
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index d56a01775423..5c17196c8fe9 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -177,13 +177,13 @@ virtual console (ALT+Fn) and then back again should also help.
177 177
178* I hit SysRq, but nothing seems to happen, what's wrong? 178* I hit SysRq, but nothing seems to happen, what's wrong?
179~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 179~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
180There are some keyboards that send different scancodes for SysRq than the 180There are some keyboards that produce a different keycode for SysRq than the
181pre-defined 0x54. So if SysRq doesn't work out of the box for a certain 181pre-defined value of 99 (see KEY_SYSRQ in include/linux/input.h), or which
182keyboard, run 'showkey -s' to find out the proper scancode sequence. Then 182don't have a SysRq key at all. In these cases, run 'showkey -s' to find an
183use 'setkeycodes <sequence> 84' to define this sequence to the usual SysRq 183appropriate scancode sequence, and use 'setkeycodes <sequence> 99' to map
184code (84 is decimal for 0x54). It's probably best to put this command in a 184this sequence to the usual SysRq code (e.g., 'setkeycodes e05b 99'). It's
185boot script. Oh, and by the way, you exit 'showkey' by not typing anything 185probably best to put this command in a boot script. Oh, and by the way, you
186for ten seconds. 186exit 'showkey' by not typing anything for ten seconds.
187 187
188* I want to add SysRQ key events to a module, how does it work? 188* I want to add SysRQ key events to a module, how does it work?
189~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 189~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Documentation/timers/00-INDEX b/Documentation/timers/00-INDEX
index 397dc35e1323..a9248da5cdbc 100644
--- a/Documentation/timers/00-INDEX
+++ b/Documentation/timers/00-INDEX
@@ -4,6 +4,8 @@ highres.txt
4 - High resolution timers and dynamic ticks design notes 4 - High resolution timers and dynamic ticks design notes
5hpet.txt 5hpet.txt
6 - High Precision Event Timer Driver for Linux 6 - High Precision Event Timer Driver for Linux
7hpet_example.c
8 - sample hpet timer test program
7hrtimers.txt 9hrtimers.txt
8 - subsystem for high-resolution kernel timers 10 - subsystem for high-resolution kernel timers
9timer_stats.txt 11timer_stats.txt
diff --git a/Documentation/timers/Makefile b/Documentation/timers/Makefile
new file mode 100644
index 000000000000..c85625f4ab25
--- /dev/null
+++ b/Documentation/timers/Makefile
@@ -0,0 +1,8 @@
1# kbuild trick to avoid linker error. Can be omitted if a module is built.
2obj- := dummy.o
3
4# List of programs to build
5hostprogs-y := hpet_example
6
7# Tell kbuild to always build the programs
8always := $(hostprogs-y)
diff --git a/Documentation/timers/hpet.txt b/Documentation/timers/hpet.txt
index 16d25e6b5a00..767392ffd31e 100644
--- a/Documentation/timers/hpet.txt
+++ b/Documentation/timers/hpet.txt
@@ -26,274 +26,5 @@ initialization. An example of this initialization can be found in
26arch/x86/kernel/hpet.c. 26arch/x86/kernel/hpet.c.
27 27
28The driver provides a userspace API which resembles the API found in the 28The driver provides a userspace API which resembles the API found in the
29RTC driver framework. An example user space program is provided below. 29RTC driver framework. An example user space program is provided in
30 30file:Documentation/timers/hpet_example.c
31#include <stdio.h>
32#include <stdlib.h>
33#include <unistd.h>
34#include <fcntl.h>
35#include <string.h>
36#include <memory.h>
37#include <malloc.h>
38#include <time.h>
39#include <ctype.h>
40#include <sys/types.h>
41#include <sys/wait.h>
42#include <signal.h>
43#include <fcntl.h>
44#include <errno.h>
45#include <sys/time.h>
46#include <linux/hpet.h>
47
48
49extern void hpet_open_close(int, const char **);
50extern void hpet_info(int, const char **);
51extern void hpet_poll(int, const char **);
52extern void hpet_fasync(int, const char **);
53extern void hpet_read(int, const char **);
54
55#include <sys/poll.h>
56#include <sys/ioctl.h>
57#include <signal.h>
58
59struct hpet_command {
60 char *command;
61 void (*func)(int argc, const char ** argv);
62} hpet_command[] = {
63 {
64 "open-close",
65 hpet_open_close
66 },
67 {
68 "info",
69 hpet_info
70 },
71 {
72 "poll",
73 hpet_poll
74 },
75 {
76 "fasync",
77 hpet_fasync
78 },
79};
80
81int
82main(int argc, const char ** argv)
83{
84 int i;
85
86 argc--;
87 argv++;
88
89 if (!argc) {
90 fprintf(stderr, "-hpet: requires command\n");
91 return -1;
92 }
93
94
95 for (i = 0; i < (sizeof (hpet_command) / sizeof (hpet_command[0])); i++)
96 if (!strcmp(argv[0], hpet_command[i].command)) {
97 argc--;
98 argv++;
99 fprintf(stderr, "-hpet: executing %s\n",
100 hpet_command[i].command);
101 hpet_command[i].func(argc, argv);
102 return 0;
103 }
104
105 fprintf(stderr, "do_hpet: command %s not implemented\n", argv[0]);
106
107 return -1;
108}
109
110void
111hpet_open_close(int argc, const char **argv)
112{
113 int fd;
114
115 if (argc != 1) {
116 fprintf(stderr, "hpet_open_close: device-name\n");
117 return;
118 }
119
120 fd = open(argv[0], O_RDONLY);
121 if (fd < 0)
122 fprintf(stderr, "hpet_open_close: open failed\n");
123 else
124 close(fd);
125
126 return;
127}
128
129void
130hpet_info(int argc, const char **argv)
131{
132}
133
134void
135hpet_poll(int argc, const char **argv)
136{
137 unsigned long freq;
138 int iterations, i, fd;
139 struct pollfd pfd;
140 struct hpet_info info;
141 struct timeval stv, etv;
142 struct timezone tz;
143 long usec;
144
145 if (argc != 3) {
146 fprintf(stderr, "hpet_poll: device-name freq iterations\n");
147 return;
148 }
149
150 freq = atoi(argv[1]);
151 iterations = atoi(argv[2]);
152
153 fd = open(argv[0], O_RDONLY);
154
155 if (fd < 0) {
156 fprintf(stderr, "hpet_poll: open of %s failed\n", argv[0]);
157 return;
158 }
159
160 if (ioctl(fd, HPET_IRQFREQ, freq) < 0) {
161 fprintf(stderr, "hpet_poll: HPET_IRQFREQ failed\n");
162 goto out;
163 }
164
165 if (ioctl(fd, HPET_INFO, &info) < 0) {
166 fprintf(stderr, "hpet_poll: failed to get info\n");
167 goto out;
168 }
169
170 fprintf(stderr, "hpet_poll: info.hi_flags 0x%lx\n", info.hi_flags);
171
172 if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) {
173 fprintf(stderr, "hpet_poll: HPET_EPI failed\n");
174 goto out;
175 }
176
177 if (ioctl(fd, HPET_IE_ON, 0) < 0) {
178 fprintf(stderr, "hpet_poll, HPET_IE_ON failed\n");
179 goto out;
180 }
181
182 pfd.fd = fd;
183 pfd.events = POLLIN;
184
185 for (i = 0; i < iterations; i++) {
186 pfd.revents = 0;
187 gettimeofday(&stv, &tz);
188 if (poll(&pfd, 1, -1) < 0)
189 fprintf(stderr, "hpet_poll: poll failed\n");
190 else {
191 long data;
192
193 gettimeofday(&etv, &tz);
194 usec = stv.tv_sec * 1000000 + stv.tv_usec;
195 usec = (etv.tv_sec * 1000000 + etv.tv_usec) - usec;
196
197 fprintf(stderr,
198 "hpet_poll: expired time = 0x%lx\n", usec);
199
200 fprintf(stderr, "hpet_poll: revents = 0x%x\n",
201 pfd.revents);
202
203 if (read(fd, &data, sizeof(data)) != sizeof(data)) {
204 fprintf(stderr, "hpet_poll: read failed\n");
205 }
206 else
207 fprintf(stderr, "hpet_poll: data 0x%lx\n",
208 data);
209 }
210 }
211
212out:
213 close(fd);
214 return;
215}
216
217static int hpet_sigio_count;
218
219static void
220hpet_sigio(int val)
221{
222 fprintf(stderr, "hpet_sigio: called\n");
223 hpet_sigio_count++;
224}
225
226void
227hpet_fasync(int argc, const char **argv)
228{
229 unsigned long freq;
230 int iterations, i, fd, value;
231 sig_t oldsig;
232 struct hpet_info info;
233
234 hpet_sigio_count = 0;
235 fd = -1;
236
237 if ((oldsig = signal(SIGIO, hpet_sigio)) == SIG_ERR) {
238 fprintf(stderr, "hpet_fasync: failed to set signal handler\n");
239 return;
240 }
241
242 if (argc != 3) {
243 fprintf(stderr, "hpet_fasync: device-name freq iterations\n");
244 goto out;
245 }
246
247 fd = open(argv[0], O_RDONLY);
248
249 if (fd < 0) {
250 fprintf(stderr, "hpet_fasync: failed to open %s\n", argv[0]);
251 return;
252 }
253
254
255 if ((fcntl(fd, F_SETOWN, getpid()) == 1) ||
256 ((value = fcntl(fd, F_GETFL)) == 1) ||
257 (fcntl(fd, F_SETFL, value | O_ASYNC) == 1)) {
258 fprintf(stderr, "hpet_fasync: fcntl failed\n");
259 goto out;
260 }
261
262 freq = atoi(argv[1]);
263 iterations = atoi(argv[2]);
264
265 if (ioctl(fd, HPET_IRQFREQ, freq) < 0) {
266 fprintf(stderr, "hpet_fasync: HPET_IRQFREQ failed\n");
267 goto out;
268 }
269
270 if (ioctl(fd, HPET_INFO, &info) < 0) {
271 fprintf(stderr, "hpet_fasync: failed to get info\n");
272 goto out;
273 }
274
275 fprintf(stderr, "hpet_fasync: info.hi_flags 0x%lx\n", info.hi_flags);
276
277 if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) {
278 fprintf(stderr, "hpet_fasync: HPET_EPI failed\n");
279 goto out;
280 }
281
282 if (ioctl(fd, HPET_IE_ON, 0) < 0) {
283 fprintf(stderr, "hpet_fasync, HPET_IE_ON failed\n");
284 goto out;
285 }
286
287 for (i = 0; i < iterations; i++) {
288 (void) pause();
289 fprintf(stderr, "hpet_fasync: count = %d\n", hpet_sigio_count);
290 }
291
292out:
293 signal(SIGIO, oldsig);
294
295 if (fd >= 0)
296 close(fd);
297
298 return;
299}
diff --git a/Documentation/timers/hpet_example.c b/Documentation/timers/hpet_example.c
new file mode 100644
index 000000000000..4bfafb7bc4c5
--- /dev/null
+++ b/Documentation/timers/hpet_example.c
@@ -0,0 +1,267 @@
1#include <stdio.h>
2#include <stdlib.h>
3#include <unistd.h>
4#include <fcntl.h>
5#include <string.h>
6#include <memory.h>
7#include <malloc.h>
8#include <time.h>
9#include <ctype.h>
10#include <sys/types.h>
11#include <sys/wait.h>
12#include <signal.h>
13#include <errno.h>
14#include <sys/time.h>
15#include <linux/hpet.h>
16
17
18extern void hpet_open_close(int, const char **);
19extern void hpet_info(int, const char **);
20extern void hpet_poll(int, const char **);
21extern void hpet_fasync(int, const char **);
22extern void hpet_read(int, const char **);
23
24#include <sys/poll.h>
25#include <sys/ioctl.h>
26
27struct hpet_command {
28 char *command;
29 void (*func)(int argc, const char ** argv);
30} hpet_command[] = {
31 {
32 "open-close",
33 hpet_open_close
34 },
35 {
36 "info",
37 hpet_info
38 },
39 {
40 "poll",
41 hpet_poll
42 },
43 {
44 "fasync",
45 hpet_fasync
46 },
47};
48
49int
50main(int argc, const char ** argv)
51{
52 int i;
53
54 argc--;
55 argv++;
56
57 if (!argc) {
58 fprintf(stderr, "-hpet: requires command\n");
59 return -1;
60 }
61
62
63 for (i = 0; i < (sizeof (hpet_command) / sizeof (hpet_command[0])); i++)
64 if (!strcmp(argv[0], hpet_command[i].command)) {
65 argc--;
66 argv++;
67 fprintf(stderr, "-hpet: executing %s\n",
68 hpet_command[i].command);
69 hpet_command[i].func(argc, argv);
70 return 0;
71 }
72
73 fprintf(stderr, "do_hpet: command %s not implemented\n", argv[0]);
74
75 return -1;
76}
77
78void
79hpet_open_close(int argc, const char **argv)
80{
81 int fd;
82
83 if (argc != 1) {
84 fprintf(stderr, "hpet_open_close: device-name\n");
85 return;
86 }
87
88 fd = open(argv[0], O_RDONLY);
89 if (fd < 0)
90 fprintf(stderr, "hpet_open_close: open failed\n");
91 else
92 close(fd);
93
94 return;
95}
96
97void
98hpet_info(int argc, const char **argv)
99{
100}
101
102void
103hpet_poll(int argc, const char **argv)
104{
105 unsigned long freq;
106 int iterations, i, fd;
107 struct pollfd pfd;
108 struct hpet_info info;
109 struct timeval stv, etv;
110 struct timezone tz;
111 long usec;
112
113 if (argc != 3) {
114 fprintf(stderr, "hpet_poll: device-name freq iterations\n");
115 return;
116 }
117
118 freq = atoi(argv[1]);
119 iterations = atoi(argv[2]);
120
121 fd = open(argv[0], O_RDONLY);
122
123 if (fd < 0) {
124 fprintf(stderr, "hpet_poll: open of %s failed\n", argv[0]);
125 return;
126 }
127
128 if (ioctl(fd, HPET_IRQFREQ, freq) < 0) {
129 fprintf(stderr, "hpet_poll: HPET_IRQFREQ failed\n");
130 goto out;
131 }
132
133 if (ioctl(fd, HPET_INFO, &info) < 0) {
134 fprintf(stderr, "hpet_poll: failed to get info\n");
135 goto out;
136 }
137
138 fprintf(stderr, "hpet_poll: info.hi_flags 0x%lx\n", info.hi_flags);
139
140 if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) {
141 fprintf(stderr, "hpet_poll: HPET_EPI failed\n");
142 goto out;
143 }
144
145 if (ioctl(fd, HPET_IE_ON, 0) < 0) {
146 fprintf(stderr, "hpet_poll, HPET_IE_ON failed\n");
147 goto out;
148 }
149
150 pfd.fd = fd;
151 pfd.events = POLLIN;
152
153 for (i = 0; i < iterations; i++) {
154 pfd.revents = 0;
155 gettimeofday(&stv, &tz);
156 if (poll(&pfd, 1, -1) < 0)
157 fprintf(stderr, "hpet_poll: poll failed\n");
158 else {
159 long data;
160
161 gettimeofday(&etv, &tz);
162 usec = stv.tv_sec * 1000000 + stv.tv_usec;
163 usec = (etv.tv_sec * 1000000 + etv.tv_usec) - usec;
164
165 fprintf(stderr,
166 "hpet_poll: expired time = 0x%lx\n", usec);
167
168 fprintf(stderr, "hpet_poll: revents = 0x%x\n",
169 pfd.revents);
170
171 if (read(fd, &data, sizeof(data)) != sizeof(data)) {
172 fprintf(stderr, "hpet_poll: read failed\n");
173 }
174 else
175 fprintf(stderr, "hpet_poll: data 0x%lx\n",
176 data);
177 }
178 }
179
180out:
181 close(fd);
182 return;
183}
184
185static int hpet_sigio_count;
186
187static void
188hpet_sigio(int val)
189{
190 fprintf(stderr, "hpet_sigio: called\n");
191 hpet_sigio_count++;
192}
193
194void
195hpet_fasync(int argc, const char **argv)
196{
197 unsigned long freq;
198 int iterations, i, fd, value;
199 sig_t oldsig;
200 struct hpet_info info;
201
202 hpet_sigio_count = 0;
203 fd = -1;
204
205 if ((oldsig = signal(SIGIO, hpet_sigio)) == SIG_ERR) {
206 fprintf(stderr, "hpet_fasync: failed to set signal handler\n");
207 return;
208 }
209
210 if (argc != 3) {
211 fprintf(stderr, "hpet_fasync: device-name freq iterations\n");
212 goto out;
213 }
214
215 fd = open(argv[0], O_RDONLY);
216
217 if (fd < 0) {
218 fprintf(stderr, "hpet_fasync: failed to open %s\n", argv[0]);
219 return;
220 }
221
222
223 if ((fcntl(fd, F_SETOWN, getpid()) == 1) ||
224 ((value = fcntl(fd, F_GETFL)) == 1) ||
225 (fcntl(fd, F_SETFL, value | O_ASYNC) == 1)) {
226 fprintf(stderr, "hpet_fasync: fcntl failed\n");
227 goto out;
228 }
229
230 freq = atoi(argv[1]);
231 iterations = atoi(argv[2]);
232
233 if (ioctl(fd, HPET_IRQFREQ, freq) < 0) {
234 fprintf(stderr, "hpet_fasync: HPET_IRQFREQ failed\n");
235 goto out;
236 }
237
238 if (ioctl(fd, HPET_INFO, &info) < 0) {
239 fprintf(stderr, "hpet_fasync: failed to get info\n");
240 goto out;
241 }
242
243 fprintf(stderr, "hpet_fasync: info.hi_flags 0x%lx\n", info.hi_flags);
244
245 if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) {
246 fprintf(stderr, "hpet_fasync: HPET_EPI failed\n");
247 goto out;
248 }
249
250 if (ioctl(fd, HPET_IE_ON, 0) < 0) {
251 fprintf(stderr, "hpet_fasync, HPET_IE_ON failed\n");
252 goto out;
253 }
254
255 for (i = 0; i < iterations; i++) {
256 (void) pause();
257 fprintf(stderr, "hpet_fasync: count = %d\n", hpet_sigio_count);
258 }
259
260out:
261 signal(SIGIO, oldsig);
262
263 if (fd >= 0)
264 close(fd);
265
266 return;
267}
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index 02ac6ed38b2d..09bd8e902989 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -90,7 +90,8 @@ In order to facilitate early boot debugging, use boot option:
90 90
91 trace_event=[event-list] 91 trace_event=[event-list]
92 92
93The format of this boot option is the same as described in section 2.1. 93event-list is a comma separated list of events. See section 2.1 for event
94format.
94 95
953. Defining an event-enabled tracepoint 963. Defining an event-enabled tracepoint
96======================================= 97=======================================
@@ -238,7 +239,7 @@ subsystem's filter file.
238 239
239For convenience, filters for every event in a subsystem can be set or 240For convenience, filters for every event in a subsystem can be set or
240cleared as a group by writing a filter expression into the filter file 241cleared as a group by writing a filter expression into the filter file
241at the root of the subsytem. Note however, that if a filter for any 242at the root of the subsystem. Note however, that if a filter for any
242event within the subsystem lacks a field specified in the subsystem 243event within the subsystem lacks a field specified in the subsystem
243filter, or if the filter can't be applied for any other reason, the 244filter, or if the filter can't be applied for any other reason, the
244filter for that event will retain its previous setting. This can 245filter for that event will retain its previous setting. This can
@@ -250,7 +251,7 @@ fields can be guaranteed to propagate successfully to all events.
250Here are a few subsystem filter examples that also illustrate the 251Here are a few subsystem filter examples that also illustrate the
251above points: 252above points:
252 253
253Clear the filters on all events in the sched subsytem: 254Clear the filters on all events in the sched subsystem:
254 255
255# cd /sys/kernel/debug/tracing/events/sched 256# cd /sys/kernel/debug/tracing/events/sched
256# echo 0 > filter 257# echo 0 > filter
@@ -260,7 +261,7 @@ none
260none 261none
261 262
262Set a filter using only common fields for all events in the sched 263Set a filter using only common fields for all events in the sched
263subsytem (all events end up with the same filter): 264subsystem (all events end up with the same filter):
264 265
265# cd /sys/kernel/debug/tracing/events/sched 266# cd /sys/kernel/debug/tracing/events/sched
266# echo common_pid == 0 > filter 267# echo common_pid == 0 > filter
@@ -270,7 +271,7 @@ common_pid == 0
270common_pid == 0 271common_pid == 0
271 272
272Attempt to set a filter using a non-common field for all events in the 273Attempt to set a filter using a non-common field for all events in the
273sched subsytem (all events but those that have a prev_pid field retain 274sched subsystem (all events but those that have a prev_pid field retain
274their old filters): 275their old filters):
275 276
276# cd /sys/kernel/debug/tracing/events/sched 277# cd /sys/kernel/debug/tracing/events/sched
diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt
index 6a5a579126b0..f1f81afee8a0 100644
--- a/Documentation/trace/ftrace-design.txt
+++ b/Documentation/trace/ftrace-design.txt
@@ -238,11 +238,10 @@ HAVE_SYSCALL_TRACEPOINTS
238 238
239You need very few things to get the syscalls tracing in an arch. 239You need very few things to get the syscalls tracing in an arch.
240 240
241- Support HAVE_ARCH_TRACEHOOK (see arch/Kconfig).
241- Have a NR_syscalls variable in <asm/unistd.h> that provides the number 242- Have a NR_syscalls variable in <asm/unistd.h> that provides the number
242 of syscalls supported by the arch. 243 of syscalls supported by the arch.
243- Implement arch_syscall_addr() that resolves a syscall address from a 244- Support the TIF_SYSCALL_TRACEPOINT thread flags.
244 syscall number.
245- Support the TIF_SYSCALL_TRACEPOINT thread flags
246- Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace 245- Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace
247 in the ptrace syscalls tracing path. 246 in the ptrace syscalls tracing path.
248- Tag this arch as HAVE_SYSCALL_TRACEPOINTS. 247- Tag this arch as HAVE_SYSCALL_TRACEPOINTS.
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index bab3040da548..557c1edeccaf 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -155,6 +155,9 @@ of ftrace. Here is a list of some of the key files:
155 to be traced. Echoing names of functions into this file 155 to be traced. Echoing names of functions into this file
156 will limit the trace to only those functions. 156 will limit the trace to only those functions.
157 157
158 This interface also allows for commands to be used. See the
159 "Filter commands" section for more details.
160
158 set_ftrace_notrace: 161 set_ftrace_notrace:
159 162
160 This has an effect opposite to that of 163 This has an effect opposite to that of
@@ -1337,12 +1340,14 @@ ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one
1337can either use the sysctl function or set it via the proc system 1340can either use the sysctl function or set it via the proc system
1338interface. 1341interface.
1339 1342
1340 sysctl kernel.ftrace_dump_on_oops=1 1343 sysctl kernel.ftrace_dump_on_oops=n
1341 1344
1342or 1345or
1343 1346
1344 echo 1 > /proc/sys/kernel/ftrace_dump_on_oops 1347 echo n > /proc/sys/kernel/ftrace_dump_on_oops
1345 1348
1349If n = 1, ftrace will dump buffers of all CPUs, if n = 2 ftrace will
1350only dump the buffer of the CPU that triggered the oops.
1346 1351
1347Here's an example of such a dump after a null pointer 1352Here's an example of such a dump after a null pointer
1348dereference in a kernel module: 1353dereference in a kernel module:
@@ -1588,7 +1593,7 @@ module author does not need to worry about it.
1588 1593
1589When tracing is enabled, kstop_machine is called to prevent 1594When tracing is enabled, kstop_machine is called to prevent
1590races with the CPUS executing code being modified (which can 1595races with the CPUS executing code being modified (which can
1591cause the CPU to do undesireable things), and the nops are 1596cause the CPU to do undesirable things), and the nops are
1592patched back to calls. But this time, they do not call mcount 1597patched back to calls. But this time, they do not call mcount
1593(which is just a function stub). They now call into the ftrace 1598(which is just a function stub). They now call into the ftrace
1594infrastructure. 1599infrastructure.
@@ -1822,6 +1827,47 @@ this special filter via:
1822 echo > set_graph_function 1827 echo > set_graph_function
1823 1828
1824 1829
1830Filter commands
1831---------------
1832
1833A few commands are supported by the set_ftrace_filter interface.
1834Trace commands have the following format:
1835
1836<function>:<command>:<parameter>
1837
1838The following commands are supported:
1839
1840- mod
1841 This command enables function filtering per module. The
1842 parameter defines the module. For example, if only the write*
1843 functions in the ext3 module are desired, run:
1844
1845 echo 'write*:mod:ext3' > set_ftrace_filter
1846
1847 This command interacts with the filter in the same way as
1848 filtering based on function names. Thus, adding more functions
1849 in a different module is accomplished by appending (>>) to the
1850 filter file. Remove specific module functions by prepending
1851 '!':
1852
1853 echo '!writeback*:mod:ext3' >> set_ftrace_filter
1854
1855- traceon/traceoff
1856 These commands turn tracing on and off when the specified
1857 functions are hit. The parameter determines how many times the
1858 tracing system is turned on and off. If unspecified, there is
1859 no limit. For example, to disable tracing when a schedule bug
1860 is hit the first 5 times, run:
1861
1862 echo '__schedule_bug:traceoff:5' > set_ftrace_filter
1863
1864 These commands are cumulative whether or not they are appended
1865 to set_ftrace_filter. To remove a command, prepend it by '!'
1866 and drop the parameter:
1867
1868 echo '!__schedule_bug:traceoff' > set_ftrace_filter
1869
1870
1825trace_pipe 1871trace_pipe
1826---------- 1872----------
1827 1873
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index 47aabeebbdf6..ec94748ae65b 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -24,6 +24,7 @@ Synopsis of kprobe_events
24------------------------- 24-------------------------
25 p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS] : Set a probe 25 p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS] : Set a probe
26 r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe 26 r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe
27 -:[GRP/]EVENT : Clear a probe
27 28
28 GRP : Group name. If omitted, use "kprobes" for it. 29 GRP : Group name. If omitted, use "kprobes" for it.
29 EVENT : Event name. If omitted, the event name is generated 30 EVENT : Event name. If omitted, the event name is generated
@@ -37,15 +38,14 @@ Synopsis of kprobe_events
37 @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol) 38 @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)
38 $stackN : Fetch Nth entry of stack (N >= 0) 39 $stackN : Fetch Nth entry of stack (N >= 0)
39 $stack : Fetch stack address. 40 $stack : Fetch stack address.
40 $argN : Fetch function argument. (N >= 0)(*) 41 $retval : Fetch return value.(*)
41 $retval : Fetch return value.(**) 42 +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
42 +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***) 43 NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
43 NAME=FETCHARG: Set NAME as the argument name of FETCHARG. 44 FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
45 (u8/u16/u32/u64/s8/s16/s32/s64) are supported.
44 46
45 (*) aN may not correct on asmlinkaged functions and at the middle of 47 (*) only for return probe.
46 function body. 48 (**) this is useful for fetching a field of data structures.
47 (**) only for return probe.
48 (***) this is useful for fetching a field of data structures.
49 49
50 50
51Per-Probe Event Filtering 51Per-Probe Event Filtering
@@ -82,13 +82,16 @@ Usage examples
82To add a probe as a new event, write a new definition to kprobe_events 82To add a probe as a new event, write a new definition to kprobe_events
83as below. 83as below.
84 84
85 echo p:myprobe do_sys_open dfd=$arg0 filename=$arg1 flags=$arg2 mode=$arg3 > /sys/kernel/debug/tracing/kprobe_events 85 echo 'p:myprobe do_sys_open dfd=%ax filename=%dx flags=%cx mode=+4($stack)' > /sys/kernel/debug/tracing/kprobe_events
86 86
87 This sets a kprobe on the top of do_sys_open() function with recording 87 This sets a kprobe on the top of do_sys_open() function with recording
881st to 4th arguments as "myprobe" event. As this example shows, users can 881st to 4th arguments as "myprobe" event. Note, which register/stack entry is
89choose more familiar names for each arguments. 89assigned to each function argument depends on arch-specific ABI. If you unsure
90the ABI, please try to use probe subcommand of perf-tools (you can find it
91under tools/perf/).
92As this example shows, users can choose more familiar names for each arguments.
90 93
91 echo r:myretprobe do_sys_open $retval >> /sys/kernel/debug/tracing/kprobe_events 94 echo 'r:myretprobe do_sys_open $retval' >> /sys/kernel/debug/tracing/kprobe_events
92 95
93 This sets a kretprobe on the return point of do_sys_open() function with 96 This sets a kretprobe on the return point of do_sys_open() function with
94recording return value as "myretprobe" event. 97recording return value as "myretprobe" event.
@@ -97,23 +100,24 @@ recording return value as "myretprobe" event.
97 100
98 cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format 101 cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format
99name: myprobe 102name: myprobe
100ID: 75 103ID: 780
101format: 104format:
102 field:unsigned short common_type; offset:0; size:2; 105 field:unsigned short common_type; offset:0; size:2; signed:0;
103 field:unsigned char common_flags; offset:2; size:1; 106 field:unsigned char common_flags; offset:2; size:1; signed:0;
104 field:unsigned char common_preempt_count; offset:3; size:1; 107 field:unsigned char common_preempt_count; offset:3; size:1;signed:0;
105 field:int common_pid; offset:4; size:4; 108 field:int common_pid; offset:4; size:4; signed:1;
106 field:int common_tgid; offset:8; size:4; 109 field:int common_lock_depth; offset:8; size:4; signed:1;
107 110
108 field: unsigned long ip; offset:16;tsize:8; 111 field:unsigned long __probe_ip; offset:12; size:4; signed:0;
109 field: int nargs; offset:24;tsize:4; 112 field:int __probe_nargs; offset:16; size:4; signed:1;
110 field: unsigned long dfd; offset:32;tsize:8; 113 field:unsigned long dfd; offset:20; size:4; signed:0;
111 field: unsigned long filename; offset:40;tsize:8; 114 field:unsigned long filename; offset:24; size:4; signed:0;
112 field: unsigned long flags; offset:48;tsize:8; 115 field:unsigned long flags; offset:28; size:4; signed:0;
113 field: unsigned long mode; offset:56;tsize:8; 116 field:unsigned long mode; offset:32; size:4; signed:0;
114 117
115print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, REC->filename, REC->flags, REC->mode
116 118
119print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->__probe_ip,
120REC->dfd, REC->filename, REC->flags, REC->mode
117 121
118 You can see that the event has 4 arguments as in the expressions you specified. 122 You can see that the event has 4 arguments as in the expressions you specified.
119 123
@@ -121,6 +125,12 @@ print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, R
121 125
122 This clears all probe points. 126 This clears all probe points.
123 127
128 Or,
129
130 echo -:myprobe >> kprobe_events
131
132 This clears probe points selectively.
133
124 Right after definition, each event is disabled by default. For tracing these 134 Right after definition, each event is disabled by default. For tracing these
125events, you need to enable it. 135events, you need to enable it.
126 136
@@ -146,4 +156,3 @@ events, you need to enable it.
146returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel 156returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel
147returns from do_sys_open to sys_open+0x1b). 157returns from do_sys_open to sys_open+0x1b).
148 158
149
diff --git a/Documentation/usb/WUSB-Design-overview.txt b/Documentation/usb/WUSB-Design-overview.txt
index c480e9c32dbd..4c5e37939344 100644
--- a/Documentation/usb/WUSB-Design-overview.txt
+++ b/Documentation/usb/WUSB-Design-overview.txt
@@ -381,7 +381,7 @@ descriptor that gives us the status of the transfer, its identification
381we issue another URB to read into the destination buffer the chunk of 381we issue another URB to read into the destination buffer the chunk of
382data coming out of the remote endpoint. Done, wait for the next guy. The 382data coming out of the remote endpoint. Done, wait for the next guy. The
383callbacks for the URBs issued from here are the ones that will declare 383callbacks for the URBs issued from here are the ones that will declare
384the xfer complete at some point and call it's callback. 384the xfer complete at some point and call its callback.
385 385
386Seems simple, but the implementation is not trivial. 386Seems simple, but the implementation is not trivial.
387 387
diff --git a/Documentation/usb/bulk-streams.txt b/Documentation/usb/bulk-streams.txt
new file mode 100644
index 000000000000..ffc02021863e
--- /dev/null
+++ b/Documentation/usb/bulk-streams.txt
@@ -0,0 +1,78 @@
1Background
2==========
3
4Bulk endpoint streams were added in the USB 3.0 specification. Streams allow a
5device driver to overload a bulk endpoint so that multiple transfers can be
6queued at once.
7
8Streams are defined in sections 4.4.6.4 and 8.12.1.4 of the Universal Serial Bus
93.0 specification at http://www.usb.org/developers/docs/ The USB Attached SCSI
10Protocol, which uses streams to queue multiple SCSI commands, can be found on
11the T10 website (http://t10.org/).
12
13
14Device-side implications
15========================
16
17Once a buffer has been queued to a stream ring, the device is notified (through
18an out-of-band mechanism on another endpoint) that data is ready for that stream
19ID. The device then tells the host which "stream" it wants to start. The host
20can also initiate a transfer on a stream without the device asking, but the
21device can refuse that transfer. Devices can switch between streams at any
22time.
23
24
25Driver implications
26===================
27
28int usb_alloc_streams(struct usb_interface *interface,
29 struct usb_host_endpoint **eps, unsigned int num_eps,
30 unsigned int num_streams, gfp_t mem_flags);
31
32Device drivers will call this API to request that the host controller driver
33allocate memory so the driver can use up to num_streams stream IDs. They must
34pass an array of usb_host_endpoints that need to be setup with similar stream
35IDs. This is to ensure that a UASP driver will be able to use the same stream
36ID for the bulk IN and OUT endpoints used in a Bi-directional command sequence.
37
38The return value is an error condition (if one of the endpoints doesn't support
39streams, or the xHCI driver ran out of memory), or the number of streams the
40host controller allocated for this endpoint. The xHCI host controller hardware
41declares how many stream IDs it can support, and each bulk endpoint on a
42SuperSpeed device will say how many stream IDs it can handle. Therefore,
43drivers should be able to deal with being allocated less stream IDs than they
44requested.
45
46Do NOT call this function if you have URBs enqueued for any of the endpoints
47passed in as arguments. Do not call this function to request less than two
48streams.
49
50Drivers will only be allowed to call this API once for the same endpoint
51without calling usb_free_streams(). This is a simplification for the xHCI host
52controller driver, and may change in the future.
53
54
55Picking new Stream IDs to use
56============================
57
58Stream ID 0 is reserved, and should not be used to communicate with devices. If
59usb_alloc_streams() returns with a value of N, you may use streams 1 though N.
60To queue an URB for a specific stream, set the urb->stream_id value. If the
61endpoint does not support streams, an error will be returned.
62
63Note that new API to choose the next stream ID will have to be added if the xHCI
64driver supports secondary stream IDs.
65
66
67Clean up
68========
69
70If a driver wishes to stop using streams to communicate with the device, it
71should call
72
73void usb_free_streams(struct usb_interface *interface,
74 struct usb_host_endpoint **eps, unsigned int num_eps,
75 gfp_t mem_flags);
76
77All stream IDs will be deallocated when the driver releases the interface, to
78ensure that drivers that don't support streams will be able to use the endpoint.
diff --git a/Documentation/usb/dma.txt b/Documentation/usb/dma.txt
index cfdcd16e3abf..84ef865237db 100644
--- a/Documentation/usb/dma.txt
+++ b/Documentation/usb/dma.txt
@@ -16,11 +16,11 @@ OR: they can now be DMA-aware.
16 manage dma mappings for existing dma-ready buffers (see below). 16 manage dma mappings for existing dma-ready buffers (see below).
17 17
18- URBs have an additional "transfer_dma" field, as well as a transfer_flags 18- URBs have an additional "transfer_dma" field, as well as a transfer_flags
19 bit saying if it's valid. (Control requests also have "setup_dma" and a 19 bit saying if it's valid. (Control requests also have "setup_dma", but
20 corresponding transfer_flags bit.) 20 drivers must not use it.)
21 21
22- "usbcore" will map those DMA addresses, if a DMA-aware driver didn't do 22- "usbcore" will map this DMA address, if a DMA-aware driver didn't do
23 it first and set URB_NO_TRANSFER_DMA_MAP or URB_NO_SETUP_DMA_MAP. HCDs 23 it first and set URB_NO_TRANSFER_DMA_MAP. HCDs
24 don't manage dma mappings for URBs. 24 don't manage dma mappings for URBs.
25 25
26- There's a new "generic DMA API", parts of which are usable by USB device 26- There's a new "generic DMA API", parts of which are usable by USB device
@@ -43,22 +43,16 @@ and effects like cache-trashing can impose subtle penalties.
43 kind of addresses to store in urb->transfer_buffer and urb->transfer_dma. 43 kind of addresses to store in urb->transfer_buffer and urb->transfer_dma.
44 You'd also set URB_NO_TRANSFER_DMA_MAP in urb->transfer_flags: 44 You'd also set URB_NO_TRANSFER_DMA_MAP in urb->transfer_flags:
45 45
46 void *usb_buffer_alloc (struct usb_device *dev, size_t size, 46 void *usb_alloc_coherent (struct usb_device *dev, size_t size,
47 int mem_flags, dma_addr_t *dma); 47 int mem_flags, dma_addr_t *dma);
48 48
49 void usb_buffer_free (struct usb_device *dev, size_t size, 49 void usb_free_coherent (struct usb_device *dev, size_t size,
50 void *addr, dma_addr_t dma); 50 void *addr, dma_addr_t dma);
51 51
52 Most drivers should *NOT* be using these primitives; they don't need 52 Most drivers should *NOT* be using these primitives; they don't need
53 to use this type of memory ("dma-coherent"), and memory returned from 53 to use this type of memory ("dma-coherent"), and memory returned from
54 kmalloc() will work just fine. 54 kmalloc() will work just fine.
55 55
56 For control transfers you can use the buffer primitives or not for each
57 of the transfer buffer and setup buffer independently. Set the flag bits
58 URB_NO_TRANSFER_DMA_MAP and URB_NO_SETUP_DMA_MAP to indicate which
59 buffers you have prepared. For non-control transfers URB_NO_SETUP_DMA_MAP
60 is ignored.
61
62 The memory buffer returned is "dma-coherent"; sometimes you might need to 56 The memory buffer returned is "dma-coherent"; sometimes you might need to
63 force a consistent memory access ordering by using memory barriers. It's 57 force a consistent memory access ordering by using memory barriers. It's
64 not using a streaming DMA mapping, so it's good for small transfers on 58 not using a streaming DMA mapping, so it's good for small transfers on
@@ -130,8 +124,8 @@ of Documentation/PCI/PCI-DMA-mapping.txt, titled "What memory is DMA-able?")
130 void usb_buffer_unmap (struct urb *urb); 124 void usb_buffer_unmap (struct urb *urb);
131 125
132 The calls manage urb->transfer_dma for you, and set URB_NO_TRANSFER_DMA_MAP 126 The calls manage urb->transfer_dma for you, and set URB_NO_TRANSFER_DMA_MAP
133 so that usbcore won't map or unmap the buffer. The same goes for 127 so that usbcore won't map or unmap the buffer. They cannot be used for
134 urb->setup_dma and URB_NO_SETUP_DMA_MAP for control requests. 128 setup_packet buffers in control requests.
135 129
136Note that several of those interfaces are currently commented out, since 130Note that several of those interfaces are currently commented out, since
137they don't have current users. See the source code. Other than the dmasync 131they don't have current users. See the source code. Other than the dmasync
diff --git a/Documentation/usb/error-codes.txt b/Documentation/usb/error-codes.txt
index 9cf83e8c27b8..d83703ea74b2 100644
--- a/Documentation/usb/error-codes.txt
+++ b/Documentation/usb/error-codes.txt
@@ -41,8 +41,8 @@ USB-specific:
41 41
42-EFBIG Host controller driver can't schedule that many ISO frames. 42-EFBIG Host controller driver can't schedule that many ISO frames.
43 43
44-EPIPE Specified endpoint is stalled. For non-control endpoints, 44-EPIPE The pipe type specified in the URB doesn't match the
45 reset this status with usb_clear_halt(). 45 endpoint's actual type.
46 46
47-EMSGSIZE (a) endpoint maxpacket size is zero; it is not usable 47-EMSGSIZE (a) endpoint maxpacket size is zero; it is not usable
48 in the current interface altsetting. 48 in the current interface altsetting.
@@ -60,6 +60,8 @@ USB-specific:
60 60
61-EHOSTUNREACH URB was rejected because the device is suspended. 61-EHOSTUNREACH URB was rejected because the device is suspended.
62 62
63-ENOEXEC A control URB doesn't contain a Setup packet.
64
63 65
64************************************************************************** 66**************************************************************************
65* Error codes returned by in urb->status * 67* Error codes returned by in urb->status *
diff --git a/Documentation/usb/gadget_hid.txt b/Documentation/usb/gadget_hid.txt
new file mode 100644
index 000000000000..f4a51f567427
--- /dev/null
+++ b/Documentation/usb/gadget_hid.txt
@@ -0,0 +1,445 @@
1
2 Linux USB HID gadget driver
3
4Introduction
5
6 The HID Gadget driver provides emulation of USB Human Interface
7 Devices (HID). The basic HID handling is done in the kernel,
8 and HID reports can be sent/received through I/O on the
9 /dev/hidgX character devices.
10
11 For more details about HID, see the developer page on
12 http://www.usb.org/developers/hidpage/
13
14Configuration
15
16 g_hid is a platform driver, so to use it you need to add
17 struct platform_device(s) to your platform code defining the
18 HID function descriptors you want to use - E.G. something
19 like:
20
21#include <linux/platform_device.h>
22#include <linux/usb/g_hid.h>
23
24/* hid descriptor for a keyboard */
25static struct hidg_func_descriptor my_hid_data = {
26 .subclass = 0, /* No subclass */
27 .protocol = 1, /* Keyboard */
28 .report_length = 8,
29 .report_desc_length = 63,
30 .report_desc = {
31 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */
32 0x09, 0x06, /* USAGE (Keyboard) */
33 0xa1, 0x01, /* COLLECTION (Application) */
34 0x05, 0x07, /* USAGE_PAGE (Keyboard) */
35 0x19, 0xe0, /* USAGE_MINIMUM (Keyboard LeftControl) */
36 0x29, 0xe7, /* USAGE_MAXIMUM (Keyboard Right GUI) */
37 0x15, 0x00, /* LOGICAL_MINIMUM (0) */
38 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */
39 0x75, 0x01, /* REPORT_SIZE (1) */
40 0x95, 0x08, /* REPORT_COUNT (8) */
41 0x81, 0x02, /* INPUT (Data,Var,Abs) */
42 0x95, 0x01, /* REPORT_COUNT (1) */
43 0x75, 0x08, /* REPORT_SIZE (8) */
44 0x81, 0x03, /* INPUT (Cnst,Var,Abs) */
45 0x95, 0x05, /* REPORT_COUNT (5) */
46 0x75, 0x01, /* REPORT_SIZE (1) */
47 0x05, 0x08, /* USAGE_PAGE (LEDs) */
48 0x19, 0x01, /* USAGE_MINIMUM (Num Lock) */
49 0x29, 0x05, /* USAGE_MAXIMUM (Kana) */
50 0x91, 0x02, /* OUTPUT (Data,Var,Abs) */
51 0x95, 0x01, /* REPORT_COUNT (1) */
52 0x75, 0x03, /* REPORT_SIZE (3) */
53 0x91, 0x03, /* OUTPUT (Cnst,Var,Abs) */
54 0x95, 0x06, /* REPORT_COUNT (6) */
55 0x75, 0x08, /* REPORT_SIZE (8) */
56 0x15, 0x00, /* LOGICAL_MINIMUM (0) */
57 0x25, 0x65, /* LOGICAL_MAXIMUM (101) */
58 0x05, 0x07, /* USAGE_PAGE (Keyboard) */
59 0x19, 0x00, /* USAGE_MINIMUM (Reserved) */
60 0x29, 0x65, /* USAGE_MAXIMUM (Keyboard Application) */
61 0x81, 0x00, /* INPUT (Data,Ary,Abs) */
62 0xc0 /* END_COLLECTION */
63 }
64};
65
66static struct platform_device my_hid = {
67 .name = "hidg",
68 .id = 0,
69 .num_resources = 0,
70 .resource = 0,
71 .dev.platform_data = &my_hid_data,
72};
73
74 You can add as many HID functions as you want, only limited by
75 the amount of interrupt endpoints your gadget driver supports.
76
77Send and receive HID reports
78
79 HID reports can be sent/received using read/write on the
80 /dev/hidgX character devices. See below for an example program
81 to do this.
82
83 hid_gadget_test is a small interactive program to test the HID
84 gadget driver. To use, point it at a hidg device and set the
85 device type (keyboard / mouse / joystick) - E.G.:
86
87 # hid_gadget_test /dev/hidg0 keyboard
88
89 You are now in the prompt of hid_gadget_test. You can type any
90 combination of options and values. Available options and
91 values are listed at program start. In keyboard mode you can
92 send up to six values.
93
94 For example type: g i s t r --left-shift
95
96 Hit return and the corresponding report will be sent by the
97 HID gadget.
98
99 Another interesting example is the caps lock test. Type
100 -–caps-lock and hit return. A report is then sent by the
101 gadget and you should receive the host answer, corresponding
102 to the caps lock LED status.
103
104 --caps-lock
105 recv report:2
106
107 With this command:
108
109 # hid_gadget_test /dev/hidg1 mouse
110
111 You can test the mouse emulation. Values are two signed numbers.
112
113
114Sample code
115
116/* hid_gadget_test */
117
118#include <pthread.h>
119#include <string.h>
120#include <stdio.h>
121#include <ctype.h>
122#include <fcntl.h>
123#include <errno.h>
124#include <stdio.h>
125#include <stdlib.h>
126#include <unistd.h>
127
128#define BUF_LEN 512
129
130struct options {
131 const char *opt;
132 unsigned char val;
133};
134
135static struct options kmod[] = {
136 {.opt = "--left-ctrl", .val = 0x01},
137 {.opt = "--right-ctrl", .val = 0x10},
138 {.opt = "--left-shift", .val = 0x02},
139 {.opt = "--right-shift", .val = 0x20},
140 {.opt = "--left-alt", .val = 0x04},
141 {.opt = "--right-alt", .val = 0x40},
142 {.opt = "--left-meta", .val = 0x08},
143 {.opt = "--right-meta", .val = 0x80},
144 {.opt = NULL}
145};
146
147static struct options kval[] = {
148 {.opt = "--return", .val = 0x28},
149 {.opt = "--esc", .val = 0x29},
150 {.opt = "--bckspc", .val = 0x2a},
151 {.opt = "--tab", .val = 0x2b},
152 {.opt = "--spacebar", .val = 0x2c},
153 {.opt = "--caps-lock", .val = 0x39},
154 {.opt = "--f1", .val = 0x3a},
155 {.opt = "--f2", .val = 0x3b},
156 {.opt = "--f3", .val = 0x3c},
157 {.opt = "--f4", .val = 0x3d},
158 {.opt = "--f5", .val = 0x3e},
159 {.opt = "--f6", .val = 0x3f},
160 {.opt = "--f7", .val = 0x40},
161 {.opt = "--f8", .val = 0x41},
162 {.opt = "--f9", .val = 0x42},
163 {.opt = "--f10", .val = 0x43},
164 {.opt = "--f11", .val = 0x44},
165 {.opt = "--f12", .val = 0x45},
166 {.opt = "--insert", .val = 0x49},
167 {.opt = "--home", .val = 0x4a},
168 {.opt = "--pageup", .val = 0x4b},
169 {.opt = "--del", .val = 0x4c},
170 {.opt = "--end", .val = 0x4d},
171 {.opt = "--pagedown", .val = 0x4e},
172 {.opt = "--right", .val = 0x4f},
173 {.opt = "--left", .val = 0x50},
174 {.opt = "--down", .val = 0x51},
175 {.opt = "--kp-enter", .val = 0x58},
176 {.opt = "--up", .val = 0x52},
177 {.opt = "--num-lock", .val = 0x53},
178 {.opt = NULL}
179};
180
181int keyboard_fill_report(char report[8], char buf[BUF_LEN], int *hold)
182{
183 char *tok = strtok(buf, " ");
184 int key = 0;
185 int i = 0;
186
187 for (; tok != NULL; tok = strtok(NULL, " ")) {
188
189 if (strcmp(tok, "--quit") == 0)
190 return -1;
191
192 if (strcmp(tok, "--hold") == 0) {
193 *hold = 1;
194 continue;
195 }
196
197 if (key < 6) {
198 for (i = 0; kval[i].opt != NULL; i++)
199 if (strcmp(tok, kval[i].opt) == 0) {
200 report[2 + key++] = kval[i].val;
201 break;
202 }
203 if (kval[i].opt != NULL)
204 continue;
205 }
206
207 if (key < 6)
208 if (islower(tok[0])) {
209 report[2 + key++] = (tok[0] - ('a' - 0x04));
210 continue;
211 }
212
213 for (i = 0; kmod[i].opt != NULL; i++)
214 if (strcmp(tok, kmod[i].opt) == 0) {
215 report[0] = report[0] | kmod[i].val;
216 break;
217 }
218 if (kmod[i].opt != NULL)
219 continue;
220
221 if (key < 6)
222 fprintf(stderr, "unknown option: %s\n", tok);
223 }
224 return 8;
225}
226
227static struct options mmod[] = {
228 {.opt = "--b1", .val = 0x01},
229 {.opt = "--b2", .val = 0x02},
230 {.opt = "--b3", .val = 0x04},
231 {.opt = NULL}
232};
233
234int mouse_fill_report(char report[8], char buf[BUF_LEN], int *hold)
235{
236 char *tok = strtok(buf, " ");
237 int mvt = 0;
238 int i = 0;
239 for (; tok != NULL; tok = strtok(NULL, " ")) {
240
241 if (strcmp(tok, "--quit") == 0)
242 return -1;
243
244 if (strcmp(tok, "--hold") == 0) {
245 *hold = 1;
246 continue;
247 }
248
249 for (i = 0; mmod[i].opt != NULL; i++)
250 if (strcmp(tok, mmod[i].opt) == 0) {
251 report[0] = report[0] | mmod[i].val;
252 break;
253 }
254 if (mmod[i].opt != NULL)
255 continue;
256
257 if (!(tok[0] == '-' && tok[1] == '-') && mvt < 2) {
258 errno = 0;
259 report[1 + mvt++] = (char)strtol(tok, NULL, 0);
260 if (errno != 0) {
261 fprintf(stderr, "Bad value:'%s'\n", tok);
262 report[1 + mvt--] = 0;
263 }
264 continue;
265 }
266
267 fprintf(stderr, "unknown option: %s\n", tok);
268 }
269 return 3;
270}
271
272static struct options jmod[] = {
273 {.opt = "--b1", .val = 0x10},
274 {.opt = "--b2", .val = 0x20},
275 {.opt = "--b3", .val = 0x40},
276 {.opt = "--b4", .val = 0x80},
277 {.opt = "--hat1", .val = 0x00},
278 {.opt = "--hat2", .val = 0x01},
279 {.opt = "--hat3", .val = 0x02},
280 {.opt = "--hat4", .val = 0x03},
281 {.opt = "--hatneutral", .val = 0x04},
282 {.opt = NULL}
283};
284
285int joystick_fill_report(char report[8], char buf[BUF_LEN], int *hold)
286{
287 char *tok = strtok(buf, " ");
288 int mvt = 0;
289 int i = 0;
290
291 *hold = 1;
292
293 /* set default hat position: neutral */
294 report[3] = 0x04;
295
296 for (; tok != NULL; tok = strtok(NULL, " ")) {
297
298 if (strcmp(tok, "--quit") == 0)
299 return -1;
300
301 for (i = 0; jmod[i].opt != NULL; i++)
302 if (strcmp(tok, jmod[i].opt) == 0) {
303 report[3] = (report[3] & 0xF0) | jmod[i].val;
304 break;
305 }
306 if (jmod[i].opt != NULL)
307 continue;
308
309 if (!(tok[0] == '-' && tok[1] == '-') && mvt < 3) {
310 errno = 0;
311 report[mvt++] = (char)strtol(tok, NULL, 0);
312 if (errno != 0) {
313 fprintf(stderr, "Bad value:'%s'\n", tok);
314 report[mvt--] = 0;
315 }
316 continue;
317 }
318
319 fprintf(stderr, "unknown option: %s\n", tok);
320 }
321 return 4;
322}
323
324void print_options(char c)
325{
326 int i = 0;
327
328 if (c == 'k') {
329 printf(" keyboard options:\n"
330 " --hold\n");
331 for (i = 0; kmod[i].opt != NULL; i++)
332 printf("\t\t%s\n", kmod[i].opt);
333 printf("\n keyboard values:\n"
334 " [a-z] or\n");
335 for (i = 0; kval[i].opt != NULL; i++)
336 printf("\t\t%-8s%s", kval[i].opt, i % 2 ? "\n" : "");
337 printf("\n");
338 } else if (c == 'm') {
339 printf(" mouse options:\n"
340 " --hold\n");
341 for (i = 0; mmod[i].opt != NULL; i++)
342 printf("\t\t%s\n", mmod[i].opt);
343 printf("\n mouse values:\n"
344 " Two signed numbers\n"
345 "--quit to close\n");
346 } else {
347 printf(" joystick options:\n");
348 for (i = 0; jmod[i].opt != NULL; i++)
349 printf("\t\t%s\n", jmod[i].opt);
350 printf("\n joystick values:\n"
351 " three signed numbers\n"
352 "--quit to close\n");
353 }
354}
355
356int main(int argc, const char *argv[])
357{
358 const char *filename = NULL;
359 int fd = 0;
360 char buf[BUF_LEN];
361 int cmd_len;
362 char report[8];
363 int to_send = 8;
364 int hold = 0;
365 fd_set rfds;
366 int retval, i;
367
368 if (argc < 3) {
369 fprintf(stderr, "Usage: %s devname mouse|keyboard|joystick\n",
370 argv[0]);
371 return 1;
372 }
373
374 if (argv[2][0] != 'k' && argv[2][0] != 'm' && argv[2][0] != 'j')
375 return 2;
376
377 filename = argv[1];
378
379 if ((fd = open(filename, O_RDWR, 0666)) == -1) {
380 perror(filename);
381 return 3;
382 }
383
384 print_options(argv[2][0]);
385
386 while (42) {
387
388 FD_ZERO(&rfds);
389 FD_SET(STDIN_FILENO, &rfds);
390 FD_SET(fd, &rfds);
391
392 retval = select(fd + 1, &rfds, NULL, NULL, NULL);
393 if (retval == -1 && errno == EINTR)
394 continue;
395 if (retval < 0) {
396 perror("select()");
397 return 4;
398 }
399
400 if (FD_ISSET(fd, &rfds)) {
401 cmd_len = read(fd, buf, BUF_LEN - 1);
402 printf("recv report:");
403 for (i = 0; i < cmd_len; i++)
404 printf(" %02x", buf[i]);
405 printf("\n");
406 }
407
408 if (FD_ISSET(STDIN_FILENO, &rfds)) {
409 memset(report, 0x0, sizeof(report));
410 cmd_len = read(STDIN_FILENO, buf, BUF_LEN - 1);
411
412 if (cmd_len == 0)
413 break;
414
415 buf[cmd_len - 1] = '\0';
416 hold = 0;
417
418 memset(report, 0x0, sizeof(report));
419 if (argv[2][0] == 'k')
420 to_send = keyboard_fill_report(report, buf, &hold);
421 else if (argv[2][0] == 'm')
422 to_send = mouse_fill_report(report, buf, &hold);
423 else
424 to_send = joystick_fill_report(report, buf, &hold);
425
426 if (to_send == -1)
427 break;
428
429 if (write(fd, report, to_send) != to_send) {
430 perror(filename);
431 return 5;
432 }
433 if (!hold) {
434 memset(report, 0x0, sizeof(report));
435 if (write(fd, report, to_send) != to_send) {
436 perror(filename);
437 return 6;
438 }
439 }
440 }
441 }
442
443 close(fd);
444 return 0;
445}
diff --git a/Documentation/usb/power-management.txt b/Documentation/usb/power-management.txt
index 3bf6818c8cf5..b29d8e56cf28 100644
--- a/Documentation/usb/power-management.txt
+++ b/Documentation/usb/power-management.txt
@@ -2,7 +2,7 @@
2 2
3 Alan Stern <stern@rowland.harvard.edu> 3 Alan Stern <stern@rowland.harvard.edu>
4 4
5 November 10, 2009 5 December 11, 2009
6 6
7 7
8 8
@@ -29,9 +29,9 @@ covered to some extent (see Documentation/power/*.txt for more
29information about system PM). 29information about system PM).
30 30
31Note: Dynamic PM support for USB is present only if the kernel was 31Note: Dynamic PM support for USB is present only if the kernel was
32built with CONFIG_USB_SUSPEND enabled. System PM support is present 32built with CONFIG_USB_SUSPEND enabled (which depends on
33only if the kernel was built with CONFIG_SUSPEND or CONFIG_HIBERNATION 33CONFIG_PM_RUNTIME). System PM support is present only if the kernel
34enabled. 34was built with CONFIG_SUSPEND or CONFIG_HIBERNATION enabled.
35 35
36 36
37 What is Remote Wakeup? 37 What is Remote Wakeup?
@@ -107,7 +107,9 @@ allowed to issue dynamic suspends.
107The user interface for controlling dynamic PM is located in the power/ 107The user interface for controlling dynamic PM is located in the power/
108subdirectory of each USB device's sysfs directory, that is, in 108subdirectory of each USB device's sysfs directory, that is, in
109/sys/bus/usb/devices/.../power/ where "..." is the device's ID. The 109/sys/bus/usb/devices/.../power/ where "..." is the device's ID. The
110relevant attribute files are: wakeup, level, and autosuspend. 110relevant attribute files are: wakeup, control, and autosuspend.
111(There may also be a file named "level"; this file was deprecated
112as of the 2.6.35 kernel and replaced by the "control" file.)
111 113
112 power/wakeup 114 power/wakeup
113 115
@@ -120,7 +122,7 @@ relevant attribute files are: wakeup, level, and autosuspend.
120 while the device is suspended, the change won't take 122 while the device is suspended, the change won't take
121 effect until the following suspend.) 123 effect until the following suspend.)
122 124
123 power/level 125 power/control
124 126
125 This file contains one of two words: "on" or "auto". 127 This file contains one of two words: "on" or "auto".
126 You can write those words to the file to change the 128 You can write those words to the file to change the
@@ -148,14 +150,15 @@ relevant attribute files are: wakeup, level, and autosuspend.
148 never to autosuspend. You can write a number to the 150 never to autosuspend. You can write a number to the
149 file to change the autosuspend idle-delay time. 151 file to change the autosuspend idle-delay time.
150 152
151Writing "-1" to power/autosuspend and writing "on" to power/level do 153Writing "-1" to power/autosuspend and writing "on" to power/control do
152essentially the same thing -- they both prevent the device from being 154essentially the same thing -- they both prevent the device from being
153autosuspended. Yes, this is a redundancy in the API. 155autosuspended. Yes, this is a redundancy in the API.
154 156
155(In 2.6.21 writing "0" to power/autosuspend would prevent the device 157(In 2.6.21 writing "0" to power/autosuspend would prevent the device
156from being autosuspended; the behavior was changed in 2.6.22. The 158from being autosuspended; the behavior was changed in 2.6.22. The
157power/autosuspend attribute did not exist prior to 2.6.21, and the 159power/autosuspend attribute did not exist prior to 2.6.21, and the
158power/level attribute did not exist prior to 2.6.22.) 160power/level attribute did not exist prior to 2.6.22. power/control
161was added in 2.6.34.)
159 162
160 163
161 Changing the default idle-delay time 164 Changing the default idle-delay time
@@ -212,7 +215,7 @@ among printers and scanners, but plenty of other types of device have
212the same deficiency. 215the same deficiency.
213 216
214For this reason, by default the kernel disables autosuspend (the 217For this reason, by default the kernel disables autosuspend (the
215power/level attribute is initialized to "on") for all devices other 218power/control attribute is initialized to "on") for all devices other
216than hubs. Hubs, at least, appear to be reasonably well-behaved in 219than hubs. Hubs, at least, appear to be reasonably well-behaved in
217this regard. 220this regard.
218 221
@@ -229,6 +232,11 @@ necessary operations by hand or add them to a udev script. You can
229also change the idle-delay time; 2 seconds is not the best choice for 232also change the idle-delay time; 2 seconds is not the best choice for
230every device. 233every device.
231 234
235If a driver knows that its device has proper suspend/resume support,
236it can enable autosuspend all by itself. For example, the video
237driver for a laptop's webcam might do this, since these devices are
238rarely used and so should normally be autosuspended.
239
232Sometimes it turns out that even when a device does work okay with 240Sometimes it turns out that even when a device does work okay with
233autosuspend there are still problems. For example, there are 241autosuspend there are still problems. For example, there are
234experimental patches adding autosuspend support to the usbhid driver, 242experimental patches adding autosuspend support to the usbhid driver,
@@ -321,69 +329,81 @@ driver does so by calling these six functions:
321 void usb_autopm_get_interface_no_resume(struct usb_interface *intf); 329 void usb_autopm_get_interface_no_resume(struct usb_interface *intf);
322 void usb_autopm_put_interface_no_suspend(struct usb_interface *intf); 330 void usb_autopm_put_interface_no_suspend(struct usb_interface *intf);
323 331
324The functions work by maintaining a counter in the usb_interface 332The functions work by maintaining a usage counter in the
325structure. When intf->pm_usage_count is > 0 then the interface is 333usb_interface's embedded device structure. When the counter is > 0
326deemed to be busy, and the kernel will not autosuspend the interface's 334then the interface is deemed to be busy, and the kernel will not
327device. When intf->pm_usage_count is <= 0 then the interface is 335autosuspend the interface's device. When the usage counter is = 0
328considered to be idle, and the kernel may autosuspend the device. 336then the interface is considered to be idle, and the kernel may
337autosuspend the device.
329 338
330(There is a similar pm_usage_count field in struct usb_device, 339(There is a similar usage counter field in struct usb_device,
331associated with the device itself rather than any of its interfaces. 340associated with the device itself rather than any of its interfaces.
332This field is used only by the USB core.) 341This counter is used only by the USB core.)
333 342
334Drivers must not modify intf->pm_usage_count directly; its value 343Drivers need not be concerned about balancing changes to the usage
335should be changed only be using the functions listed above. Drivers 344counter; the USB core will undo any remaining "get"s when a driver
336are responsible for insuring that the overall change to pm_usage_count 345is unbound from its interface. As a corollary, drivers must not call
337during their lifetime balances out to 0 (it may be necessary for the 346any of the usb_autopm_* functions after their diconnect() routine has
338disconnect method to call usb_autopm_put_interface() one or more times 347returned.
339to fulfill this requirement). The first two routines use the PM mutex 348
340in struct usb_device for mutual exclusion; drivers using the async 349Drivers using the async routines are responsible for their own
341routines are responsible for their own synchronization and mutual 350synchronization and mutual exclusion.
342exclusion. 351
343 352 usb_autopm_get_interface() increments the usage counter and
344 usb_autopm_get_interface() increments pm_usage_count and 353 does an autoresume if the device is suspended. If the
345 attempts an autoresume if the new value is > 0 and the 354 autoresume fails, the counter is decremented back.
346 device is suspended. 355
347 356 usb_autopm_put_interface() decrements the usage counter and
348 usb_autopm_put_interface() decrements pm_usage_count and 357 attempts an autosuspend if the new value is = 0.
349 attempts an autosuspend if the new value is <= 0 and the
350 device isn't suspended.
351 358
352 usb_autopm_get_interface_async() and 359 usb_autopm_get_interface_async() and
353 usb_autopm_put_interface_async() do almost the same things as 360 usb_autopm_put_interface_async() do almost the same things as
354 their non-async counterparts. The differences are: they do 361 their non-async counterparts. The big difference is that they
355 not acquire the PM mutex, and they use a workqueue to do their 362 use a workqueue to do the resume or suspend part of their
356 jobs. As a result they can be called in an atomic context, 363 jobs. As a result they can be called in an atomic context,
357 such as an URB's completion handler, but when they return the 364 such as an URB's completion handler, but when they return the
358 device will not generally not yet be in the desired state. 365 device will generally not yet be in the desired state.
359 366
360 usb_autopm_get_interface_no_resume() and 367 usb_autopm_get_interface_no_resume() and
361 usb_autopm_put_interface_no_suspend() merely increment or 368 usb_autopm_put_interface_no_suspend() merely increment or
362 decrement the pm_usage_count value; they do not attempt to 369 decrement the usage counter; they do not attempt to carry out
363 carry out an autoresume or an autosuspend. Hence they can be 370 an autoresume or an autosuspend. Hence they can be called in
364 called in an atomic context. 371 an atomic context.
365 372
366The conventional usage pattern is that a driver calls 373The simplest usage pattern is that a driver calls
367usb_autopm_get_interface() in its open routine and 374usb_autopm_get_interface() in its open routine and
368usb_autopm_put_interface() in its close or release routine. But 375usb_autopm_put_interface() in its close or release routine. But other
369other patterns are possible. 376patterns are possible.
370 377
371The autosuspend attempts mentioned above will often fail for one 378The autosuspend attempts mentioned above will often fail for one
372reason or another. For example, the power/level attribute might be 379reason or another. For example, the power/control attribute might be
373set to "on", or another interface in the same device might not be 380set to "on", or another interface in the same device might not be
374idle. This is perfectly normal. If the reason for failure was that 381idle. This is perfectly normal. If the reason for failure was that
375the device hasn't been idle for long enough, a delayed workqueue 382the device hasn't been idle for long enough, a timer is scheduled to
376routine is automatically set up to carry out the operation when the 383carry out the operation automatically when the autosuspend idle-delay
377autosuspend idle-delay has expired. 384has expired.
378 385
379Autoresume attempts also can fail, although failure would mean that 386Autoresume attempts also can fail, although failure would mean that
380the device is no longer present or operating properly. Unlike 387the device is no longer present or operating properly. Unlike
381autosuspend, there's no delay for an autoresume. 388autosuspend, there's no idle-delay for an autoresume.
382 389
383 390
384 Other parts of the driver interface 391 Other parts of the driver interface
385 ----------------------------------- 392 -----------------------------------
386 393
394Drivers can enable autosuspend for their devices by calling
395
396 usb_enable_autosuspend(struct usb_device *udev);
397
398in their probe() routine, if they know that the device is capable of
399suspending and resuming correctly. This is exactly equivalent to
400writing "auto" to the device's power/control attribute. Likewise,
401drivers can disable autosuspend by calling
402
403 usb_disable_autosuspend(struct usb_device *udev);
404
405This is exactly the same as writing "on" to the power/control attribute.
406
387Sometimes a driver needs to make sure that remote wakeup is enabled 407Sometimes a driver needs to make sure that remote wakeup is enabled
388during autosuspend. For example, there's not much point 408during autosuspend. For example, there's not much point
389autosuspending a keyboard if the user can't cause the keyboard to do a 409autosuspending a keyboard if the user can't cause the keyboard to do a
@@ -395,26 +415,27 @@ though, setting this flag won't cause the kernel to autoresume it.
395Normally a driver would set this flag in its probe method, at which 415Normally a driver would set this flag in its probe method, at which
396time the device is guaranteed not to be autosuspended.) 416time the device is guaranteed not to be autosuspended.)
397 417
398The synchronous usb_autopm_* routines have to run in a sleepable 418If a driver does its I/O asynchronously in interrupt context, it
399process context; they must not be called from an interrupt handler or 419should call usb_autopm_get_interface_async() before starting output and
400while holding a spinlock. In fact, the entire autosuspend mechanism 420usb_autopm_put_interface_async() when the output queue drains. When
401is not well geared toward interrupt-driven operation. However there 421it receives an input event, it should call
402is one thing a driver can do in an interrupt handler:
403 422
404 usb_mark_last_busy(struct usb_device *udev); 423 usb_mark_last_busy(struct usb_device *udev);
405 424
406This sets udev->last_busy to the current time. udev->last_busy is the 425in the event handler. This sets udev->last_busy to the current time.
407field used for idle-delay calculations; updating it will cause any 426udev->last_busy is the field used for idle-delay calculations;
408pending autosuspend to be moved back. The usb_autopm_* routines will 427updating it will cause any pending autosuspend to be moved back. Most
409also set the last_busy field to the current time. 428of the usb_autopm_* routines will also set the last_busy field to the
410 429current time.
411Calling urb_mark_last_busy() from within an URB completion handler is 430
412subject to races: The kernel may have just finished deciding the 431Asynchronous operation is always subject to races. For example, a
413device has been idle for long enough but not yet gotten around to 432driver may call one of the usb_autopm_*_interface_async() routines at
414calling the driver's suspend method. The driver would have to be 433a time when the core has just finished deciding the device has been
415responsible for synchronizing its suspend method with its URB 434idle for long enough but not yet gotten around to calling the driver's
416completion handler and causing the autosuspend to fail with -EBUSY if 435suspend method. The suspend method must be responsible for
417an URB had completed too recently. 436synchronizing with the output request routine and the URB completion
437handler; it should cause autosuspends to fail with -EBUSY if the
438driver needs to use the device.
418 439
419External suspend calls should never be allowed to fail in this way, 440External suspend calls should never be allowed to fail in this way,
420only autosuspend calls. The driver can tell them apart by checking 441only autosuspend calls. The driver can tell them apart by checking
@@ -422,75 +443,23 @@ the PM_EVENT_AUTO bit in the message.event argument to the suspend
422method; this bit will be set for internal PM events (autosuspend) and 443method; this bit will be set for internal PM events (autosuspend) and
423clear for external PM events. 444clear for external PM events.
424 445
425Many of the ingredients in the autosuspend framework are oriented
426towards interfaces: The usb_interface structure contains the
427pm_usage_cnt field, and the usb_autopm_* routines take an interface
428pointer as their argument. But somewhat confusingly, a few of the
429pieces (i.e., usb_mark_last_busy()) use the usb_device structure
430instead. Drivers need to keep this straight; they can call
431interface_to_usbdev() to find the device structure for a given
432interface.
433
434 446
435 Locking requirements 447 Mutual exclusion
436 -------------------- 448 ----------------
437 449
438All three suspend/resume methods are always called while holding the 450For external events -- but not necessarily for autosuspend or
439usb_device's PM mutex. For external events -- but not necessarily for 451autoresume -- the device semaphore (udev->dev.sem) will be held when a
440autosuspend or autoresume -- the device semaphore (udev->dev.sem) will 452suspend or resume method is called. This implies that external
441also be held. This implies that external suspend/resume events are 453suspend/resume events are mutually exclusive with calls to probe,
442mutually exclusive with calls to probe, disconnect, pre_reset, and 454disconnect, pre_reset, and post_reset; the USB core guarantees that
443post_reset; the USB core guarantees that this is true of internal 455this is true of autosuspend/autoresume events as well.
444suspend/resume events as well.
445 456
446If a driver wants to block all suspend/resume calls during some 457If a driver wants to block all suspend/resume calls during some
447critical section, it can simply acquire udev->pm_mutex. Note that 458critical section, the best way is to lock the device and call
448calls to resume may be triggered indirectly. Block IO due to memory 459usb_autopm_get_interface() (and do the reverse at the end of the
449allocations can make the vm subsystem resume a device. Thus while 460critical section). Holding the device semaphore will block all
450holding this lock you must not allocate memory with GFP_KERNEL or 461external PM calls, and the usb_autopm_get_interface() will prevent any
451GFP_NOFS. 462internal PM calls, even if it fails. (Exercise: Why?)
452
453Alternatively, if the critical section might call some of the
454usb_autopm_* routines, the driver can avoid deadlock by doing:
455
456 down(&udev->dev.sem);
457 rc = usb_autopm_get_interface(intf);
458
459and at the end of the critical section:
460
461 if (!rc)
462 usb_autopm_put_interface(intf);
463 up(&udev->dev.sem);
464
465Holding the device semaphore will block all external PM calls, and the
466usb_autopm_get_interface() will prevent any internal PM calls, even if
467it fails. (Exercise: Why?)
468
469The rules for locking order are:
470
471 Never acquire any device semaphore while holding any PM mutex.
472
473 Never acquire udev->pm_mutex while holding the PM mutex for
474 a device that isn't a descendant of udev.
475
476In other words, PM mutexes should only be acquired going up the device
477tree, and they should be acquired only after locking all the device
478semaphores you need to hold. These rules don't matter to drivers very
479much; they usually affect just the USB core.
480
481Still, drivers do need to be careful. For example, many drivers use a
482private mutex to synchronize their normal I/O activities with their
483disconnect method. Now if the driver supports autosuspend then it
484must call usb_autopm_put_interface() from somewhere -- maybe from its
485close method. It should make the call while holding the private mutex,
486since a driver shouldn't call any of the usb_autopm_* functions for an
487interface from which it has been unbound.
488
489But the usb_autpm_* routines always acquire the device's PM mutex, and
490consequently the locking order has to be: private mutex first, PM
491mutex second. Since the suspend method is always called with the PM
492mutex held, it mustn't try to acquire the private mutex. It has to
493synchronize with the driver's I/O activities in some other way.
494 463
495 464
496 Interaction between dynamic PM and system PM 465 Interaction between dynamic PM and system PM
@@ -499,22 +468,11 @@ synchronize with the driver's I/O activities in some other way.
499Dynamic power management and system power management can interact in 468Dynamic power management and system power management can interact in
500a couple of ways. 469a couple of ways.
501 470
502Firstly, a device may already be manually suspended or autosuspended 471Firstly, a device may already be autosuspended when a system suspend
503when a system suspend occurs. Since system suspends are supposed to 472occurs. Since system suspends are supposed to be as transparent as
504be as transparent as possible, the device should remain suspended 473possible, the device should remain suspended following the system
505following the system resume. The 2.6.23 kernel obeys this principle 474resume. But this theory may not work out well in practice; over time
506for manually suspended devices but not for autosuspended devices; they 475the kernel's behavior in this regard has changed.
507do get resumed when the system wakes up. (Presumably they will be
508autosuspended again after their idle-delay time expires.) In later
509kernels this behavior will be fixed.
510
511(There is an exception. If a device would undergo a reset-resume
512instead of a normal resume, and the device is enabled for remote
513wakeup, then the reset-resume takes place even if the device was
514already suspended when the system suspend began. The justification is
515that a reset-resume is a kind of remote-wakeup event. Or to put it
516another way, a device which needs a reset won't be able to generate
517normal remote-wakeup signals, so it ought to be resumed immediately.)
518 476
519Secondly, a dynamic power-management event may occur as a system 477Secondly, a dynamic power-management event may occur as a system
520suspend is underway. The window for this is short, since system 478suspend is underway. The window for this is short, since system
diff --git a/Documentation/usb/usb-serial.txt b/Documentation/usb/usb-serial.txt
index ff2c1ff57ba2..f4d214510259 100644
--- a/Documentation/usb/usb-serial.txt
+++ b/Documentation/usb/usb-serial.txt
@@ -194,6 +194,10 @@ FTDI Single Port Serial Driver
194 194
195 This is a single port DB-25 serial adapter. 195 This is a single port DB-25 serial adapter.
196 196
197 Devices supported include:
198 -TripNav TN-200 USB GPS
199 -Navis Engineering Bureau CH-4711 USB GPS
200
197 For any questions or problems with this driver, please contact Bill Ryder. 201 For any questions or problems with this driver, please contact Bill Ryder.
198 202
199 203
@@ -216,7 +220,7 @@ Cypress M8 CY4601 Family Serial Driver
216 220
217 Devices supported: 221 Devices supported:
218 222
219 -DeLorme's USB Earthmate (SiRF Star II lp arch) 223 -DeLorme's USB Earthmate GPS (SiRF Star II lp arch)
220 -Cypress HID->COM RS232 adapter 224 -Cypress HID->COM RS232 adapter
221 225
222 Note: Cypress Semiconductor claims no affiliation with the 226 Note: Cypress Semiconductor claims no affiliation with the
@@ -392,9 +396,10 @@ REINER SCT cyberJack pinpad/e-com USB chipcard reader
392Prolific PL2303 Driver 396Prolific PL2303 Driver
393 397
394 This driver supports any device that has the PL2303 chip from Prolific 398 This driver supports any device that has the PL2303 chip from Prolific
395 in it. This includes a number of single port USB to serial 399 in it. This includes a number of single port USB to serial converters,
396 converters and USB GPS devices. Devices from Aten (the UC-232) and 400 more than 70% of USB GPS devices (in 2010), and some USB UPSes. Devices
397 IO-Data work with this driver, as does the DCU-11 mobile-phone cable. 401 from Aten (the UC-232) and IO-Data work with this driver, as does
402 the DCU-11 mobile-phone cable.
398 403
399 For any questions or problems with this driver, please contact Greg 404 For any questions or problems with this driver, please contact Greg
400 Kroah-Hartman at greg@kroah.com 405 Kroah-Hartman at greg@kroah.com
@@ -435,6 +440,22 @@ Winchiphead CH341 Driver
435 For any questions or problems with this driver, please contact 440 For any questions or problems with this driver, please contact
436 frank@kingswood-consulting.co.uk. 441 frank@kingswood-consulting.co.uk.
437 442
443Moschip MCS7720, MCS7715 driver
444
445 These chips are present in devices sold by various manufacturers, such as Syba
446 and Cables Unlimited. There may be others. The 7720 provides two serial
447 ports, and the 7715 provides one serial and one standard PC parallel port.
448 Support for the 7715's parallel port is enabled by a separate option, which
449 will not appear unless parallel port support is first enabled at the top-level
450 of the Device Drivers config menu. Currently only compatibility mode is
451 supported on the parallel port (no ECP/EPP).
452
453 TODO:
454 - Implement ECP/EPP modes for the parallel port.
455 - Baud rates higher than 115200 are currently broken.
456 - Devices with a single serial port based on the Moschip MCS7703 may work
457 with this driver with a simple addition to the usb_device_id table. I
458 don't have one of these devices, so I can't say for sure.
438 459
439Generic Serial driver 460Generic Serial driver
440 461
diff --git a/Documentation/video4linux/CARDLIST.bttv b/Documentation/video4linux/CARDLIST.bttv
index f11c583295e9..4739d5684305 100644
--- a/Documentation/video4linux/CARDLIST.bttv
+++ b/Documentation/video4linux/CARDLIST.bttv
@@ -100,7 +100,7 @@
100 99 -> AD-TVK503 100 99 -> AD-TVK503
101100 -> Hercules Smart TV Stereo 101100 -> Hercules Smart TV Stereo
102101 -> Pace TV & Radio Card 102101 -> Pace TV & Radio Card
103102 -> IVC-200 [0000:a155,0001:a155,0002:a155,0003:a155,0100:a155,0101:a155,0102:a155,0103:a155] 103102 -> IVC-200 [0000:a155,0001:a155,0002:a155,0003:a155,0100:a155,0101:a155,0102:a155,0103:a155,0800:a155,0801:a155,0802:a155,0803:a155]
104103 -> Grand X-Guard / Trust 814PCI [0304:0102] 104103 -> Grand X-Guard / Trust 814PCI [0304:0102]
105104 -> Nebula Electronics DigiTV [0071:0101] 105104 -> Nebula Electronics DigiTV [0071:0101]
106105 -> ProVideo PV143 [aa00:1430,aa00:1431,aa00:1432,aa00:1433,aa03:1433] 106105 -> ProVideo PV143 [aa00:1430,aa00:1431,aa00:1432,aa00:1433,aa03:1433]
diff --git a/Documentation/video4linux/CARDLIST.cx23885 b/Documentation/video4linux/CARDLIST.cx23885
index 7539e8fa1ffd..16ca030e1185 100644
--- a/Documentation/video4linux/CARDLIST.cx23885
+++ b/Documentation/video4linux/CARDLIST.cx23885
@@ -26,3 +26,4 @@
26 25 -> Compro VideoMate E800 [1858:e800] 26 25 -> Compro VideoMate E800 [1858:e800]
27 26 -> Hauppauge WinTV-HVR1290 [0070:8551] 27 26 -> Hauppauge WinTV-HVR1290 [0070:8551]
28 27 -> Mygica X8558 PRO DMB-TH [14f1:8578] 28 27 -> Mygica X8558 PRO DMB-TH [14f1:8578]
29 28 -> LEADTEK WinFast PxTV1200 [107d:6f22]
diff --git a/Documentation/video4linux/CARDLIST.cx88 b/Documentation/video4linux/CARDLIST.cx88
index 7ec3c4e4b60f..f2510541373b 100644
--- a/Documentation/video4linux/CARDLIST.cx88
+++ b/Documentation/video4linux/CARDLIST.cx88
@@ -82,3 +82,4 @@
82 81 -> Leadtek WinFast DTV1800 Hybrid [107d:6654] 82 81 -> Leadtek WinFast DTV1800 Hybrid [107d:6654]
83 82 -> WinFast DTV2000 H rev. J [107d:6f2b] 83 82 -> WinFast DTV2000 H rev. J [107d:6f2b]
84 83 -> Prof 7301 DVB-S/S2 [b034:3034] 84 83 -> Prof 7301 DVB-S/S2 [b034:3034]
85 84 -> Samsung SMT 7020 DVB-S [18ac:dc00,18ac:dccd]
diff --git a/Documentation/video4linux/CARDLIST.em28xx b/Documentation/video4linux/CARDLIST.em28xx
index 0c166ff003a0..3a623aaeae5f 100644
--- a/Documentation/video4linux/CARDLIST.em28xx
+++ b/Documentation/video4linux/CARDLIST.em28xx
@@ -1,5 +1,5 @@
1 0 -> Unknown EM2800 video grabber (em2800) [eb1a:2800] 1 0 -> Unknown EM2800 video grabber (em2800) [eb1a:2800]
2 1 -> Unknown EM2750/28xx video grabber (em2820/em2840) [eb1a:2710,eb1a:2820,eb1a:2821,eb1a:2860,eb1a:2861,eb1a:2862,eb1a:2870,eb1a:2881,eb1a:2883,eb1a:2868] 2 1 -> Unknown EM2750/28xx video grabber (em2820/em2840) [eb1a:2710,eb1a:2820,eb1a:2821,eb1a:2860,eb1a:2861,eb1a:2862,eb1a:2863,eb1a:2870,eb1a:2881,eb1a:2883,eb1a:2868]
3 2 -> Terratec Cinergy 250 USB (em2820/em2840) [0ccd:0036] 3 2 -> Terratec Cinergy 250 USB (em2820/em2840) [0ccd:0036]
4 3 -> Pinnacle PCTV USB 2 (em2820/em2840) [2304:0208] 4 3 -> Pinnacle PCTV USB 2 (em2820/em2840) [2304:0208]
5 4 -> Hauppauge WinTV USB 2 (em2820/em2840) [2040:4200,2040:4201] 5 4 -> Hauppauge WinTV USB 2 (em2820/em2840) [2040:4200,2040:4201]
@@ -27,6 +27,7 @@
27 26 -> Hercules Smart TV USB 2.0 (em2820/em2840) 27 26 -> Hercules Smart TV USB 2.0 (em2820/em2840)
28 27 -> Pinnacle PCTV USB 2 (Philips FM1216ME) (em2820/em2840) 28 27 -> Pinnacle PCTV USB 2 (Philips FM1216ME) (em2820/em2840)
29 28 -> Leadtek Winfast USB II Deluxe (em2820/em2840) 29 28 -> Leadtek Winfast USB II Deluxe (em2820/em2840)
30 29 -> EM2860/TVP5150 Reference Design (em2860)
30 30 -> Videology 20K14XUSB USB2.0 (em2820/em2840) 31 30 -> Videology 20K14XUSB USB2.0 (em2820/em2840)
31 31 -> Usbgear VD204v9 (em2821) 32 31 -> Usbgear VD204v9 (em2821)
32 32 -> Supercomp USB 2.0 TV (em2821) 33 32 -> Supercomp USB 2.0 TV (em2821)
@@ -70,3 +71,4 @@
70 72 -> Gadmei UTV330+ (em2861) 71 72 -> Gadmei UTV330+ (em2861)
71 73 -> Reddo DVB-C USB TV Box (em2870) 72 73 -> Reddo DVB-C USB TV Box (em2870)
72 74 -> Actionmaster/LinXcel/Digitus VC211A (em2800) 73 74 -> Actionmaster/LinXcel/Digitus VC211A (em2800)
74 75 -> Dikom DK300 (em2882)
diff --git a/Documentation/video4linux/CARDLIST.saa7134 b/Documentation/video4linux/CARDLIST.saa7134
index fce1e7eb0474..070f2576707e 100644
--- a/Documentation/video4linux/CARDLIST.saa7134
+++ b/Documentation/video4linux/CARDLIST.saa7134
@@ -174,3 +174,7 @@
174173 -> Zolid Hybrid TV Tuner PCI [1131:2004] 174173 -> Zolid Hybrid TV Tuner PCI [1131:2004]
175174 -> Asus Europa Hybrid OEM [1043:4847] 175174 -> Asus Europa Hybrid OEM [1043:4847]
176175 -> Leadtek Winfast DTV1000S [107d:6655] 176175 -> Leadtek Winfast DTV1000S [107d:6655]
177176 -> Beholder BeholdTV 505 RDS [0000:5051]
178177 -> Hawell HW-404M7
179179 -> Beholder BeholdTV H7 [5ace:7190]
180180 -> Beholder BeholdTV A7 [5ace:7090]
diff --git a/Documentation/video4linux/CARDLIST.tuner b/Documentation/video4linux/CARDLIST.tuner
index e0d298fe8830..9b2e0dd6017e 100644
--- a/Documentation/video4linux/CARDLIST.tuner
+++ b/Documentation/video4linux/CARDLIST.tuner
@@ -81,3 +81,4 @@ tuner=80 - Philips FQ1216LME MK3 PAL/SECAM w/active loopthrough
81tuner=81 - Partsnic (Daewoo) PTI-5NF05 81tuner=81 - Partsnic (Daewoo) PTI-5NF05
82tuner=82 - Philips CU1216L 82tuner=82 - Philips CU1216L
83tuner=83 - NXP TDA18271 83tuner=83 - NXP TDA18271
84tuner=84 - Sony BTF-Pxn01Z
diff --git a/Documentation/video4linux/README.tlg2300 b/Documentation/video4linux/README.tlg2300
new file mode 100644
index 000000000000..416ccb93d8c9
--- /dev/null
+++ b/Documentation/video4linux/README.tlg2300
@@ -0,0 +1,47 @@
1tlg2300 release notes
2====================
3
4This is a v4l2/dvb device driver for the tlg2300 chip.
5
6
7current status
8==============
9
10video
11 - support mmap and read().(no overlay)
12
13audio
14 - The driver will register a ALSA card for the audio input.
15
16vbi
17 - Works for almost TV norms.
18
19dvb-t
20 - works for DVB-T
21
22FM
23 - Works for radio.
24
25---------------------------------------------------------------------------
26TESTED APPLICATIONS:
27
28-VLC1.0.4 test the video and dvb. The GUI is friendly to use.
29
30-Mplayer test the video.
31
32-Mplayer test the FM. The mplayer should be compiled with --enable-radio and
33 --enable-radio-capture.
34 The command runs as this(The alsa audio registers to card 1):
35 #mplayer radio://103.7/capture/ -radio adevice=hw=1,0:arate=48000 \
36 -rawaudio rate=48000:channels=2
37
38---------------------------------------------------------------------------
39KNOWN PROBLEMS:
40about preemphasis:
41 You can set the preemphasis for radio by the following command:
42 #v4l2-ctl -d /dev/radio0 --set-ctrl=pre_emphasis_settings=1
43
44 "pre_emphasis_settings=1" means that you select the 50us. If you want
45 to select the 75us, please use "pre_emphasis_settings=2"
46
47
diff --git a/Documentation/video4linux/extract_xc3028.pl b/Documentation/video4linux/extract_xc3028.pl
index 2cb816047fc1..47877deae6d7 100644
--- a/Documentation/video4linux/extract_xc3028.pl
+++ b/Documentation/video4linux/extract_xc3028.pl
@@ -5,12 +5,18 @@
5# 5#
6# In order to use, you need to: 6# In order to use, you need to:
7# 1) Download the windows driver with something like: 7# 1) Download the windows driver with something like:
8# Version 2.4
9# wget http://www.twinhan.com/files/AW/BDA T/20080303_V1.0.6.7.zip
10# or wget http://www.stefanringel.de/pub/20080303_V1.0.6.7.zip
11# Version 2.7
8# wget http://www.steventoth.net/linux/xc5000/HVR-12x0-14x0-17x0_1_25_25271_WHQL.zip 12# wget http://www.steventoth.net/linux/xc5000/HVR-12x0-14x0-17x0_1_25_25271_WHQL.zip
9# 2) Extract the file hcw85bda.sys from the zip into the current dir: 13# 2) Extract the files from the zip into the current dir:
14# unzip -j 20080303_V1.0.6.7.zip 20080303_v1.0.6.7/UDXTTM6000.sys
10# unzip -j HVR-12x0-14x0-17x0_1_25_25271_WHQL.zip Driver85/hcw85bda.sys 15# unzip -j HVR-12x0-14x0-17x0_1_25_25271_WHQL.zip Driver85/hcw85bda.sys
11# 3) run the script: 16# 3) run the script:
12# ./extract_xc3028.pl 17# ./extract_xc3028.pl
13# 4) copy the generated file: 18# 4) copy the generated files:
19# cp xc3028-v24.fw /lib/firmware
14# cp xc3028-v27.fw /lib/firmware 20# cp xc3028-v27.fw /lib/firmware
15 21
16#use strict; 22#use strict;
@@ -135,7 +141,7 @@ sub write_hunk_fix_endian($$)
135 } 141 }
136} 142}
137 143
138sub main_firmware($$$$) 144sub main_firmware_24($$$$)
139{ 145{
140 my $out; 146 my $out;
141 my $j=0; 147 my $j=0;
@@ -146,8 +152,774 @@ sub main_firmware($$$$)
146 152
147 for ($j = length($name); $j <32; $j++) { 153 for ($j = length($name); $j <32; $j++) {
148 $name = $name.chr(0); 154 $name = $name.chr(0);
155 }
156
157 open OUTFILE, ">$outfile";
158 syswrite(OUTFILE, $name);
159 write_le16($version);
160 write_le16($nr_desc);
161
162 #
163 # Firmware 0, type: BASE FW F8MHZ (0x00000003), id: (0000000000000000), size: 6635
164 #
165
166 write_le32(0x00000003); # Type
167 write_le64(0x00000000, 0x00000000); # ID
168 write_le32(6635); # Size
169 write_hunk_fix_endian(257752, 6635);
170
171 #
172 # Firmware 1, type: BASE FW F8MHZ MTS (0x00000007), id: (0000000000000000), size: 6635
173 #
174
175 write_le32(0x00000007); # Type
176 write_le64(0x00000000, 0x00000000); # ID
177 write_le32(6635); # Size
178 write_hunk_fix_endian(264392, 6635);
179
180 #
181 # Firmware 2, type: BASE FW FM (0x00000401), id: (0000000000000000), size: 6525
182 #
183
184 write_le32(0x00000401); # Type
185 write_le64(0x00000000, 0x00000000); # ID
186 write_le32(6525); # Size
187 write_hunk_fix_endian(271040, 6525);
188
189 #
190 # Firmware 3, type: BASE FW FM INPUT1 (0x00000c01), id: (0000000000000000), size: 6539
191 #
192
193 write_le32(0x00000c01); # Type
194 write_le64(0x00000000, 0x00000000); # ID
195 write_le32(6539); # Size
196 write_hunk_fix_endian(277568, 6539);
197
198 #
199 # Firmware 4, type: BASE FW (0x00000001), id: (0000000000000000), size: 6633
200 #
201
202 write_le32(0x00000001); # Type
203 write_le64(0x00000000, 0x00000000); # ID
204 write_le32(6633); # Size
205 write_hunk_fix_endian(284120, 6633);
206
207 #
208 # Firmware 5, type: BASE FW MTS (0x00000005), id: (0000000000000000), size: 6617
209 #
210
211 write_le32(0x00000005); # Type
212 write_le64(0x00000000, 0x00000000); # ID
213 write_le32(6617); # Size
214 write_hunk_fix_endian(290760, 6617);
215
216 #
217 # Firmware 6, type: STD FW (0x00000000), id: PAL/BG A2/A (0000000100000007), size: 161
218 #
219
220 write_le32(0x00000000); # Type
221 write_le64(0x00000001, 0x00000007); # ID
222 write_le32(161); # Size
223 write_hunk_fix_endian(297384, 161);
224
225 #
226 # Firmware 7, type: STD FW MTS (0x00000004), id: PAL/BG A2/A (0000000100000007), size: 169
227 #
228
229 write_le32(0x00000004); # Type
230 write_le64(0x00000001, 0x00000007); # ID
231 write_le32(169); # Size
232 write_hunk_fix_endian(297552, 169);
233
234 #
235 # Firmware 8, type: STD FW (0x00000000), id: PAL/BG A2/B (0000000200000007), size: 161
236 #
237
238 write_le32(0x00000000); # Type
239 write_le64(0x00000002, 0x00000007); # ID
240 write_le32(161); # Size
241 write_hunk_fix_endian(297728, 161);
242
243 #
244 # Firmware 9, type: STD FW MTS (0x00000004), id: PAL/BG A2/B (0000000200000007), size: 169
245 #
246
247 write_le32(0x00000004); # Type
248 write_le64(0x00000002, 0x00000007); # ID
249 write_le32(169); # Size
250 write_hunk_fix_endian(297896, 169);
251
252 #
253 # Firmware 10, type: STD FW (0x00000000), id: PAL/BG NICAM/A (0000000400000007), size: 161
254 #
255
256 write_le32(0x00000000); # Type
257 write_le64(0x00000004, 0x00000007); # ID
258 write_le32(161); # Size
259 write_hunk_fix_endian(298072, 161);
260
261 #
262 # Firmware 11, type: STD FW MTS (0x00000004), id: PAL/BG NICAM/A (0000000400000007), size: 169
263 #
264
265 write_le32(0x00000004); # Type
266 write_le64(0x00000004, 0x00000007); # ID
267 write_le32(169); # Size
268 write_hunk_fix_endian(298240, 169);
269
270 #
271 # Firmware 12, type: STD FW (0x00000000), id: PAL/BG NICAM/B (0000000800000007), size: 161
272 #
273
274 write_le32(0x00000000); # Type
275 write_le64(0x00000008, 0x00000007); # ID
276 write_le32(161); # Size
277 write_hunk_fix_endian(298416, 161);
278
279 #
280 # Firmware 13, type: STD FW MTS (0x00000004), id: PAL/BG NICAM/B (0000000800000007), size: 169
281 #
282
283 write_le32(0x00000004); # Type
284 write_le64(0x00000008, 0x00000007); # ID
285 write_le32(169); # Size
286 write_hunk_fix_endian(298584, 169);
287
288 #
289 # Firmware 14, type: STD FW (0x00000000), id: PAL/DK A2 (00000003000000e0), size: 161
290 #
291
292 write_le32(0x00000000); # Type
293 write_le64(0x00000003, 0x000000e0); # ID
294 write_le32(161); # Size
295 write_hunk_fix_endian(298760, 161);
296
297 #
298 # Firmware 15, type: STD FW MTS (0x00000004), id: PAL/DK A2 (00000003000000e0), size: 169
299 #
300
301 write_le32(0x00000004); # Type
302 write_le64(0x00000003, 0x000000e0); # ID
303 write_le32(169); # Size
304 write_hunk_fix_endian(298928, 169);
305
306 #
307 # Firmware 16, type: STD FW (0x00000000), id: PAL/DK NICAM (0000000c000000e0), size: 161
308 #
309
310 write_le32(0x00000000); # Type
311 write_le64(0x0000000c, 0x000000e0); # ID
312 write_le32(161); # Size
313 write_hunk_fix_endian(299104, 161);
314
315 #
316 # Firmware 17, type: STD FW MTS (0x00000004), id: PAL/DK NICAM (0000000c000000e0), size: 169
317 #
318
319 write_le32(0x00000004); # Type
320 write_le64(0x0000000c, 0x000000e0); # ID
321 write_le32(169); # Size
322 write_hunk_fix_endian(299272, 169);
323
324 #
325 # Firmware 18, type: STD FW (0x00000000), id: SECAM/K1 (0000000000200000), size: 161
326 #
327
328 write_le32(0x00000000); # Type
329 write_le64(0x00000000, 0x00200000); # ID
330 write_le32(161); # Size
331 write_hunk_fix_endian(299448, 161);
332
333 #
334 # Firmware 19, type: STD FW MTS (0x00000004), id: SECAM/K1 (0000000000200000), size: 169
335 #
336
337 write_le32(0x00000004); # Type
338 write_le64(0x00000000, 0x00200000); # ID
339 write_le32(169); # Size
340 write_hunk_fix_endian(299616, 169);
341
342 #
343 # Firmware 20, type: STD FW (0x00000000), id: SECAM/K3 (0000000004000000), size: 161
344 #
345
346 write_le32(0x00000000); # Type
347 write_le64(0x00000000, 0x04000000); # ID
348 write_le32(161); # Size
349 write_hunk_fix_endian(299792, 161);
350
351 #
352 # Firmware 21, type: STD FW MTS (0x00000004), id: SECAM/K3 (0000000004000000), size: 169
353 #
354
355 write_le32(0x00000004); # Type
356 write_le64(0x00000000, 0x04000000); # ID
357 write_le32(169); # Size
358 write_hunk_fix_endian(299960, 169);
359
360 #
361 # Firmware 22, type: STD FW D2633 DTV6 ATSC (0x00010030), id: (0000000000000000), size: 149
362 #
363
364 write_le32(0x00010030); # Type
365 write_le64(0x00000000, 0x00000000); # ID
366 write_le32(149); # Size
367 write_hunk_fix_endian(300136, 149);
368
369 #
370 # Firmware 23, type: STD FW D2620 DTV6 QAM (0x00000068), id: (0000000000000000), size: 149
371 #
372
373 write_le32(0x00000068); # Type
374 write_le64(0x00000000, 0x00000000); # ID
375 write_le32(149); # Size
376 write_hunk_fix_endian(300296, 149);
377
378 #
379 # Firmware 24, type: STD FW D2633 DTV6 QAM (0x00000070), id: (0000000000000000), size: 149
380 #
381
382 write_le32(0x00000070); # Type
383 write_le64(0x00000000, 0x00000000); # ID
384 write_le32(149); # Size
385 write_hunk_fix_endian(300448, 149);
386
387 #
388 # Firmware 25, type: STD FW D2620 DTV7 (0x00000088), id: (0000000000000000), size: 149
389 #
390
391 write_le32(0x00000088); # Type
392 write_le64(0x00000000, 0x00000000); # ID
393 write_le32(149); # Size
394 write_hunk_fix_endian(300608, 149);
395
396 #
397 # Firmware 26, type: STD FW D2633 DTV7 (0x00000090), id: (0000000000000000), size: 149
398 #
399
400 write_le32(0x00000090); # Type
401 write_le64(0x00000000, 0x00000000); # ID
402 write_le32(149); # Size
403 write_hunk_fix_endian(300760, 149);
404
405 #
406 # Firmware 27, type: STD FW D2620 DTV78 (0x00000108), id: (0000000000000000), size: 149
407 #
408
409 write_le32(0x00000108); # Type
410 write_le64(0x00000000, 0x00000000); # ID
411 write_le32(149); # Size
412 write_hunk_fix_endian(300920, 149);
413
414 #
415 # Firmware 28, type: STD FW D2633 DTV78 (0x00000110), id: (0000000000000000), size: 149
416 #
417
418 write_le32(0x00000110); # Type
419 write_le64(0x00000000, 0x00000000); # ID
420 write_le32(149); # Size
421 write_hunk_fix_endian(301072, 149);
422
423 #
424 # Firmware 29, type: STD FW D2620 DTV8 (0x00000208), id: (0000000000000000), size: 149
425 #
426
427 write_le32(0x00000208); # Type
428 write_le64(0x00000000, 0x00000000); # ID
429 write_le32(149); # Size
430 write_hunk_fix_endian(301232, 149);
431
432 #
433 # Firmware 30, type: STD FW D2633 DTV8 (0x00000210), id: (0000000000000000), size: 149
434 #
435
436 write_le32(0x00000210); # Type
437 write_le64(0x00000000, 0x00000000); # ID
438 write_le32(149); # Size
439 write_hunk_fix_endian(301384, 149);
440
441 #
442 # Firmware 31, type: STD FW FM (0x00000400), id: (0000000000000000), size: 135
443 #
444
445 write_le32(0x00000400); # Type
446 write_le64(0x00000000, 0x00000000); # ID
447 write_le32(135); # Size
448 write_hunk_fix_endian(301554, 135);
449
450 #
451 # Firmware 32, type: STD FW (0x00000000), id: PAL/I (0000000000000010), size: 161
452 #
453
454 write_le32(0x00000000); # Type
455 write_le64(0x00000000, 0x00000010); # ID
456 write_le32(161); # Size
457 write_hunk_fix_endian(301688, 161);
458
459 #
460 # Firmware 33, type: STD FW MTS (0x00000004), id: PAL/I (0000000000000010), size: 169
461 #
462
463 write_le32(0x00000004); # Type
464 write_le64(0x00000000, 0x00000010); # ID
465 write_le32(169); # Size
466 write_hunk_fix_endian(301856, 169);
467
468 #
469 # Firmware 34, type: STD FW (0x00000000), id: SECAM/L AM (0000001000400000), size: 169
470 #
471
472 #
473 # Firmware 35, type: STD FW (0x00000000), id: SECAM/L NICAM (0000000c00400000), size: 161
474 #
475
476 write_le32(0x00000000); # Type
477 write_le64(0x0000000c, 0x00400000); # ID
478 write_le32(161); # Size
479 write_hunk_fix_endian(302032, 161);
480
481 #
482 # Firmware 36, type: STD FW (0x00000000), id: SECAM/Lc (0000000000800000), size: 161
483 #
484
485 write_le32(0x00000000); # Type
486 write_le64(0x00000000, 0x00800000); # ID
487 write_le32(161); # Size
488 write_hunk_fix_endian(302200, 161);
489
490 #
491 # Firmware 37, type: STD FW (0x00000000), id: NTSC/M Kr (0000000000008000), size: 161
492 #
493
494 write_le32(0x00000000); # Type
495 write_le64(0x00000000, 0x00008000); # ID
496 write_le32(161); # Size
497 write_hunk_fix_endian(302368, 161);
498
499 #
500 # Firmware 38, type: STD FW LCD (0x00001000), id: NTSC/M Kr (0000000000008000), size: 161
501 #
502
503 write_le32(0x00001000); # Type
504 write_le64(0x00000000, 0x00008000); # ID
505 write_le32(161); # Size
506 write_hunk_fix_endian(302536, 161);
507
508 #
509 # Firmware 39, type: STD FW LCD NOGD (0x00003000), id: NTSC/M Kr (0000000000008000), size: 161
510 #
511
512 write_le32(0x00003000); # Type
513 write_le64(0x00000000, 0x00008000); # ID
514 write_le32(161); # Size
515 write_hunk_fix_endian(302704, 161);
516
517 #
518 # Firmware 40, type: STD FW MTS (0x00000004), id: NTSC/M Kr (0000000000008000), size: 169
519 #
520
521 write_le32(0x00000004); # Type
522 write_le64(0x00000000, 0x00008000); # ID
523 write_le32(169); # Size
524 write_hunk_fix_endian(302872, 169);
525
526 #
527 # Firmware 41, type: STD FW (0x00000000), id: NTSC PAL/M PAL/N (000000000000b700), size: 161
528 #
529
530 write_le32(0x00000000); # Type
531 write_le64(0x00000000, 0x0000b700); # ID
532 write_le32(161); # Size
533 write_hunk_fix_endian(303048, 161);
534
535 #
536 # Firmware 42, type: STD FW LCD (0x00001000), id: NTSC PAL/M PAL/N (000000000000b700), size: 161
537 #
538
539 write_le32(0x00001000); # Type
540 write_le64(0x00000000, 0x0000b700); # ID
541 write_le32(161); # Size
542 write_hunk_fix_endian(303216, 161);
543
544 #
545 # Firmware 43, type: STD FW LCD NOGD (0x00003000), id: NTSC PAL/M PAL/N (000000000000b700), size: 161
546 #
547
548 write_le32(0x00003000); # Type
549 write_le64(0x00000000, 0x0000b700); # ID
550 write_le32(161); # Size
551 write_hunk_fix_endian(303384, 161);
552
553 #
554 # Firmware 44, type: STD FW (0x00000000), id: NTSC/M Jp (0000000000002000), size: 161
555 #
556
557 write_le32(0x00000000); # Type
558 write_le64(0x00000000, 0x00002000); # ID
559 write_le32(161); # Size
560 write_hunk_fix_endian(303552, 161);
561
562 #
563 # Firmware 45, type: STD FW MTS (0x00000004), id: NTSC PAL/M PAL/N (000000000000b700), size: 169
564 #
565
566 write_le32(0x00000004); # Type
567 write_le64(0x00000000, 0x0000b700); # ID
568 write_le32(169); # Size
569 write_hunk_fix_endian(303720, 169);
570
571 #
572 # Firmware 46, type: STD FW MTS LCD (0x00001004), id: NTSC PAL/M PAL/N (000000000000b700), size: 169
573 #
574
575 write_le32(0x00001004); # Type
576 write_le64(0x00000000, 0x0000b700); # ID
577 write_le32(169); # Size
578 write_hunk_fix_endian(303896, 169);
579
580 #
581 # Firmware 47, type: STD FW MTS LCD NOGD (0x00003004), id: NTSC PAL/M PAL/N (000000000000b700), size: 169
582 #
583
584 write_le32(0x00003004); # Type
585 write_le64(0x00000000, 0x0000b700); # ID
586 write_le32(169); # Size
587 write_hunk_fix_endian(304072, 169);
588
589 #
590 # Firmware 48, type: SCODE FW HAS IF (0x60000000), IF = 3.28 MHz id: (0000000000000000), size: 192
591 #
592
593 write_le32(0x60000000); # Type
594 write_le64(0x00000000, 0x00000000); # ID
595 write_le16(3280); # IF
596 write_le32(192); # Size
597 write_hunk(309048, 192);
598
599 #
600 # Firmware 49, type: SCODE FW HAS IF (0x60000000), IF = 3.30 MHz id: (0000000000000000), size: 192
601 #
602
603# write_le32(0x60000000); # Type
604# write_le64(0x00000000, 0x00000000); # ID
605# write_le16(3300); # IF
606# write_le32(192); # Size
607# write_hunk(304440, 192);
608
609 #
610 # Firmware 50, type: SCODE FW HAS IF (0x60000000), IF = 3.44 MHz id: (0000000000000000), size: 192
611 #
612
613 write_le32(0x60000000); # Type
614 write_le64(0x00000000, 0x00000000); # ID
615 write_le16(3440); # IF
616 write_le32(192); # Size
617 write_hunk(309432, 192);
618
619 #
620 # Firmware 51, type: SCODE FW HAS IF (0x60000000), IF = 3.46 MHz id: (0000000000000000), size: 192
621 #
622
623 write_le32(0x60000000); # Type
624 write_le64(0x00000000, 0x00000000); # ID
625 write_le16(3460); # IF
626 write_le32(192); # Size
627 write_hunk(309624, 192);
628
629 #
630 # Firmware 52, type: SCODE FW DTV6 ATSC OREN36 HAS IF (0x60210020), IF = 3.80 MHz id: (0000000000000000), size: 192
631 #
632
633 write_le32(0x60210020); # Type
634 write_le64(0x00000000, 0x00000000); # ID
635 write_le16(3800); # IF
636 write_le32(192); # Size
637 write_hunk(306936, 192);
638
639 #
640 # Firmware 53, type: SCODE FW HAS IF (0x60000000), IF = 4.00 MHz id: (0000000000000000), size: 192
641 #
642
643 write_le32(0x60000000); # Type
644 write_le64(0x00000000, 0x00000000); # ID
645 write_le16(4000); # IF
646 write_le32(192); # Size
647 write_hunk(309240, 192);
648
649 #
650 # Firmware 54, type: SCODE FW DTV6 ATSC TOYOTA388 HAS IF (0x60410020), IF = 4.08 MHz id: (0000000000000000), size: 192
651 #
652
653 write_le32(0x60410020); # Type
654 write_le64(0x00000000, 0x00000000); # ID
655 write_le16(4080); # IF
656 write_le32(192); # Size
657 write_hunk(307128, 192);
658
659 #
660 # Firmware 55, type: SCODE FW HAS IF (0x60000000), IF = 4.20 MHz id: (0000000000000000), size: 192
661 #
662
663 write_le32(0x60000000); # Type
664 write_le64(0x00000000, 0x00000000); # ID
665 write_le16(4200); # IF
666 write_le32(192); # Size
667 write_hunk(308856, 192);
668
669 #
670 # Firmware 56, type: SCODE FW MONO HAS IF (0x60008000), IF = 4.32 MHz id: NTSC/M Kr (0000000000008000), size: 192
671 #
672
673 write_le32(0x60008000); # Type
674 write_le64(0x00000000, 0x00008000); # ID
675 write_le16(4320); # IF
676 write_le32(192); # Size
677 write_hunk(305208, 192);
678
679 #
680 # Firmware 57, type: SCODE FW HAS IF (0x60000000), IF = 4.45 MHz id: (0000000000000000), size: 192
681 #
682
683 write_le32(0x60000000); # Type
684 write_le64(0x00000000, 0x00000000); # ID
685 write_le16(4450); # IF
686 write_le32(192); # Size
687 write_hunk(309816, 192);
688
689 #
690 # Firmware 58, type: SCODE FW MTS LCD NOGD MONO IF HAS IF (0x6002b004), IF = 4.50 MHz id: NTSC PAL/M PAL/N (000000000000b700), size: 192
691 #
692
693 write_le32(0x6002b004); # Type
694 write_le64(0x00000000, 0x0000b700); # ID
695 write_le16(4500); # IF
696 write_le32(192); # Size
697 write_hunk(304824, 192);
698
699 #
700 # Firmware 59, type: SCODE FW LCD NOGD IF HAS IF (0x60023000), IF = 4.60 MHz id: NTSC/M Kr (0000000000008000), size: 192
701 #
702
703 write_le32(0x60023000); # Type
704 write_le64(0x00000000, 0x00008000); # ID
705 write_le16(4600); # IF
706 write_le32(192); # Size
707 write_hunk(305016, 192);
708
709 #
710 # Firmware 60, type: SCODE FW DTV6 QAM DTV7 DTV78 DTV8 ZARLINK456 HAS IF (0x620003e0), IF = 4.76 MHz id: (0000000000000000), size: 192
711 #
712
713 write_le32(0x620003e0); # Type
714 write_le64(0x00000000, 0x00000000); # ID
715 write_le16(4760); # IF
716 write_le32(192); # Size
717 write_hunk(304440, 192);
718
719 #
720 # Firmware 61, type: SCODE FW HAS IF (0x60000000), IF = 4.94 MHz id: (0000000000000000), size: 192
721 #
722
723 write_le32(0x60000000); # Type
724 write_le64(0x00000000, 0x00000000); # ID
725 write_le16(4940); # IF
726 write_le32(192); # Size
727 write_hunk(308664, 192);
728
729 #
730 # Firmware 62, type: SCODE FW HAS IF (0x60000000), IF = 5.26 MHz id: (0000000000000000), size: 192
731 #
732
733 write_le32(0x60000000); # Type
734 write_le64(0x00000000, 0x00000000); # ID
735 write_le16(5260); # IF
736 write_le32(192); # Size
737 write_hunk(307704, 192);
738
739 #
740 # Firmware 63, type: SCODE FW MONO HAS IF (0x60008000), IF = 5.32 MHz id: PAL/BG A2 NICAM (0000000f00000007), size: 192
741 #
742
743 write_le32(0x60008000); # Type
744 write_le64(0x0000000f, 0x00000007); # ID
745 write_le16(5320); # IF
746 write_le32(192); # Size
747 write_hunk(307896, 192);
748
749 #
750 # Firmware 64, type: SCODE FW DTV7 DTV78 DTV8 DIBCOM52 CHINA HAS IF (0x65000380), IF = 5.40 MHz id: (0000000000000000), size: 192
751 #
752
753 write_le32(0x65000380); # Type
754 write_le64(0x00000000, 0x00000000); # ID
755 write_le16(5400); # IF
756 write_le32(192); # Size
757 write_hunk(304248, 192);
758
759 #
760 # Firmware 65, type: SCODE FW DTV6 ATSC OREN538 HAS IF (0x60110020), IF = 5.58 MHz id: (0000000000000000), size: 192
761 #
762
763 write_le32(0x60110020); # Type
764 write_le64(0x00000000, 0x00000000); # ID
765 write_le16(5580); # IF
766 write_le32(192); # Size
767 write_hunk(306744, 192);
768
769 #
770 # Firmware 66, type: SCODE FW HAS IF (0x60000000), IF = 5.64 MHz id: PAL/BG A2 (0000000300000007), size: 192
771 #
772
773 write_le32(0x60000000); # Type
774 write_le64(0x00000003, 0x00000007); # ID
775 write_le16(5640); # IF
776 write_le32(192); # Size
777 write_hunk(305592, 192);
778
779 #
780 # Firmware 67, type: SCODE FW HAS IF (0x60000000), IF = 5.74 MHz id: PAL/BG NICAM (0000000c00000007), size: 192
781 #
782
783 write_le32(0x60000000); # Type
784 write_le64(0x0000000c, 0x00000007); # ID
785 write_le16(5740); # IF
786 write_le32(192); # Size
787 write_hunk(305784, 192);
788
789 #
790 # Firmware 68, type: SCODE FW HAS IF (0x60000000), IF = 5.90 MHz id: (0000000000000000), size: 192
791 #
792
793 write_le32(0x60000000); # Type
794 write_le64(0x00000000, 0x00000000); # ID
795 write_le16(5900); # IF
796 write_le32(192); # Size
797 write_hunk(307512, 192);
798
799 #
800 # Firmware 69, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.00 MHz id: PAL/DK PAL/I SECAM/K3 SECAM/L SECAM/Lc NICAM (0000000c04c000f0), size: 192
801 #
802
803 write_le32(0x60008000); # Type
804 write_le64(0x0000000c, 0x04c000f0); # ID
805 write_le16(6000); # IF
806 write_le32(192); # Size
807 write_hunk(305576, 192);
808
809 #
810 # Firmware 70, type: SCODE FW DTV6 QAM ATSC LG60 F6MHZ HAS IF (0x68050060), IF = 6.20 MHz id: (0000000000000000), size: 192
811 #
812
813 write_le32(0x68050060); # Type
814 write_le64(0x00000000, 0x00000000); # ID
815 write_le16(6200); # IF
816 write_le32(192); # Size
817 write_hunk(306552, 192);
818
819 #
820 # Firmware 71, type: SCODE FW HAS IF (0x60000000), IF = 6.24 MHz id: PAL/I (0000000000000010), size: 192
821 #
822
823 write_le32(0x60000000); # Type
824 write_le64(0x00000000, 0x00000010); # ID
825 write_le16(6240); # IF
826 write_le32(192); # Size
827 write_hunk(305400, 192);
828
829 #
830 # Firmware 72, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.32 MHz id: SECAM/K1 (0000000000200000), size: 192
831 #
832
833 write_le32(0x60008000); # Type
834 write_le64(0x00000000, 0x00200000); # ID
835 write_le16(6320); # IF
836 write_le32(192); # Size
837 write_hunk(308472, 192);
838
839 #
840 # Firmware 73, type: SCODE FW HAS IF (0x60000000), IF = 6.34 MHz id: SECAM/K1 (0000000000200000), size: 192
841 #
842
843 write_le32(0x60000000); # Type
844 write_le64(0x00000000, 0x00200000); # ID
845 write_le16(6340); # IF
846 write_le32(192); # Size
847 write_hunk(306360, 192);
848
849 #
850 # Firmware 74, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.50 MHz id: PAL/DK SECAM/K3 SECAM/L NICAM (0000000c044000e0), size: 192
851 #
852
853 write_le32(0x60008000); # Type
854 write_le64(0x0000000c, 0x044000e0); # ID
855 write_le16(6500); # IF
856 write_le32(192); # Size
857 write_hunk(308280, 192);
858
859 #
860 # Firmware 75, type: SCODE FW DTV6 ATSC ATI638 HAS IF (0x60090020), IF = 6.58 MHz id: (0000000000000000), size: 192
861 #
862
863 write_le32(0x60090020); # Type
864 write_le64(0x00000000, 0x00000000); # ID
865 write_le16(6580); # IF
866 write_le32(192); # Size
867 write_hunk(304632, 192);
868
869 #
870 # Firmware 76, type: SCODE FW HAS IF (0x60000000), IF = 6.60 MHz id: PAL/DK A2 (00000003000000e0), size: 192
871 #
872
873 write_le32(0x60000000); # Type
874 write_le64(0x00000003, 0x000000e0); # ID
875 write_le16(6600); # IF
876 write_le32(192); # Size
877 write_hunk(306168, 192);
878
879 #
880 # Firmware 77, type: SCODE FW MONO HAS IF (0x60008000), IF = 6.68 MHz id: PAL/DK A2 (00000003000000e0), size: 192
881 #
882
883 write_le32(0x60008000); # Type
884 write_le64(0x00000003, 0x000000e0); # ID
885 write_le16(6680); # IF
886 write_le32(192); # Size
887 write_hunk(308088, 192);
888
889 #
890 # Firmware 78, type: SCODE FW DTV6 ATSC TOYOTA794 HAS IF (0x60810020), IF = 8.14 MHz id: (0000000000000000), size: 192
891 #
892
893 write_le32(0x60810020); # Type
894 write_le64(0x00000000, 0x00000000); # ID
895 write_le16(8140); # IF
896 write_le32(192); # Size
897 write_hunk(307320, 192);
898
899 #
900 # Firmware 79, type: SCODE FW HAS IF (0x60000000), IF = 8.20 MHz id: (0000000000000000), size: 192
901 #
902
903# write_le32(0x60000000); # Type
904# write_le64(0x00000000, 0x00000000); # ID
905# write_le16(8200); # IF
906# write_le32(192); # Size
907# write_hunk(308088, 192);
149} 908}
150 909
910sub main_firmware_27($$$$)
911{
912 my $out;
913 my $j=0;
914 my $outfile = shift;
915 my $name = shift;
916 my $version = shift;
917 my $nr_desc = shift;
918
919 for ($j = length($name); $j <32; $j++) {
920 $name = $name.chr(0);
921 }
922
151 open OUTFILE, ">$outfile"; 923 open OUTFILE, ">$outfile";
152 syswrite(OUTFILE, $name); 924 syswrite(OUTFILE, $name);
153 write_le16($version); 925 write_le16($version);
@@ -906,20 +1678,39 @@ sub main_firmware($$$$)
906 write_hunk(812856, 192); 1678 write_hunk(812856, 192);
907} 1679}
908 1680
1681
909sub extract_firmware { 1682sub extract_firmware {
910 my $sourcefile = "hcw85bda.sys"; 1683 my $sourcefile_24 = "UDXTTM6000.sys";
911 my $hash = "0e44dbf63bb0169d57446aec21881ff2"; 1684 my $hash_24 = "cb9deb5508a5e150af2880f5b0066d78";
912 my $outfile = "xc3028-v27.fw"; 1685 my $outfile_24 = "xc3028-v24.fw";
913 my $name = "xc2028 firmware"; 1686 my $name_24 = "xc2028 firmware";
914 my $version = 519; 1687 my $version_24 = 516;
915 my $nr_desc = 80; 1688 my $nr_desc_24 = 77;
1689 my $out;
1690
1691 my $sourcefile_27 = "hcw85bda.sys";
1692 my $hash_27 = "0e44dbf63bb0169d57446aec21881ff2";
1693 my $outfile_27 = "xc3028-v27.fw";
1694 my $name_27 = "xc2028 firmware";
1695 my $version_27 = 519;
1696 my $nr_desc_27 = 80;
916 my $out; 1697 my $out;
917 1698
918 verify($sourcefile, $hash); 1699 if (-e $sourcefile_24) {
1700 verify($sourcefile_24, $hash_24);
1701
1702 open INFILE, "<$sourcefile_24";
1703 main_firmware_24($outfile_24, $name_24, $version_24, $nr_desc_24);
1704 close INFILE;
1705 }
919 1706
920 open INFILE, "<$sourcefile"; 1707 if (-e $sourcefile_27) {
921 main_firmware($outfile, $name, $version, $nr_desc); 1708 verify($sourcefile_27, $hash_27);
922 close INFILE; 1709
1710 open INFILE, "<$sourcefile_27";
1711 main_firmware_27($outfile_27, $name_27, $version_27, $nr_desc_27);
1712 close INFILE;
1713 }
923} 1714}
924 1715
925extract_firmware; 1716extract_firmware;
diff --git a/Documentation/video4linux/gspca.txt b/Documentation/video4linux/gspca.txt
index 1800a62cf135..8f3f5d33327c 100644
--- a/Documentation/video4linux/gspca.txt
+++ b/Documentation/video4linux/gspca.txt
@@ -42,6 +42,7 @@ ov519 041e:4064 Creative Live! VISTA VF0420
42ov519 041e:4067 Creative Live! Cam Video IM (VF0350) 42ov519 041e:4067 Creative Live! Cam Video IM (VF0350)
43ov519 041e:4068 Creative Live! VISTA VF0470 43ov519 041e:4068 Creative Live! VISTA VF0470
44spca561 0458:7004 Genius VideoCAM Express V2 44spca561 0458:7004 Genius VideoCAM Express V2
45sn9c2028 0458:7005 Genius Smart 300, version 2
45sunplus 0458:7006 Genius Dsc 1.3 Smart 46sunplus 0458:7006 Genius Dsc 1.3 Smart
46zc3xx 0458:7007 Genius VideoCam V2 47zc3xx 0458:7007 Genius VideoCam V2
47zc3xx 0458:700c Genius VideoCam V3 48zc3xx 0458:700c Genius VideoCam V3
@@ -49,6 +50,8 @@ zc3xx 0458:700f Genius VideoCam Web V2
49sonixj 0458:7025 Genius Eye 311Q 50sonixj 0458:7025 Genius Eye 311Q
50sn9c20x 0458:7029 Genius Look 320s 51sn9c20x 0458:7029 Genius Look 320s
51sonixj 0458:702e Genius Slim 310 NB 52sonixj 0458:702e Genius Slim 310 NB
53sn9c20x 0458:704a Genius Slim 1320
54sn9c20x 0458:704c Genius i-Look 1321
52sn9c20x 045e:00f4 LifeCam VX-6000 (SN9C20x + OV9650) 55sn9c20x 045e:00f4 LifeCam VX-6000 (SN9C20x + OV9650)
53sonixj 045e:00f5 MicroSoft VX3000 56sonixj 045e:00f5 MicroSoft VX3000
54sonixj 045e:00f7 MicroSoft VX1000 57sonixj 045e:00f7 MicroSoft VX1000
@@ -109,6 +112,7 @@ sunplus 04a5:3003 Benq DC 1300
109sunplus 04a5:3008 Benq DC 1500 112sunplus 04a5:3008 Benq DC 1500
110sunplus 04a5:300a Benq DC 3410 113sunplus 04a5:300a Benq DC 3410
111spca500 04a5:300c Benq DC 1016 114spca500 04a5:300c Benq DC 1016
115benq 04a5:3035 Benq DC E300
112finepix 04cb:0104 Fujifilm FinePix 4800 116finepix 04cb:0104 Fujifilm FinePix 4800
113finepix 04cb:0109 Fujifilm FinePix A202 117finepix 04cb:0109 Fujifilm FinePix A202
114finepix 04cb:010b Fujifilm FinePix A203 118finepix 04cb:010b Fujifilm FinePix A203
@@ -142,6 +146,7 @@ sunplus 04fc:5360 Sunplus Generic
142spca500 04fc:7333 PalmPixDC85 146spca500 04fc:7333 PalmPixDC85
143sunplus 04fc:ffff Pure DigitalDakota 147sunplus 04fc:ffff Pure DigitalDakota
144spca501 0506:00df 3Com HomeConnect Lite 148spca501 0506:00df 3Com HomeConnect Lite
149sunplus 052b:1507 Megapixel 5 Pretec DC-1007
145sunplus 052b:1513 Megapix V4 150sunplus 052b:1513 Megapix V4
146sunplus 052b:1803 MegaImage VI 151sunplus 052b:1803 MegaImage VI
147tv8532 0545:808b Veo Stingray 152tv8532 0545:808b Veo Stingray
@@ -151,6 +156,7 @@ sunplus 0546:3191 Polaroid Ion 80
151sunplus 0546:3273 Polaroid PDC2030 156sunplus 0546:3273 Polaroid PDC2030
152ov519 054c:0154 Sonny toy4 157ov519 054c:0154 Sonny toy4
153ov519 054c:0155 Sonny toy5 158ov519 054c:0155 Sonny toy5
159cpia1 0553:0002 CPIA CPiA (version1) based cameras
154zc3xx 055f:c005 Mustek Wcam300A 160zc3xx 055f:c005 Mustek Wcam300A
155spca500 055f:c200 Mustek Gsmart 300 161spca500 055f:c200 Mustek Gsmart 300
156sunplus 055f:c211 Kowa Bs888e Microcamera 162sunplus 055f:c211 Kowa Bs888e Microcamera
@@ -188,8 +194,7 @@ spca500 06bd:0404 Agfa CL20
188spca500 06be:0800 Optimedia 194spca500 06be:0800 Optimedia
189sunplus 06d6:0031 Trust 610 LCD PowerC@m Zoom 195sunplus 06d6:0031 Trust 610 LCD PowerC@m Zoom
190spca506 06e1:a190 ADS Instant VCD 196spca506 06e1:a190 ADS Instant VCD
191ov534 06f8:3002 Hercules Blog Webcam 197ov534_9 06f8:3003 Hercules Dualpix HD Weblog
192ov534 06f8:3003 Hercules Dualpix HD Weblog
193sonixj 06f8:3004 Hercules Classic Silver 198sonixj 06f8:3004 Hercules Classic Silver
194sonixj 06f8:3008 Hercules Deluxe Optical Glass 199sonixj 06f8:3008 Hercules Deluxe Optical Glass
195pac7302 06f8:3009 Hercules Classic Link 200pac7302 06f8:3009 Hercules Classic Link
@@ -204,6 +209,7 @@ sunplus 0733:2221 Mercury Digital Pro 3.1p
204sunplus 0733:3261 Concord 3045 spca536a 209sunplus 0733:3261 Concord 3045 spca536a
205sunplus 0733:3281 Cyberpix S550V 210sunplus 0733:3281 Cyberpix S550V
206spca506 0734:043b 3DeMon USB Capture aka 211spca506 0734:043b 3DeMon USB Capture aka
212cpia1 0813:0001 QX3 camera
207ov519 0813:0002 Dual Mode USB Camera Plus 213ov519 0813:0002 Dual Mode USB Camera Plus
208spca500 084d:0003 D-Link DSC-350 214spca500 084d:0003 D-Link DSC-350
209spca500 08ca:0103 Aiptek PocketDV 215spca500 08ca:0103 Aiptek PocketDV
@@ -225,7 +231,8 @@ sunplus 08ca:2050 Medion MD 41437
225sunplus 08ca:2060 Aiptek PocketDV5300 231sunplus 08ca:2060 Aiptek PocketDV5300
226tv8532 0923:010f ICM532 cams 232tv8532 0923:010f ICM532 cams
227mars 093a:050f Mars-Semi Pc-Camera 233mars 093a:050f Mars-Semi Pc-Camera
228mr97310a 093a:010f Sakar Digital no. 77379 234mr97310a 093a:010e All known CIF cams with this ID
235mr97310a 093a:010f All known VGA cams with this ID
229pac207 093a:2460 Qtec Webcam 100 236pac207 093a:2460 Qtec Webcam 100
230pac207 093a:2461 HP Webcam 237pac207 093a:2461 HP Webcam
231pac207 093a:2463 Philips SPC 220 NC 238pac207 093a:2463 Philips SPC 220 NC
@@ -300,11 +307,14 @@ sonixj 0c45:6138 Sn9c120 Mo4000
300sonixj 0c45:613a Microdia Sonix PC Camera 307sonixj 0c45:613a Microdia Sonix PC Camera
301sonixj 0c45:613b Surfer SN-206 308sonixj 0c45:613b Surfer SN-206
302sonixj 0c45:613c Sonix Pccam168 309sonixj 0c45:613c Sonix Pccam168
310sonixj 0c45:6142 Hama PC-Webcam AC-150
303sonixj 0c45:6143 Sonix Pccam168 311sonixj 0c45:6143 Sonix Pccam168
304sonixj 0c45:6148 Digitus DA-70811/ZSMC USB PC Camera ZS211/Microdia 312sonixj 0c45:6148 Digitus DA-70811/ZSMC USB PC Camera ZS211/Microdia
313sonixj 0c45:614a Frontech E-Ccam (JIL-2225)
305sn9c20x 0c45:6240 PC Camera (SN9C201 + MT9M001) 314sn9c20x 0c45:6240 PC Camera (SN9C201 + MT9M001)
306sn9c20x 0c45:6242 PC Camera (SN9C201 + MT9M111) 315sn9c20x 0c45:6242 PC Camera (SN9C201 + MT9M111)
307sn9c20x 0c45:6248 PC Camera (SN9C201 + OV9655) 316sn9c20x 0c45:6248 PC Camera (SN9C201 + OV9655)
317sn9c20x 0c45:624c PC Camera (SN9C201 + MT9M112)
308sn9c20x 0c45:624e PC Camera (SN9C201 + SOI968) 318sn9c20x 0c45:624e PC Camera (SN9C201 + SOI968)
309sn9c20x 0c45:624f PC Camera (SN9C201 + OV9650) 319sn9c20x 0c45:624f PC Camera (SN9C201 + OV9650)
310sn9c20x 0c45:6251 PC Camera (SN9C201 + OV9650) 320sn9c20x 0c45:6251 PC Camera (SN9C201 + OV9650)
@@ -317,6 +327,7 @@ sn9c20x 0c45:627f PC Camera (SN9C201 + OV9650)
317sn9c20x 0c45:6280 PC Camera (SN9C202 + MT9M001) 327sn9c20x 0c45:6280 PC Camera (SN9C202 + MT9M001)
318sn9c20x 0c45:6282 PC Camera (SN9C202 + MT9M111) 328sn9c20x 0c45:6282 PC Camera (SN9C202 + MT9M111)
319sn9c20x 0c45:6288 PC Camera (SN9C202 + OV9655) 329sn9c20x 0c45:6288 PC Camera (SN9C202 + OV9655)
330sn9c20x 0c45:628c PC Camera (SN9C201 + MT9M112)
320sn9c20x 0c45:628e PC Camera (SN9C202 + SOI968) 331sn9c20x 0c45:628e PC Camera (SN9C202 + SOI968)
321sn9c20x 0c45:628f PC Camera (SN9C202 + OV9650) 332sn9c20x 0c45:628f PC Camera (SN9C202 + OV9650)
322sn9c20x 0c45:62a0 PC Camera (SN9C202 + OV7670) 333sn9c20x 0c45:62a0 PC Camera (SN9C202 + OV7670)
@@ -324,6 +335,10 @@ sn9c20x 0c45:62b0 PC Camera (SN9C202 + MT9V011/MT9V111/MT9V112)
324sn9c20x 0c45:62b3 PC Camera (SN9C202 + OV9655) 335sn9c20x 0c45:62b3 PC Camera (SN9C202 + OV9655)
325sn9c20x 0c45:62bb PC Camera (SN9C202 + OV7660) 336sn9c20x 0c45:62bb PC Camera (SN9C202 + OV7660)
326sn9c20x 0c45:62bc PC Camera (SN9C202 + HV7131R) 337sn9c20x 0c45:62bc PC Camera (SN9C202 + HV7131R)
338sn9c2028 0c45:8001 Wild Planet Digital Spy Camera
339sn9c2028 0c45:8003 Sakar #11199, #6637x, #67480 keychain cams
340sn9c2028 0c45:8008 Mini-Shotz ms-350
341sn9c2028 0c45:800a Vivitar Vivicam 3350B
327sunplus 0d64:0303 Sunplus FashionCam DXG 342sunplus 0d64:0303 Sunplus FashionCam DXG
328ov519 0e96:c001 TRUST 380 USB2 SPACEC@M 343ov519 0e96:c001 TRUST 380 USB2 SPACEC@M
329etoms 102c:6151 Qcam Sangha CIF 344etoms 102c:6151 Qcam Sangha CIF
@@ -341,10 +356,11 @@ spca501 1776:501c Arowana 300K CMOS Camera
341t613 17a1:0128 TASCORP JPEG Webcam, NGS Cyclops 356t613 17a1:0128 TASCORP JPEG Webcam, NGS Cyclops
342vc032x 17ef:4802 Lenovo Vc0323+MI1310_SOC 357vc032x 17ef:4802 Lenovo Vc0323+MI1310_SOC
343pac207 2001:f115 D-Link DSB-C120 358pac207 2001:f115 D-Link DSB-C120
344sq905c 2770:9050 sq905c 359sq905c 2770:9050 Disney pix micro (CIF)
345sq905c 2770:905c DualCamera 360sq905c 2770:9052 Disney pix micro 2 (VGA)
346sq905 2770:9120 Argus Digital Camera DC1512 361sq905c 2770:905c All 11 known cameras with this ID
347sq905c 2770:913d sq905c 362sq905 2770:9120 All 24 known cameras with this ID
363sq905c 2770:913d All 4 known cameras with this ID
348spca500 2899:012c Toptro Industrial 364spca500 2899:012c Toptro Industrial
349ov519 8020:ef04 ov519 365ov519 8020:ef04 ov519
350spca508 8086:0110 Intel Easy PC Camera 366spca508 8086:0110 Intel Easy PC Camera
diff --git a/Documentation/video4linux/sh_mobile_ceu_camera.txt b/Documentation/video4linux/sh_mobile_ceu_camera.txt
index 2ae16349a78d..cb47e723af74 100644
--- a/Documentation/video4linux/sh_mobile_ceu_camera.txt
+++ b/Documentation/video4linux/sh_mobile_ceu_camera.txt
@@ -17,18 +17,18 @@ Generic scaling / cropping scheme
17-2-- -\ 17-2-- -\
18| --\ 18| --\
19| --\ 19| --\
20+-5-- -\ -- -3-- 20+-5-- . -- -3-- -\
21| ---\ 21| `... -\
22| --- -4-- -\ 22| `... -4-- . - -7..
23| -\ 23| `.
24| - -6-- 24| `. .6--
25| 25|
26| - -6'- 26| . .6'-
27| -/ 27| .´
28| --- -4'- -/ 28| ... -4'- .´
29| ---/ 29| ...´ - -7'.
30+-5'- -/ 30+-5'- .´ -/
31| -- -3'- 31| -- -3'- -/
32| --/ 32| --/
33| --/ 33| --/
34-2'- -/ 34-2'- -/
@@ -36,7 +36,11 @@ Generic scaling / cropping scheme
36| 36|
37-1'- 37-1'-
38 38
39Produced by user requests: 39In the above chart minuses and slashes represent "real" data amounts, points and
40accents represent "useful" data, basically, CEU scaled amd cropped output,
41mapped back onto the client's source plane.
42
43Such a configuration can be produced by user requests:
40 44
41S_CROP(left / top = (5) - (1), width / height = (5') - (5)) 45S_CROP(left / top = (5) - (1), width / height = (5') - (5))
42S_FMT(width / height = (6') - (6)) 46S_FMT(width / height = (6') - (6))
@@ -106,52 +110,30 @@ window:
106S_CROP 110S_CROP
107------ 111------
108 112
109If old scale applied to new crop is invalid produce nearest new scale possible 113The API at http://v4l2spec.bytesex.org/spec/x1904.htm says:
110
1111. Calculate current combined scales.
112
113 scale_comb = (((4') - (4)) / ((6') - (6))) * (((2') - (2)) / ((3') - (3)))
114
1152. Apply iterative sensor S_CROP for new input window.
116
1173. If old combined scales applied to new crop produce an impossible user window,
118adjust scales to produce nearest possible window.
119
120 width_u_out = ((5') - (5)) / scale_comb
121 114
122 if (width_u_out > max) 115"...specification does not define an origin or units. However by convention
123 scale_comb = ((5') - (5)) / max; 116drivers should horizontally count unscaled samples relative to 0H."
124 else if (width_u_out < min)
125 scale_comb = ((5') - (5)) / min;
126 117
1274. Issue G_CROP to retrieve actual input window. 118We choose to follow the advise and interpret cropping units as client input
119pixels.
128 120
1295. Using actual input window and calculated combined scales calculate sensor 121Cropping is performed in the following 6 steps:
130target output window.
131
132 width_s_out = ((3') - (3)) = ((2') - (2)) / scale_comb
133
1346. Apply iterative S_FMT for new sensor target output window.
135
1367. Issue G_FMT to retrieve the actual sensor output window.
137
1388. Calculate sensor scales.
139
140 scale_s = ((3') - (3)) / ((2') - (2))
141 122
1429. Calculate sensor output subwindow to be cropped on CEU by applying sensor 1231. Request exactly user rectangle from the sensor.
143scales to the requested window.
144 124
145 width_ceu = ((5') - (5)) / scale_s 1252. If smaller - iterate until a larger one is obtained. Result: sensor cropped
126 to 2 : 2', target crop 5 : 5', current output format 6' - 6.
146 127
14710. Use CEU cropping for above calculated window. 1283. In the previous step the sensor has tried to preserve its output frame as
129 good as possible, but it could have changed. Retrieve it again.
148 130
14911. Calculate CEU scales from sensor scales from results of (10) and user window 1314. Sensor scaled to 3 : 3'. Sensor's scale is (2' - 2) / (3' - 3). Calculate
150from (3) 132 intermediate window: 4' - 4 = (5' - 5) * (3' - 3) / (2' - 2)
151 133
152 scale_ceu = calc_scale(((5') - (5)), &width_u_out) 1345. Calculate and apply host scale = (6' - 6) / (4' - 4)
153 135
15412. Apply CEU scales. 1366. Calculate and apply host crop: 6 - 7 = (5 - 2) * (6' - 6) / (5' - 5)
155 137
156-- 138--
157Author: Guennadi Liakhovetski <g.liakhovetski@gmx.de> 139Author: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
diff --git a/Documentation/video4linux/v4l2-framework.txt b/Documentation/video4linux/v4l2-framework.txt
index 74d677c8b036..e831aaca66f8 100644
--- a/Documentation/video4linux/v4l2-framework.txt
+++ b/Documentation/video4linux/v4l2-framework.txt
@@ -545,12 +545,11 @@ unregister them:
545This will remove the device nodes from sysfs (causing udev to remove them 545This will remove the device nodes from sysfs (causing udev to remove them
546from /dev). 546from /dev).
547 547
548After video_unregister_device() returns no new opens can be done. 548After video_unregister_device() returns no new opens can be done. However,
549 549in the case of USB devices some application might still have one of these
550However, in the case of USB devices some application might still have one 550device nodes open. So after the unregister all file operations will return
551of these device nodes open. You should block all new accesses to read, 551an error as well, except for the ioctl and unlocked_ioctl file operations:
552write, poll, etc. except possibly for certain ioctl operations like 552those will still be passed on since some buffer ioctls may still be needed.
553queueing buffers.
554 553
555When the last user of the video device node exits, then the vdev->release() 554When the last user of the video device node exits, then the vdev->release()
556callback is called and you can do the final cleanup there. 555callback is called and you can do the final cleanup there.
@@ -599,99 +598,145 @@ video_device::minor fields.
599video buffer helper functions 598video buffer helper functions
600----------------------------- 599-----------------------------
601 600
602The v4l2 core API provides a standard method for dealing with video 601The v4l2 core API provides a set of standard methods (called "videobuf")
603buffers. Those methods allow a driver to implement read(), mmap() and 602for dealing with video buffers. Those methods allow a driver to implement
604overlay() on a consistent way. 603read(), mmap() and overlay() in a consistent way. There are currently
605 604methods for using video buffers on devices that supports DMA with
606There are currently methods for using video buffers on devices that 605scatter/gather method (videobuf-dma-sg), DMA with linear access
607supports DMA with scatter/gather method (videobuf-dma-sg), DMA with 606(videobuf-dma-contig), and vmalloced buffers, mostly used on USB drivers
608linear access (videobuf-dma-contig), and vmalloced buffers, mostly 607(videobuf-vmalloc).
609used on USB drivers (videobuf-vmalloc). 608
610 609Please see Documentation/video4linux/videobuf for more information on how
611Any driver using videobuf should provide operations (callbacks) for 610to use the videobuf layer.
612four handlers: 611
613 612struct v4l2_fh
614ops->buf_setup - calculates the size of the video buffers and avoid they 613--------------
615 to waste more than some maximum limit of RAM; 614
616ops->buf_prepare - fills the video buffer structs and calls 615struct v4l2_fh provides a way to easily keep file handle specific data
617 videobuf_iolock() to alloc and prepare mmaped memory; 616that is used by the V4L2 framework. Using v4l2_fh is optional for
618ops->buf_queue - advices the driver that another buffer were 617drivers.
619 requested (by read() or by QBUF); 618
620ops->buf_release - frees any buffer that were allocated. 619The users of v4l2_fh (in the V4L2 framework, not the driver) know
621 620whether a driver uses v4l2_fh as its file->private_data pointer by
622In order to use it, the driver need to have a code (generally called at 621testing the V4L2_FL_USES_V4L2_FH bit in video_device->flags.
623interrupt context) that will properly handle the buffer request lists, 622
624announcing that a new buffer were filled. 623Useful functions:
625 624
626The irq handling code should handle the videobuf task lists, in order 625- v4l2_fh_init()
627to advice videobuf that a new frame were filled, in order to honor to a 626
628request. The code is generally like this one: 627 Initialise the file handle. This *MUST* be performed in the driver's
629 if (list_empty(&dma_q->active)) 628 v4l2_file_operations->open() handler.
630 return; 629
631 630- v4l2_fh_add()
632 buf = list_entry(dma_q->active.next, struct vbuffer, vb.queue); 631
633 632 Add a v4l2_fh to video_device file handle list. May be called after
634 if (!waitqueue_active(&buf->vb.done)) 633 initialising the file handle.
635 return; 634
636 635- v4l2_fh_del()
637 /* Some logic to handle the buf may be needed here */ 636
638 637 Unassociate the file handle from video_device(). The file handle
639 list_del(&buf->vb.queue); 638 exit function may now be called.
640 do_gettimeofday(&buf->vb.ts); 639
641 wake_up(&buf->vb.done); 640- v4l2_fh_exit()
642 641
643Those are the videobuffer functions used on drivers, implemented on 642 Uninitialise the file handle. After uninitialisation the v4l2_fh
644videobuf-core: 643 memory can be freed.
645 644
646- Videobuf init functions 645struct v4l2_fh is allocated as a part of the driver's own file handle
647 videobuf_queue_sg_init() 646structure and is set to file->private_data in the driver's open
648 Initializes the videobuf infrastructure. This function should be 647function by the driver. Drivers can extract their own file handle
649 called before any other videobuf function on drivers that uses DMA 648structure by using the container_of macro. Example:
650 Scatter/Gather buffers. 649
651 650struct my_fh {
652 videobuf_queue_dma_contig_init 651 int blah;
653 Initializes the videobuf infrastructure. This function should be 652 struct v4l2_fh fh;
654 called before any other videobuf function on drivers that need DMA 653};
655 contiguous buffers. 654
656 655...
657 videobuf_queue_vmalloc_init() 656
658 Initializes the videobuf infrastructure. This function should be 657int my_open(struct file *file)
659 called before any other videobuf function on USB (and other drivers) 658{
660 that need a vmalloced type of videobuf. 659 struct my_fh *my_fh;
661 660 struct video_device *vfd;
662- videobuf_iolock() 661 int ret;
663 Prepares the videobuf memory for the proper method (read, mmap, overlay). 662
664 663 ...
665- videobuf_queue_is_busy() 664
666 Checks if a videobuf is streaming. 665 ret = v4l2_fh_init(&my_fh->fh, vfd);
667 666 if (ret)
668- videobuf_queue_cancel() 667 return ret;
669 Stops video handling. 668
670 669 v4l2_fh_add(&my_fh->fh);
671- videobuf_mmap_free() 670
672 frees mmap buffers. 671 file->private_data = &my_fh->fh;
673 672
674- videobuf_stop() 673 ...
675 Stops video handling, ends mmap and frees mmap and other buffers. 674}
676 675
677- V4L2 api functions. Those functions correspond to VIDIOC_foo ioctls: 676int my_release(struct file *file)
678 videobuf_reqbufs(), videobuf_querybuf(), videobuf_qbuf(), 677{
679 videobuf_dqbuf(), videobuf_streamon(), videobuf_streamoff(). 678 struct v4l2_fh *fh = file->private_data;
680 679 struct my_fh *my_fh = container_of(fh, struct my_fh, fh);
681- V4L1 api function (corresponds to VIDIOCMBUF ioctl): 680
682 videobuf_cgmbuf() 681 ...
683 This function is used to provide backward compatibility with V4L1 682}
684 API. 683
685 684V4L2 events
686- Some help functions for read()/poll() operations: 685-----------
687 videobuf_read_stream() 686
688 For continuous stream read() 687The V4L2 events provide a generic way to pass events to user space.
689 videobuf_read_one() 688The driver must use v4l2_fh to be able to support V4L2 events.
690 For snapshot read() 689
691 videobuf_poll_stream() 690Useful functions:
692 polling help function 691
693 692- v4l2_event_alloc()
694The better way to understand it is to take a look at vivi driver. One 693
695of the main reasons for vivi is to be a videobuf usage example. the 694 To use events, the driver must allocate events for the file handle. By
696vivi_thread_tick() does the task that the IRQ callback would do on PCI 695 calling the function more than once, the driver may assure that at least n
697drivers (or the irq callback on USB). 696 events in total have been allocated. The function may not be called in
697 atomic context.
698
699- v4l2_event_queue()
700
701 Queue events to video device. The driver's only responsibility is to fill
702 in the type and the data fields. The other fields will be filled in by
703 V4L2.
704
705- v4l2_event_subscribe()
706
707 The video_device->ioctl_ops->vidioc_subscribe_event must check the driver
708 is able to produce events with specified event id. Then it calls
709 v4l2_event_subscribe() to subscribe the event.
710
711- v4l2_event_unsubscribe()
712
713 vidioc_unsubscribe_event in struct v4l2_ioctl_ops. A driver may use
714 v4l2_event_unsubscribe() directly unless it wants to be involved in
715 unsubscription process.
716
717 The special type V4L2_EVENT_ALL may be used to unsubscribe all events. The
718 drivers may want to handle this in a special way.
719
720- v4l2_event_pending()
721
722 Returns the number of pending events. Useful when implementing poll.
723
724Drivers do not initialise events directly. The events are initialised
725through v4l2_fh_init() if video_device->ioctl_ops->vidioc_subscribe_event is
726non-NULL. This *MUST* be performed in the driver's
727v4l2_file_operations->open() handler.
728
729Events are delivered to user space through the poll system call. The driver
730can use v4l2_fh->events->wait wait_queue_head_t as the argument for
731poll_wait().
732
733There are standard and private events. New standard events must use the
734smallest available event type. The drivers must allocate their events from
735their own class starting from class base. Class base is
736V4L2_EVENT_PRIVATE_START + n * 1000 where n is the lowest available number.
737The first event type in the class is reserved for future use, so the first
738available event type is 'class base + 1'.
739
740An example on how the V4L2 events may be used can be found in the OMAP
7413 ISP driver available at <URL:http://gitorious.org/omap3camera> as of
742writing this.
diff --git a/Documentation/video4linux/videobuf b/Documentation/video4linux/videobuf
new file mode 100644
index 000000000000..17a1f9abf260
--- /dev/null
+++ b/Documentation/video4linux/videobuf
@@ -0,0 +1,360 @@
1An introduction to the videobuf layer
2Jonathan Corbet <corbet@lwn.net>
3Current as of 2.6.33
4
5The videobuf layer functions as a sort of glue layer between a V4L2 driver
6and user space. It handles the allocation and management of buffers for
7the storage of video frames. There is a set of functions which can be used
8to implement many of the standard POSIX I/O system calls, including read(),
9poll(), and, happily, mmap(). Another set of functions can be used to
10implement the bulk of the V4L2 ioctl() calls related to streaming I/O,
11including buffer allocation, queueing and dequeueing, and streaming
12control. Using videobuf imposes a few design decisions on the driver
13author, but the payback comes in the form of reduced code in the driver and
14a consistent implementation of the V4L2 user-space API.
15
16Buffer types
17
18Not all video devices use the same kind of buffers. In fact, there are (at
19least) three common variations:
20
21 - Buffers which are scattered in both the physical and (kernel) virtual
22 address spaces. (Almost) all user-space buffers are like this, but it
23 makes great sense to allocate kernel-space buffers this way as well when
24 it is possible. Unfortunately, it is not always possible; working with
25 this kind of buffer normally requires hardware which can do
26 scatter/gather DMA operations.
27
28 - Buffers which are physically scattered, but which are virtually
29 contiguous; buffers allocated with vmalloc(), in other words. These
30 buffers are just as hard to use for DMA operations, but they can be
31 useful in situations where DMA is not available but virtually-contiguous
32 buffers are convenient.
33
34 - Buffers which are physically contiguous. Allocation of this kind of
35 buffer can be unreliable on fragmented systems, but simpler DMA
36 controllers cannot deal with anything else.
37
38Videobuf can work with all three types of buffers, but the driver author
39must pick one at the outset and design the driver around that decision.
40
41[It's worth noting that there's a fourth kind of buffer: "overlay" buffers
42which are located within the system's video memory. The overlay
43functionality is considered to be deprecated for most use, but it still
44shows up occasionally in system-on-chip drivers where the performance
45benefits merit the use of this technique. Overlay buffers can be handled
46as a form of scattered buffer, but there are very few implementations in
47the kernel and a description of this technique is currently beyond the
48scope of this document.]
49
50Data structures, callbacks, and initialization
51
52Depending on which type of buffers are being used, the driver should
53include one of the following files:
54
55 <media/videobuf-dma-sg.h> /* Physically scattered */
56 <media/videobuf-vmalloc.h> /* vmalloc() buffers */
57 <media/videobuf-dma-contig.h> /* Physically contiguous */
58
59The driver's data structure describing a V4L2 device should include a
60struct videobuf_queue instance for the management of the buffer queue,
61along with a list_head for the queue of available buffers. There will also
62need to be an interrupt-safe spinlock which is used to protect (at least)
63the queue.
64
65The next step is to write four simple callbacks to help videobuf deal with
66the management of buffers:
67
68 struct videobuf_queue_ops {
69 int (*buf_setup)(struct videobuf_queue *q,
70 unsigned int *count, unsigned int *size);
71 int (*buf_prepare)(struct videobuf_queue *q,
72 struct videobuf_buffer *vb,
73 enum v4l2_field field);
74 void (*buf_queue)(struct videobuf_queue *q,
75 struct videobuf_buffer *vb);
76 void (*buf_release)(struct videobuf_queue *q,
77 struct videobuf_buffer *vb);
78 };
79
80buf_setup() is called early in the I/O process, when streaming is being
81initiated; its purpose is to tell videobuf about the I/O stream. The count
82parameter will be a suggested number of buffers to use; the driver should
83check it for rationality and adjust it if need be. As a practical rule, a
84minimum of two buffers are needed for proper streaming, and there is
85usually a maximum (which cannot exceed 32) which makes sense for each
86device. The size parameter should be set to the expected (maximum) size
87for each frame of data.
88
89Each buffer (in the form of a struct videobuf_buffer pointer) will be
90passed to buf_prepare(), which should set the buffer's size, width, height,
91and field fields properly. If the buffer's state field is
92VIDEOBUF_NEEDS_INIT, the driver should pass it to:
93
94 int videobuf_iolock(struct videobuf_queue* q, struct videobuf_buffer *vb,
95 struct v4l2_framebuffer *fbuf);
96
97Among other things, this call will usually allocate memory for the buffer.
98Finally, the buf_prepare() function should set the buffer's state to
99VIDEOBUF_PREPARED.
100
101When a buffer is queued for I/O, it is passed to buf_queue(), which should
102put it onto the driver's list of available buffers and set its state to
103VIDEOBUF_QUEUED. Note that this function is called with the queue spinlock
104held; if it tries to acquire it as well things will come to a screeching
105halt. Yes, this is the voice of experience. Note also that videobuf may
106wait on the first buffer in the queue; placing other buffers in front of it
107could again gum up the works. So use list_add_tail() to enqueue buffers.
108
109Finally, buf_release() is called when a buffer is no longer intended to be
110used. The driver should ensure that there is no I/O active on the buffer,
111then pass it to the appropriate free routine(s):
112
113 /* Scatter/gather drivers */
114 int videobuf_dma_unmap(struct videobuf_queue *q,
115 struct videobuf_dmabuf *dma);
116 int videobuf_dma_free(struct videobuf_dmabuf *dma);
117
118 /* vmalloc drivers */
119 void videobuf_vmalloc_free (struct videobuf_buffer *buf);
120
121 /* Contiguous drivers */
122 void videobuf_dma_contig_free(struct videobuf_queue *q,
123 struct videobuf_buffer *buf);
124
125One way to ensure that a buffer is no longer under I/O is to pass it to:
126
127 int videobuf_waiton(struct videobuf_buffer *vb, int non_blocking, int intr);
128
129Here, vb is the buffer, non_blocking indicates whether non-blocking I/O
130should be used (it should be zero in the buf_release() case), and intr
131controls whether an interruptible wait is used.
132
133File operations
134
135At this point, much of the work is done; much of the rest is slipping
136videobuf calls into the implementation of the other driver callbacks. The
137first step is in the open() function, which must initialize the
138videobuf queue. The function to use depends on the type of buffer used:
139
140 void videobuf_queue_sg_init(struct videobuf_queue *q,
141 struct videobuf_queue_ops *ops,
142 struct device *dev,
143 spinlock_t *irqlock,
144 enum v4l2_buf_type type,
145 enum v4l2_field field,
146 unsigned int msize,
147 void *priv);
148
149 void videobuf_queue_vmalloc_init(struct videobuf_queue *q,
150 struct videobuf_queue_ops *ops,
151 struct device *dev,
152 spinlock_t *irqlock,
153 enum v4l2_buf_type type,
154 enum v4l2_field field,
155 unsigned int msize,
156 void *priv);
157
158 void videobuf_queue_dma_contig_init(struct videobuf_queue *q,
159 struct videobuf_queue_ops *ops,
160 struct device *dev,
161 spinlock_t *irqlock,
162 enum v4l2_buf_type type,
163 enum v4l2_field field,
164 unsigned int msize,
165 void *priv);
166
167In each case, the parameters are the same: q is the queue structure for the
168device, ops is the set of callbacks as described above, dev is the device
169structure for this video device, irqlock is an interrupt-safe spinlock to
170protect access to the data structures, type is the buffer type used by the
171device (cameras will use V4L2_BUF_TYPE_VIDEO_CAPTURE, for example), field
172describes which field is being captured (often V4L2_FIELD_NONE for
173progressive devices), msize is the size of any containing structure used
174around struct videobuf_buffer, and priv is a private data pointer which
175shows up in the priv_data field of struct videobuf_queue. Note that these
176are void functions which, evidently, are immune to failure.
177
178V4L2 capture drivers can be written to support either of two APIs: the
179read() system call and the rather more complicated streaming mechanism. As
180a general rule, it is necessary to support both to ensure that all
181applications have a chance of working with the device. Videobuf makes it
182easy to do that with the same code. To implement read(), the driver need
183only make a call to one of:
184
185 ssize_t videobuf_read_one(struct videobuf_queue *q,
186 char __user *data, size_t count,
187 loff_t *ppos, int nonblocking);
188
189 ssize_t videobuf_read_stream(struct videobuf_queue *q,
190 char __user *data, size_t count,
191 loff_t *ppos, int vbihack, int nonblocking);
192
193Either one of these functions will read frame data into data, returning the
194amount actually read; the difference is that videobuf_read_one() will only
195read a single frame, while videobuf_read_stream() will read multiple frames
196if they are needed to satisfy the count requested by the application. A
197typical driver read() implementation will start the capture engine, call
198one of the above functions, then stop the engine before returning (though a
199smarter implementation might leave the engine running for a little while in
200anticipation of another read() call happening in the near future).
201
202The poll() function can usually be implemented with a direct call to:
203
204 unsigned int videobuf_poll_stream(struct file *file,
205 struct videobuf_queue *q,
206 poll_table *wait);
207
208Note that the actual wait queue eventually used will be the one associated
209with the first available buffer.
210
211When streaming I/O is done to kernel-space buffers, the driver must support
212the mmap() system call to enable user space to access the data. In many
213V4L2 drivers, the often-complex mmap() implementation simplifies to a
214single call to:
215
216 int videobuf_mmap_mapper(struct videobuf_queue *q,
217 struct vm_area_struct *vma);
218
219Everything else is handled by the videobuf code.
220
221The release() function requires two separate videobuf calls:
222
223 void videobuf_stop(struct videobuf_queue *q);
224 int videobuf_mmap_free(struct videobuf_queue *q);
225
226The call to videobuf_stop() terminates any I/O in progress - though it is
227still up to the driver to stop the capture engine. The call to
228videobuf_mmap_free() will ensure that all buffers have been unmapped; if
229so, they will all be passed to the buf_release() callback. If buffers
230remain mapped, videobuf_mmap_free() returns an error code instead. The
231purpose is clearly to cause the closing of the file descriptor to fail if
232buffers are still mapped, but every driver in the 2.6.32 kernel cheerfully
233ignores its return value.
234
235ioctl() operations
236
237The V4L2 API includes a very long list of driver callbacks to respond to
238the many ioctl() commands made available to user space. A number of these
239- those associated with streaming I/O - turn almost directly into videobuf
240calls. The relevant helper functions are:
241
242 int videobuf_reqbufs(struct videobuf_queue *q,
243 struct v4l2_requestbuffers *req);
244 int videobuf_querybuf(struct videobuf_queue *q, struct v4l2_buffer *b);
245 int videobuf_qbuf(struct videobuf_queue *q, struct v4l2_buffer *b);
246 int videobuf_dqbuf(struct videobuf_queue *q, struct v4l2_buffer *b,
247 int nonblocking);
248 int videobuf_streamon(struct videobuf_queue *q);
249 int videobuf_streamoff(struct videobuf_queue *q);
250 int videobuf_cgmbuf(struct videobuf_queue *q, struct video_mbuf *mbuf,
251 int count);
252
253So, for example, a VIDIOC_REQBUFS call turns into a call to the driver's
254vidioc_reqbufs() callback which, in turn, usually only needs to locate the
255proper struct videobuf_queue pointer and pass it to videobuf_reqbufs().
256These support functions can replace a great deal of buffer management
257boilerplate in a lot of V4L2 drivers.
258
259The vidioc_streamon() and vidioc_streamoff() functions will be a bit more
260complex, of course, since they will also need to deal with starting and
261stopping the capture engine. videobuf_cgmbuf(), called from the driver's
262vidiocgmbuf() function, only exists if the V4L1 compatibility module has
263been selected with CONFIG_VIDEO_V4L1_COMPAT, so its use must be surrounded
264with #ifdef directives.
265
266Buffer allocation
267
268Thus far, we have talked about buffers, but have not looked at how they are
269allocated. The scatter/gather case is the most complex on this front. For
270allocation, the driver can leave buffer allocation entirely up to the
271videobuf layer; in this case, buffers will be allocated as anonymous
272user-space pages and will be very scattered indeed. If the application is
273using user-space buffers, no allocation is needed; the videobuf layer will
274take care of calling get_user_pages() and filling in the scatterlist array.
275
276If the driver needs to do its own memory allocation, it should be done in
277the vidioc_reqbufs() function, *after* calling videobuf_reqbufs(). The
278first step is a call to:
279
280 struct videobuf_dmabuf *videobuf_to_dma(struct videobuf_buffer *buf);
281
282The returned videobuf_dmabuf structure (defined in
283<media/videobuf-dma-sg.h>) includes a couple of relevant fields:
284
285 struct scatterlist *sglist;
286 int sglen;
287
288The driver must allocate an appropriately-sized scatterlist array and
289populate it with pointers to the pieces of the allocated buffer; sglen
290should be set to the length of the array.
291
292Drivers using the vmalloc() method need not (and cannot) concern themselves
293with buffer allocation at all; videobuf will handle those details. The
294same is normally true of contiguous-DMA drivers as well; videobuf will
295allocate the buffers (with dma_alloc_coherent()) when it sees fit. That
296means that these drivers may be trying to do high-order allocations at any
297time, an operation which is not always guaranteed to work. Some drivers
298play tricks by allocating DMA space at system boot time; videobuf does not
299currently play well with those drivers.
300
301As of 2.6.31, contiguous-DMA drivers can work with a user-supplied buffer,
302as long as that buffer is physically contiguous. Normal user-space
303allocations will not meet that criterion, but buffers obtained from other
304kernel drivers, or those contained within huge pages, will work with these
305drivers.
306
307Filling the buffers
308
309The final part of a videobuf implementation has no direct callback - it's
310the portion of the code which actually puts frame data into the buffers,
311usually in response to interrupts from the device. For all types of
312drivers, this process works approximately as follows:
313
314 - Obtain the next available buffer and make sure that somebody is actually
315 waiting for it.
316
317 - Get a pointer to the memory and put video data there.
318
319 - Mark the buffer as done and wake up the process waiting for it.
320
321Step (1) above is done by looking at the driver-managed list_head structure
322- the one which is filled in the buf_queue() callback. Because starting
323the engine and enqueueing buffers are done in separate steps, it's possible
324for the engine to be running without any buffers available - in the
325vmalloc() case especially. So the driver should be prepared for the list
326to be empty. It is equally possible that nobody is yet interested in the
327buffer; the driver should not remove it from the list or fill it until a
328process is waiting on it. That test can be done by examining the buffer's
329done field (a wait_queue_head_t structure) with waitqueue_active().
330
331A buffer's state should be set to VIDEOBUF_ACTIVE before being mapped for
332DMA; that ensures that the videobuf layer will not try to do anything with
333it while the device is transferring data.
334
335For scatter/gather drivers, the needed memory pointers will be found in the
336scatterlist structure described above. Drivers using the vmalloc() method
337can get a memory pointer with:
338
339 void *videobuf_to_vmalloc(struct videobuf_buffer *buf);
340
341For contiguous DMA drivers, the function to use is:
342
343 dma_addr_t videobuf_to_dma_contig(struct videobuf_buffer *buf);
344
345The contiguous DMA API goes out of its way to hide the kernel-space address
346of the DMA buffer from drivers.
347
348The final step is to set the size field of the relevant videobuf_buffer
349structure to the actual size of the captured image, set state to
350VIDEOBUF_DONE, then call wake_up() on the done queue. At this point, the
351buffer is owned by the videobuf layer and the driver should not touch it
352again.
353
354Developers who are interested in more information can go into the relevant
355header files; there are a few low-level functions declared there which have
356not been talked about here. Also worthwhile is the vivi driver
357(drivers/media/video/vivi.c), which is maintained as an example of how V4L2
358drivers should be written. Vivi only uses the vmalloc() API, but it's good
359enough to get started with. Note also that all of these calls are exported
360GPL-only, so they will not be available to non-GPL kernel modules.
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
index e57d6a9dd32b..dca82d7c83d8 100644
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -4,23 +4,35 @@ active_mm.txt
4 - An explanation from Linus about tsk->active_mm vs tsk->mm. 4 - An explanation from Linus about tsk->active_mm vs tsk->mm.
5balance 5balance
6 - various information on memory balancing. 6 - various information on memory balancing.
7hugepage-mmap.c
8 - Example app using huge page memory with the mmap system call.
9hugepage-shm.c
10 - Example app using huge page memory with Sys V shared memory system calls.
7hugetlbpage.txt 11hugetlbpage.txt
8 - a brief summary of hugetlbpage support in the Linux kernel. 12 - a brief summary of hugetlbpage support in the Linux kernel.
13hwpoison.txt
14 - explains what hwpoison is
9ksm.txt 15ksm.txt
10 - how to use the Kernel Samepage Merging feature. 16 - how to use the Kernel Samepage Merging feature.
11locking 17locking
12 - info on how locking and synchronization is done in the Linux vm code. 18 - info on how locking and synchronization is done in the Linux vm code.
19map_hugetlb.c
20 - an example program that uses the MAP_HUGETLB mmap flag.
13numa 21numa
14 - information about NUMA specific code in the Linux vm. 22 - information about NUMA specific code in the Linux vm.
15numa_memory_policy.txt 23numa_memory_policy.txt
16 - documentation of concepts and APIs of the 2.6 memory policy support. 24 - documentation of concepts and APIs of the 2.6 memory policy support.
17overcommit-accounting 25overcommit-accounting
18 - description of the Linux kernels overcommit handling modes. 26 - description of the Linux kernels overcommit handling modes.
27page-types.c
28 - Tool for querying page flags
19page_migration 29page_migration
20 - description of page migration in NUMA systems. 30 - description of page migration in NUMA systems.
31pagemap.txt
32 - pagemap, from the userspace perspective
21slabinfo.c 33slabinfo.c
22 - source code for a tool to get reports about slabs. 34 - source code for a tool to get reports about slabs.
23slub.txt 35slub.txt
24 - a short users guide for SLUB. 36 - a short users guide for SLUB.
25map_hugetlb.c 37unevictable-lru.txt
26 - an example program that uses the MAP_HUGETLB mmap flag. 38 - Unevictable LRU infrastructure
diff --git a/Documentation/vm/Makefile b/Documentation/vm/Makefile
index 5bd269b3731a..9dcff328b964 100644
--- a/Documentation/vm/Makefile
+++ b/Documentation/vm/Makefile
@@ -2,7 +2,7 @@
2obj- := dummy.o 2obj- := dummy.o
3 3
4# List of programs to build 4# List of programs to build
5hostprogs-y := slabinfo page-types 5hostprogs-y := slabinfo page-types hugepage-mmap hugepage-shm map_hugetlb
6 6
7# Tell kbuild to always build the programs 7# Tell kbuild to always build the programs
8always := $(hostprogs-y) 8always := $(hostprogs-y)
diff --git a/Documentation/vm/hugepage-mmap.c b/Documentation/vm/hugepage-mmap.c
new file mode 100644
index 000000000000..db0dd9a33d54
--- /dev/null
+++ b/Documentation/vm/hugepage-mmap.c
@@ -0,0 +1,91 @@
1/*
2 * hugepage-mmap:
3 *
4 * Example of using huge page memory in a user application using the mmap
5 * system call. Before running this application, make sure that the
6 * administrator has mounted the hugetlbfs filesystem (on some directory
7 * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
8 * example, the app is requesting memory of size 256MB that is backed by
9 * huge pages.
10 *
11 * For the ia64 architecture, the Linux kernel reserves Region number 4 for
12 * huge pages. That means that if one requires a fixed address, a huge page
13 * aligned address starting with 0x800000... will be required. If a fixed
14 * address is not required, the kernel will select an address in the proper
15 * range.
16 * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
17 */
18
19#include <stdlib.h>
20#include <stdio.h>
21#include <unistd.h>
22#include <sys/mman.h>
23#include <fcntl.h>
24
25#define FILE_NAME "/mnt/hugepagefile"
26#define LENGTH (256UL*1024*1024)
27#define PROTECTION (PROT_READ | PROT_WRITE)
28
29/* Only ia64 requires this */
30#ifdef __ia64__
31#define ADDR (void *)(0x8000000000000000UL)
32#define FLAGS (MAP_SHARED | MAP_FIXED)
33#else
34#define ADDR (void *)(0x0UL)
35#define FLAGS (MAP_SHARED)
36#endif
37
38static void check_bytes(char *addr)
39{
40 printf("First hex is %x\n", *((unsigned int *)addr));
41}
42
43static void write_bytes(char *addr)
44{
45 unsigned long i;
46
47 for (i = 0; i < LENGTH; i++)
48 *(addr + i) = (char)i;
49}
50
51static void read_bytes(char *addr)
52{
53 unsigned long i;
54
55 check_bytes(addr);
56 for (i = 0; i < LENGTH; i++)
57 if (*(addr + i) != (char)i) {
58 printf("Mismatch at %lu\n", i);
59 break;
60 }
61}
62
63int main(void)
64{
65 void *addr;
66 int fd;
67
68 fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755);
69 if (fd < 0) {
70 perror("Open failed");
71 exit(1);
72 }
73
74 addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0);
75 if (addr == MAP_FAILED) {
76 perror("mmap");
77 unlink(FILE_NAME);
78 exit(1);
79 }
80
81 printf("Returned address is %p\n", addr);
82 check_bytes(addr);
83 write_bytes(addr);
84 read_bytes(addr);
85
86 munmap(addr, LENGTH);
87 close(fd);
88 unlink(FILE_NAME);
89
90 return 0;
91}
diff --git a/Documentation/vm/hugepage-shm.c b/Documentation/vm/hugepage-shm.c
new file mode 100644
index 000000000000..07956d8592c9
--- /dev/null
+++ b/Documentation/vm/hugepage-shm.c
@@ -0,0 +1,98 @@
1/*
2 * hugepage-shm:
3 *
4 * Example of using huge page memory in a user application using Sys V shared
5 * memory system calls. In this example the app is requesting 256MB of
6 * memory that is backed by huge pages. The application uses the flag
7 * SHM_HUGETLB in the shmget system call to inform the kernel that it is
8 * requesting huge pages.
9 *
10 * For the ia64 architecture, the Linux kernel reserves Region number 4 for
11 * huge pages. That means that if one requires a fixed address, a huge page
12 * aligned address starting with 0x800000... will be required. If a fixed
13 * address is not required, the kernel will select an address in the proper
14 * range.
15 * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
16 *
17 * Note: The default shared memory limit is quite low on many kernels,
18 * you may need to increase it via:
19 *
20 * echo 268435456 > /proc/sys/kernel/shmmax
21 *
22 * This will increase the maximum size per shared memory segment to 256MB.
23 * The other limit that you will hit eventually is shmall which is the
24 * total amount of shared memory in pages. To set it to 16GB on a system
25 * with a 4kB pagesize do:
26 *
27 * echo 4194304 > /proc/sys/kernel/shmall
28 */
29
30#include <stdlib.h>
31#include <stdio.h>
32#include <sys/types.h>
33#include <sys/ipc.h>
34#include <sys/shm.h>
35#include <sys/mman.h>
36
37#ifndef SHM_HUGETLB
38#define SHM_HUGETLB 04000
39#endif
40
41#define LENGTH (256UL*1024*1024)
42
43#define dprintf(x) printf(x)
44
45/* Only ia64 requires this */
46#ifdef __ia64__
47#define ADDR (void *)(0x8000000000000000UL)
48#define SHMAT_FLAGS (SHM_RND)
49#else
50#define ADDR (void *)(0x0UL)
51#define SHMAT_FLAGS (0)
52#endif
53
54int main(void)
55{
56 int shmid;
57 unsigned long i;
58 char *shmaddr;
59
60 if ((shmid = shmget(2, LENGTH,
61 SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
62 perror("shmget");
63 exit(1);
64 }
65 printf("shmid: 0x%x\n", shmid);
66
67 shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS);
68 if (shmaddr == (char *)-1) {
69 perror("Shared memory attach failure");
70 shmctl(shmid, IPC_RMID, NULL);
71 exit(2);
72 }
73 printf("shmaddr: %p\n", shmaddr);
74
75 dprintf("Starting the writes:\n");
76 for (i = 0; i < LENGTH; i++) {
77 shmaddr[i] = (char)(i);
78 if (!(i % (1024 * 1024)))
79 dprintf(".");
80 }
81 dprintf("\n");
82
83 dprintf("Starting the Check...");
84 for (i = 0; i < LENGTH; i++)
85 if (shmaddr[i] != (char)i)
86 printf("\nIndex %lu mismatched\n", i);
87 dprintf("Done.\n");
88
89 if (shmdt((const void *)shmaddr) != 0) {
90 perror("Detach failure");
91 shmctl(shmid, IPC_RMID, NULL);
92 exit(3);
93 }
94
95 shmctl(shmid, IPC_RMID, NULL);
96
97 return 0;
98}
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt
index bc31636973e3..457634c1e03e 100644
--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/vm/hugetlbpage.txt
@@ -299,176 +299,11 @@ map_hugetlb.c.
299******************************************************************* 299*******************************************************************
300 300
301/* 301/*
302 * Example of using huge page memory in a user application using Sys V shared 302 * hugepage-shm: see Documentation/vm/hugepage-shm.c
303 * memory system calls. In this example the app is requesting 256MB of
304 * memory that is backed by huge pages. The application uses the flag
305 * SHM_HUGETLB in the shmget system call to inform the kernel that it is
306 * requesting huge pages.
307 *
308 * For the ia64 architecture, the Linux kernel reserves Region number 4 for
309 * huge pages. That means that if one requires a fixed address, a huge page
310 * aligned address starting with 0x800000... will be required. If a fixed
311 * address is not required, the kernel will select an address in the proper
312 * range.
313 * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
314 *
315 * Note: The default shared memory limit is quite low on many kernels,
316 * you may need to increase it via:
317 *
318 * echo 268435456 > /proc/sys/kernel/shmmax
319 *
320 * This will increase the maximum size per shared memory segment to 256MB.
321 * The other limit that you will hit eventually is shmall which is the
322 * total amount of shared memory in pages. To set it to 16GB on a system
323 * with a 4kB pagesize do:
324 *
325 * echo 4194304 > /proc/sys/kernel/shmall
326 */ 303 */
327#include <stdlib.h>
328#include <stdio.h>
329#include <sys/types.h>
330#include <sys/ipc.h>
331#include <sys/shm.h>
332#include <sys/mman.h>
333
334#ifndef SHM_HUGETLB
335#define SHM_HUGETLB 04000
336#endif
337
338#define LENGTH (256UL*1024*1024)
339
340#define dprintf(x) printf(x)
341
342#define ADDR (void *)(0x0UL) /* let kernel choose address */
343#define SHMAT_FLAGS (0)
344
345int main(void)
346{
347 int shmid;
348 unsigned long i;
349 char *shmaddr;
350
351 if ((shmid = shmget(2, LENGTH,
352 SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
353 perror("shmget");
354 exit(1);
355 }
356 printf("shmid: 0x%x\n", shmid);
357
358 shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS);
359 if (shmaddr == (char *)-1) {
360 perror("Shared memory attach failure");
361 shmctl(shmid, IPC_RMID, NULL);
362 exit(2);
363 }
364 printf("shmaddr: %p\n", shmaddr);
365
366 dprintf("Starting the writes:\n");
367 for (i = 0; i < LENGTH; i++) {
368 shmaddr[i] = (char)(i);
369 if (!(i % (1024 * 1024)))
370 dprintf(".");
371 }
372 dprintf("\n");
373
374 dprintf("Starting the Check...");
375 for (i = 0; i < LENGTH; i++)
376 if (shmaddr[i] != (char)i)
377 printf("\nIndex %lu mismatched\n", i);
378 dprintf("Done.\n");
379
380 if (shmdt((const void *)shmaddr) != 0) {
381 perror("Detach failure");
382 shmctl(shmid, IPC_RMID, NULL);
383 exit(3);
384 }
385
386 shmctl(shmid, IPC_RMID, NULL);
387
388 return 0;
389}
390 304
391******************************************************************* 305*******************************************************************
392 306
393/* 307/*
394 * Example of using huge page memory in a user application using the mmap 308 * hugepage-mmap: see Documentation/vm/hugepage-mmap.c
395 * system call. Before running this application, make sure that the
396 * administrator has mounted the hugetlbfs filesystem (on some directory
397 * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
398 * example, the app is requesting memory of size 256MB that is backed by
399 * huge pages.
400 *
401 * For the ia64 architecture, the Linux kernel reserves Region number 4 for
402 * huge pages. That means that if one requires a fixed address, a huge page
403 * aligned address starting with 0x800000... will be required. If a fixed
404 * address is not required, the kernel will select an address in the proper
405 * range.
406 * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
407 */ 309 */
408#include <stdlib.h>
409#include <stdio.h>
410#include <unistd.h>
411#include <sys/mman.h>
412#include <fcntl.h>
413
414#define FILE_NAME "/mnt/hugepagefile"
415#define LENGTH (256UL*1024*1024)
416#define PROTECTION (PROT_READ | PROT_WRITE)
417
418#define ADDR (void *)(0x0UL) /* let kernel choose address */
419#define FLAGS (MAP_SHARED)
420
421void check_bytes(char *addr)
422{
423 printf("First hex is %x\n", *((unsigned int *)addr));
424}
425
426void write_bytes(char *addr)
427{
428 unsigned long i;
429
430 for (i = 0; i < LENGTH; i++)
431 *(addr + i) = (char)i;
432}
433
434void read_bytes(char *addr)
435{
436 unsigned long i;
437
438 check_bytes(addr);
439 for (i = 0; i < LENGTH; i++)
440 if (*(addr + i) != (char)i) {
441 printf("Mismatch at %lu\n", i);
442 break;
443 }
444}
445
446int main(void)
447{
448 void *addr;
449 int fd;
450
451 fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755);
452 if (fd < 0) {
453 perror("Open failed");
454 exit(1);
455 }
456
457 addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0);
458 if (addr == MAP_FAILED) {
459 perror("mmap");
460 unlink(FILE_NAME);
461 exit(1);
462 }
463
464 printf("Returned address is %p\n", addr);
465 check_bytes(addr);
466 write_bytes(addr);
467 read_bytes(addr);
468
469 munmap(addr, LENGTH);
470 close(fd);
471 unlink(FILE_NAME);
472
473 return 0;
474}
diff --git a/Documentation/vm/map_hugetlb.c b/Documentation/vm/map_hugetlb.c
index e2bdae37f499..eda1a6d3578a 100644
--- a/Documentation/vm/map_hugetlb.c
+++ b/Documentation/vm/map_hugetlb.c
@@ -19,7 +19,7 @@
19#define PROTECTION (PROT_READ | PROT_WRITE) 19#define PROTECTION (PROT_READ | PROT_WRITE)
20 20
21#ifndef MAP_HUGETLB 21#ifndef MAP_HUGETLB
22#define MAP_HUGETLB 0x40 22#define MAP_HUGETLB 0x40000 /* arch specific */
23#endif 23#endif
24 24
25/* Only ia64 requires this */ 25/* Only ia64 requires this */
@@ -31,12 +31,12 @@
31#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) 31#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
32#endif 32#endif
33 33
34void check_bytes(char *addr) 34static void check_bytes(char *addr)
35{ 35{
36 printf("First hex is %x\n", *((unsigned int *)addr)); 36 printf("First hex is %x\n", *((unsigned int *)addr));
37} 37}
38 38
39void write_bytes(char *addr) 39static void write_bytes(char *addr)
40{ 40{
41 unsigned long i; 41 unsigned long i;
42 42
@@ -44,7 +44,7 @@ void write_bytes(char *addr)
44 *(addr + i) = (char)i; 44 *(addr + i) = (char)i;
45} 45}
46 46
47void read_bytes(char *addr) 47static void read_bytes(char *addr)
48{ 48{
49 unsigned long i; 49 unsigned long i;
50 50
diff --git a/Documentation/vm/numa b/Documentation/vm/numa
index e93ad9425e2a..a200a386429d 100644
--- a/Documentation/vm/numa
+++ b/Documentation/vm/numa
@@ -1,41 +1,149 @@
1Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com> 1Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com>
2 2
3The intent of this file is to have an uptodate, running commentary 3What is NUMA?
4from different people about NUMA specific code in the Linux vm. 4
5 5This question can be answered from a couple of perspectives: the
6What is NUMA? It is an architecture where the memory access times 6hardware view and the Linux software view.
7for different regions of memory from a given processor varies 7
8according to the "distance" of the memory region from the processor. 8From the hardware perspective, a NUMA system is a computer platform that
9Each region of memory to which access times are the same from any 9comprises multiple components or assemblies each of which may contain 0
10cpu, is called a node. On such architectures, it is beneficial if 10or more CPUs, local memory, and/or IO buses. For brevity and to
11the kernel tries to minimize inter node communications. Schemes 11disambiguate the hardware view of these physical components/assemblies
12for this range from kernel text and read-only data replication 12from the software abstraction thereof, we'll call the components/assemblies
13across nodes, and trying to house all the data structures that 13'cells' in this document.
14key components of the kernel need on memory on that node. 14
15 15Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset
16Currently, all the numa support is to provide efficient handling 16of the system--although some components necessary for a stand-alone SMP system
17of widely discontiguous physical memory, so architectures which 17may not be populated on any given cell. The cells of the NUMA system are
18are not NUMA but can have huge holes in the physical address space 18connected together with some sort of system interconnect--e.g., a crossbar or
19can use the same code. All this code is bracketed by CONFIG_DISCONTIGMEM. 19point-to-point link are common types of NUMA system interconnects. Both of
20 20these types of interconnects can be aggregated to create NUMA platforms with
21The initial port includes NUMAizing the bootmem allocator code by 21cells at multiple distances from other cells.
22encapsulating all the pieces of information into a bootmem_data_t 22
23structure. Node specific calls have been added to the allocator. 23For Linux, the NUMA platforms of interest are primarily what is known as Cache
24In theory, any platform which uses the bootmem allocator should 24Coherent NUMA or ccNUMA systems. With ccNUMA systems, all memory is visible
25be able to put the bootmem and mem_map data structures anywhere 25to and accessible from any CPU attached to any cell and cache coherency
26it deems best. 26is handled in hardware by the processor caches and/or the system interconnect.
27 27
28Each node's page allocation data structures have also been encapsulated 28Memory access time and effective memory bandwidth varies depending on how far
29into a pg_data_t. The bootmem_data_t is just one part of this. To 29away the cell containing the CPU or IO bus making the memory access is from the
30make the code look uniform between NUMA and regular UMA platforms, 30cell containing the target memory. For example, access to memory by CPUs
31UMA platforms have a statically allocated pg_data_t too (contig_page_data). 31attached to the same cell will experience faster access times and higher
32For the sake of uniformity, the function num_online_nodes() is also defined 32bandwidths than accesses to memory on other, remote cells. NUMA platforms
33for all platforms. As we run benchmarks, we might decide to NUMAize 33can have cells at multiple remote distances from any given cell.
34more variables like low_on_memory, nr_free_pages etc into the pg_data_t. 34
35 35Platform vendors don't build NUMA systems just to make software developers'
36The NUMA aware page allocation code currently tries to allocate pages 36lives interesting. Rather, this architecture is a means to provide scalable
37from different nodes in a round robin manner. This will be changed to 37memory bandwidth. However, to achieve scalable memory bandwidth, system and
38do concentratic circle search, starting from current node, once the 38application software must arrange for a large majority of the memory references
39NUMA port achieves more maturity. The call alloc_pages_node has been 39[cache misses] to be to "local" memory--memory on the same cell, if any--or
40added, so that drivers can make the call and not worry about whether 40to the closest cell with memory.
41it is running on a NUMA or UMA platform. 41
42This leads to the Linux software view of a NUMA system:
43
44Linux divides the system's hardware resources into multiple software
45abstractions called "nodes". Linux maps the nodes onto the physical cells
46of the hardware platform, abstracting away some of the details for some
47architectures. As with physical cells, software nodes may contain 0 or more
48CPUs, memory and/or IO buses. And, again, memory accesses to memory on
49"closer" nodes--nodes that map to closer cells--will generally experience
50faster access times and higher effective bandwidth than accesses to more
51remote cells.
52
53For some architectures, such as x86, Linux will "hide" any node representing a
54physical cell that has no memory attached, and reassign any CPUs attached to
55that cell to a node representing a cell that does have memory. Thus, on
56these architectures, one cannot assume that all CPUs that Linux associates with
57a given node will see the same local memory access times and bandwidth.
58
59In addition, for some architectures, again x86 is an example, Linux supports
60the emulation of additional nodes. For NUMA emulation, linux will carve up
61the existing nodes--or the system memory for non-NUMA platforms--into multiple
62nodes. Each emulated node will manage a fraction of the underlying cells'
63physical memory. NUMA emluation is useful for testing NUMA kernel and
64application features on non-NUMA platforms, and as a sort of memory resource
65management mechanism when used together with cpusets.
66[see Documentation/cgroups/cpusets.txt]
67
68For each node with memory, Linux constructs an independent memory management
69subsystem, complete with its own free page lists, in-use page lists, usage
70statistics and locks to mediate access. In addition, Linux constructs for
71each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE],
72an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a
73selected zone/node cannot satisfy the allocation request. This situation,
74when a zone has no available memory to satisfy a request, is called
75"overflow" or "fallback".
76
77Because some nodes contain multiple zones containing different types of
78memory, Linux must decide whether to order the zonelists such that allocations
79fall back to the same zone type on a different node, or to a different zone
80type on the same node. This is an important consideration because some zones,
81such as DMA or DMA32, represent relatively scarce resources. Linux chooses
82a default zonelist order based on the sizes of the various zone types relative
83to the total memory of the node and the total memory of the system. The
84default zonelist order may be overridden using the numa_zonelist_order kernel
85boot parameter or sysctl. [see Documentation/kernel-parameters.txt and
86Documentation/sysctl/vm.txt]
87
88By default, Linux will attempt to satisfy memory allocation requests from the
89node to which the CPU that executes the request is assigned. Specifically,
90Linux will attempt to allocate from the first node in the appropriate zonelist
91for the node where the request originates. This is called "local allocation."
92If the "local" node cannot satisfy the request, the kernel will examine other
93nodes' zones in the selected zonelist looking for the first zone in the list
94that can satisfy the request.
95
96Local allocation will tend to keep subsequent access to the allocated memory
97"local" to the underlying physical resources and off the system interconnect--
98as long as the task on whose behalf the kernel allocated some memory does not
99later migrate away from that memory. The Linux scheduler is aware of the
100NUMA topology of the platform--embodied in the "scheduling domains" data
101structures [see Documentation/scheduler/sched-domains.txt]--and the scheduler
102attempts to minimize task migration to distant scheduling domains. However,
103the scheduler does not take a task's NUMA footprint into account directly.
104Thus, under sufficient imbalance, tasks can migrate between nodes, remote
105from their initial node and kernel data structures.
106
107System administrators and application designers can restrict a task's migration
108to improve NUMA locality using various CPU affinity command line interfaces,
109such as taskset(1) and numactl(1), and program interfaces such as
110sched_setaffinity(2). Further, one can modify the kernel's default local
111allocation behavior using Linux NUMA memory policy.
112[see Documentation/vm/numa_memory_policy.]
113
114System administrators can restrict the CPUs and nodes' memories that a non-
115privileged user can specify in the scheduling or NUMA commands and functions
116using control groups and CPUsets. [see Documentation/cgroups/CPUsets.txt]
117
118On architectures that do not hide memoryless nodes, Linux will include only
119zones [nodes] with memory in the zonelists. This means that for a memoryless
120node the "local memory node"--the node of the first zone in CPU's node's
121zonelist--will not be the node itself. Rather, it will be the node that the
122kernel selected as the nearest node with memory when it built the zonelists.
123So, default, local allocations will succeed with the kernel supplying the
124closest available memory. This is a consequence of the same mechanism that
125allows such allocations to fallback to other nearby nodes when a node that
126does contain memory overflows.
127
128Some kernel allocations do not want or cannot tolerate this allocation fallback
129behavior. Rather they want to be sure they get memory from the specified node
130or get notified that the node has no free memory. This is usually the case when
131a subsystem allocates per CPU memory resources, for example.
132
133A typical model for making such an allocation is to obtain the node id of the
134node to which the "current CPU" is attached using one of the kernel's
135numa_node_id() or CPU_to_node() functions and then request memory from only
136the node id returned. When such an allocation fails, the requesting subsystem
137may revert to its own fallback path. The slab kernel memory allocator is an
138example of this. Or, the subsystem may choose to disable or not to enable
139itself on allocation failure. The kernel profiling subsystem is an example of
140this.
141
142If the architecture supports--does not hide--memoryless nodes, then CPUs
143attached to memoryless nodes would always incur the fallback path overhead
144or some subsystems would fail to initialize if they attempted to allocated
145memory exclusively from a node without memory. To support such
146architectures transparently, kernel subsystems can use the numa_mem_id()
147or cpu_to_mem() function to locate the "local memory node" for the calling or
148specified CPU. Again, this is the same node from which default, local page
149allocations will be attempted.
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
index be45dbb9d7f2..6690fc34ef6d 100644
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -45,7 +45,7 @@ most general to most specific:
45 to establish the task policy for a child task exec()'d from an 45 to establish the task policy for a child task exec()'d from an
46 executable image that has no awareness of memory policy. See the 46 executable image that has no awareness of memory policy. See the
47 MEMORY POLICY APIS section, below, for an overview of the system call 47 MEMORY POLICY APIS section, below, for an overview of the system call
48 that a task may use to set/change it's task/process policy. 48 that a task may use to set/change its task/process policy.
49 49
50 In a multi-threaded task, task policies apply only to the thread 50 In a multi-threaded task, task policies apply only to the thread
51 [Linux kernel task] that installs the policy and any threads 51 [Linux kernel task] that installs the policy and any threads
@@ -301,7 +301,7 @@ decrement this reference count, respectively. mpol_put() will only free
301the structure back to the mempolicy kmem cache when the reference count 301the structure back to the mempolicy kmem cache when the reference count
302goes to zero. 302goes to zero.
303 303
304When a new memory policy is allocated, it's reference count is initialized 304When a new memory policy is allocated, its reference count is initialized
305to '1', representing the reference held by the task that is installing the 305to '1', representing the reference held by the task that is installing the
306new policy. When a pointer to a memory policy structure is stored in another 306new policy. When a pointer to a memory policy structure is stored in another
307structure, another reference is added, as the task's reference will be dropped 307structure, another reference is added, as the task's reference will be dropped
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
index b37300edf27c..07375e73981a 100644
--- a/Documentation/vm/slub.txt
+++ b/Documentation/vm/slub.txt
@@ -41,6 +41,7 @@ Possible debug options are
41 P Poisoning (object and padding) 41 P Poisoning (object and padding)
42 U User tracking (free and alloc) 42 U User tracking (free and alloc)
43 T Trace (please only use on single slabs) 43 T Trace (please only use on single slabs)
44 A Toggle failslab filter mark for the cache
44 O Switch debugging off for caches that would have 45 O Switch debugging off for caches that would have
45 caused higher minimum slab orders 46 caused higher minimum slab orders
46 - Switch all debugging off (useful if the kernel is 47 - Switch all debugging off (useful if the kernel is
diff --git a/Documentation/volatile-considered-harmful.txt b/Documentation/volatile-considered-harmful.txt
index 991c26a6ef64..db0cb228d64a 100644
--- a/Documentation/volatile-considered-harmful.txt
+++ b/Documentation/volatile-considered-harmful.txt
@@ -63,9 +63,9 @@ way to perform a busy wait is:
63 cpu_relax(); 63 cpu_relax();
64 64
65The cpu_relax() call can lower CPU power consumption or yield to a 65The cpu_relax() call can lower CPU power consumption or yield to a
66hyperthreaded twin processor; it also happens to serve as a memory barrier, 66hyperthreaded twin processor; it also happens to serve as a compiler
67so, once again, volatile is unnecessary. Of course, busy-waiting is 67barrier, so, once again, volatile is unnecessary. Of course, busy-
68generally an anti-social act to begin with. 68waiting is generally an anti-social act to begin with.
69 69
70There are still a few rare situations where volatile makes sense in the 70There are still a few rare situations where volatile makes sense in the
71kernel: 71kernel:
diff --git a/Documentation/voyager.txt b/Documentation/voyager.txt
deleted file mode 100644
index 2749af552cdf..000000000000
--- a/Documentation/voyager.txt
+++ /dev/null
@@ -1,95 +0,0 @@
1Running Linux on the Voyager Architecture
2=========================================
3
4For full details and current project status, see
5
6http://www.hansenpartnership.com/voyager
7
8The voyager architecture was designed by NCR in the mid 80s to be a
9fully SMP capable RAS computing architecture built around intel's 486
10chip set. The voyager came in three levels of architectural
11sophistication: 3,4 and 5 --- 1 and 2 never made it out of prototype.
12The linux patches support only the Level 5 voyager architecture (any
13machine class 3435 and above).
14
15The Voyager Architecture
16------------------------
17
18Voyager machines consist of a Baseboard with a 386 diagnostic
19processor, a Power Supply Interface (PSI) a Primary and possibly
20Secondary Microchannel bus and between 2 and 20 voyager slots. The
21voyager slots can be populated with memory and cpu cards (up to 4GB
22memory and from 1 486 to 32 Pentium Pro processors). Internally, the
23voyager has a dual arbitrated system bus and a configuration and test
24bus (CAT). The voyager bus speed is 40MHz. Therefore (since all
25voyager cards are dual ported for each system bus) the maximum
26transfer rate is 320Mb/s but only if you have your slot configuration
27tuned (only memory cards can communicate with both busses at once, CPU
28cards utilise them one at a time).
29
30Voyager SMP
31-----------
32
33Since voyager was the first intel based SMP system, it is slightly
34more primitive than the Intel IO-APIC approach to SMP. Voyager allows
35arbitrary interrupt routing (including processor affinity routing) of
36all 16 PC type interrupts. However it does this by using a modified
375259 master/slave chip set instead of an APIC bus. Additionally,
38voyager supports Cross Processor Interrupts (CPI) equivalent to the
39APIC IPIs. There are two routed voyager interrupt lines provided to
40each slot.
41
42Processor Cards
43---------------
44
45These come in single, dyadic and quad configurations (the quads are
46problematic--see later). The maximum configuration is 8 quad cards
47for 32 way SMP.
48
49Quad Processors
50---------------
51
52Because voyager only supplies two interrupt lines to each Processor
53card, the Quad processors have to be configured (and Bootstrapped) in
54as a pair of Master/Slave processors.
55
56In fact, most Quad cards only accept one VIC interrupt line, so they
57have one interrupt handling processor (called the VIC extended
58processor) and three non-interrupt handling processors.
59
60Current Status
61--------------
62
63The System will boot on Mono, Dyad and Quad cards. There was
64originally a Quad boot problem which has been fixed by proper gdt
65alignment in the initial boot loader. If you still cannot get your
66voyager system to boot, email me at:
67
68<J.E.J.Bottomley@HansenPartnership.com>
69
70
71The Quad cards now support using the separate Quad CPI vectors instead
72of going through the VIC mailbox system.
73
74The Level 4 architecture (3430 and 3360 Machines) should also work
75fine.
76
77Dump Switch
78-----------
79
80The voyager dump switch sends out a broadcast NMI which the voyager
81code intercepts and does a task dump.
82
83Power Switch
84------------
85
86The front panel power switch is intercepted by the kernel and should
87cause a system shutdown and power off.
88
89A Note About Mixed CPU Systems
90------------------------------
91
92Linux isn't designed to handle mixed CPU systems very well. In order
93to get everything going you *must* make sure that your lowest
94capability CPU is used for booting. Also, mixing CPU classes
95(e.g. 486 and 586) is really not going to work very well at all.
diff --git a/Documentation/w1/w1.generic b/Documentation/w1/w1.generic
index e3333eec4320..212f4ac31c01 100644
--- a/Documentation/w1/w1.generic
+++ b/Documentation/w1/w1.generic
@@ -25,7 +25,7 @@ When a w1 master driver registers with the w1 subsystem, the following occurs:
25 - sysfs entries for that w1 master are created 25 - sysfs entries for that w1 master are created
26 - the w1 bus is periodically searched for new slave devices 26 - the w1 bus is periodically searched for new slave devices
27 27
28When a device is found on the bus, w1 core checks if driver for it's family is 28When a device is found on the bus, w1 core checks if driver for its family is
29loaded. If so, the family driver is attached to the slave. 29loaded. If so, the family driver is attached to the slave.
30If there is no driver for the family, default one is assigned, which allows to perform 30If there is no driver for the family, default one is assigned, which allows to perform
31almost any kind of operations. Each logical operation is a transaction 31almost any kind of operations. Each logical operation is a transaction
diff --git a/Documentation/watchdog/00-INDEX b/Documentation/watchdog/00-INDEX
index c3ea47e507fe..ee994513a9b1 100644
--- a/Documentation/watchdog/00-INDEX
+++ b/Documentation/watchdog/00-INDEX
@@ -1,10 +1,15 @@
100-INDEX 100-INDEX
2 - this file. 2 - this file.
3hpwdt.txt
4 - information on the HP iLO2 NMI watchdog
3pcwd-watchdog.txt 5pcwd-watchdog.txt
4 - documentation for Berkshire Products PC Watchdog ISA cards. 6 - documentation for Berkshire Products PC Watchdog ISA cards.
5src/ 7src/
6 - directory holding watchdog related example programs. 8 - directory holding watchdog related example programs.
7watchdog-api.txt 9watchdog-api.txt
8 - description of the Linux Watchdog driver API. 10 - description of the Linux Watchdog driver API.
11watchdog-parameters.txt
12 - information on driver parameters (for drivers other than
13 the ones that have driver-specific files here)
9wdt.txt 14wdt.txt
10 - description of the Watchdog Timer Interfaces for Linux. 15 - description of the Watchdog Timer Interfaces for Linux.
diff --git a/Documentation/watchdog/src/watchdog-simple.c b/Documentation/watchdog/src/watchdog-simple.c
index 4cf72f3fa8e9..ba45803a2216 100644
--- a/Documentation/watchdog/src/watchdog-simple.c
+++ b/Documentation/watchdog/src/watchdog-simple.c
@@ -17,9 +17,6 @@ int main(void)
17 ret = -1; 17 ret = -1;
18 break; 18 break;
19 } 19 }
20 ret = fsync(fd);
21 if (ret)
22 break;
23 sleep(10); 20 sleep(10);
24 } 21 }
25 close(fd); 22 close(fd);
diff --git a/Documentation/watchdog/src/watchdog-test.c b/Documentation/watchdog/src/watchdog-test.c
index a750532ffcf8..63fdc34ceb98 100644
--- a/Documentation/watchdog/src/watchdog-test.c
+++ b/Documentation/watchdog/src/watchdog-test.c
@@ -31,6 +31,8 @@ static void keep_alive(void)
31 */ 31 */
32int main(int argc, char *argv[]) 32int main(int argc, char *argv[])
33{ 33{
34 int flags;
35
34 fd = open("/dev/watchdog", O_WRONLY); 36 fd = open("/dev/watchdog", O_WRONLY);
35 37
36 if (fd == -1) { 38 if (fd == -1) {
@@ -41,12 +43,14 @@ int main(int argc, char *argv[])
41 43
42 if (argc > 1) { 44 if (argc > 1) {
43 if (!strncasecmp(argv[1], "-d", 2)) { 45 if (!strncasecmp(argv[1], "-d", 2)) {
44 ioctl(fd, WDIOC_SETOPTIONS, WDIOS_DISABLECARD); 46 flags = WDIOS_DISABLECARD;
47 ioctl(fd, WDIOC_SETOPTIONS, &flags);
45 fprintf(stderr, "Watchdog card disabled.\n"); 48 fprintf(stderr, "Watchdog card disabled.\n");
46 fflush(stderr); 49 fflush(stderr);
47 exit(0); 50 exit(0);
48 } else if (!strncasecmp(argv[1], "-e", 2)) { 51 } else if (!strncasecmp(argv[1], "-e", 2)) {
49 ioctl(fd, WDIOC_SETOPTIONS, WDIOS_ENABLECARD); 52 flags = WDIOS_ENABLECARD;
53 ioctl(fd, WDIOC_SETOPTIONS, &flags);
50 fprintf(stderr, "Watchdog card enabled.\n"); 54 fprintf(stderr, "Watchdog card enabled.\n");
51 fflush(stderr); 55 fflush(stderr);
52 exit(0); 56 exit(0);
diff --git a/Documentation/watchdog/watchdog-api.txt b/Documentation/watchdog/watchdog-api.txt
index 4cc4ba9d7150..eb7132ed8bbc 100644
--- a/Documentation/watchdog/watchdog-api.txt
+++ b/Documentation/watchdog/watchdog-api.txt
@@ -222,11 +222,10 @@ returned value is the temperature in degrees fahrenheit.
222 ioctl(fd, WDIOC_GETTEMP, &temperature); 222 ioctl(fd, WDIOC_GETTEMP, &temperature);
223 223
224Finally the SETOPTIONS ioctl can be used to control some aspects of 224Finally the SETOPTIONS ioctl can be used to control some aspects of
225the cards operation; right now the pcwd driver is the only one 225the cards operation.
226supporting this ioctl.
227 226
228 int options = 0; 227 int options = 0;
229 ioctl(fd, WDIOC_SETOPTIONS, options); 228 ioctl(fd, WDIOC_SETOPTIONS, &options);
230 229
231The following options are available: 230The following options are available:
232 231
diff --git a/Documentation/watchdog/watchdog-parameters.txt b/Documentation/watchdog/watchdog-parameters.txt
new file mode 100644
index 000000000000..41c95cc1dc1f
--- /dev/null
+++ b/Documentation/watchdog/watchdog-parameters.txt
@@ -0,0 +1,390 @@
1This file provides information on the module parameters of many of
2the Linux watchdog drivers. Watchdog driver parameter specs should
3be listed here unless the driver has its own driver-specific information
4file.
5
6
7See Documentation/kernel-parameters.txt for information on
8providing kernel parameters for builtin drivers versus loadable
9modules.
10
11
12-------------------------------------------------
13acquirewdt:
14wdt_stop: Acquire WDT 'stop' io port (default 0x43)
15wdt_start: Acquire WDT 'start' io port (default 0x443)
16nowayout: Watchdog cannot be stopped once started
17 (default=kernel config parameter)
18-------------------------------------------------
19advantechwdt:
20wdt_stop: Advantech WDT 'stop' io port (default 0x443)
21wdt_start: Advantech WDT 'start' io port (default 0x443)
22timeout: Watchdog timeout in seconds. 1<= timeout <=63, default=60.
23nowayout: Watchdog cannot be stopped once started
24 (default=kernel config parameter)
25-------------------------------------------------
26alim1535_wdt:
27timeout: Watchdog timeout in seconds. (0 < timeout < 18000, default=60
28nowayout: Watchdog cannot be stopped once started
29 (default=kernel config parameter)
30-------------------------------------------------
31alim7101_wdt:
32timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30
33use_gpio: Use the gpio watchdog (required by old cobalt boards).
34 default=0/off/no
35nowayout: Watchdog cannot be stopped once started
36 (default=kernel config parameter)
37-------------------------------------------------
38ar7_wdt:
39margin: Watchdog margin in seconds (default=60)
40nowayout: Disable watchdog shutdown on close
41 (default=kernel config parameter)
42-------------------------------------------------
43at32ap700x_wdt:
44timeout: Timeout value. Limited to be 1 or 2 seconds. (default=2)
45nowayout: Watchdog cannot be stopped once started
46 (default=kernel config parameter)
47-------------------------------------------------
48at91rm9200_wdt:
49wdt_time: Watchdog time in seconds. (default=5)
50nowayout: Watchdog cannot be stopped once started
51 (default=kernel config parameter)
52-------------------------------------------------
53at91sam9_wdt:
54heartbeat: Watchdog heartbeats in seconds. (default = 15)
55nowayout: Watchdog cannot be stopped once started
56 (default=kernel config parameter)
57-------------------------------------------------
58bcm47xx_wdt:
59wdt_time: Watchdog time in seconds. (default=30)
60nowayout: Watchdog cannot be stopped once started
61 (default=kernel config parameter)
62-------------------------------------------------
63bfin_wdt:
64timeout: Watchdog timeout in seconds. (1<=timeout<=((2^32)/SCLK), default=20)
65nowayout: Watchdog cannot be stopped once started
66 (default=kernel config parameter)
67-------------------------------------------------
68coh901327_wdt:
69margin: Watchdog margin in seconds (default 60s)
70-------------------------------------------------
71cpu5wdt:
72port: base address of watchdog card, default is 0x91
73verbose: be verbose, default is 0 (no)
74ticks: count down ticks, default is 10000
75-------------------------------------------------
76cpwd:
77wd0_timeout: Default watchdog0 timeout in 1/10secs
78wd1_timeout: Default watchdog1 timeout in 1/10secs
79wd2_timeout: Default watchdog2 timeout in 1/10secs
80-------------------------------------------------
81davinci_wdt:
82heartbeat: Watchdog heartbeat period in seconds from 1 to 600, default 60
83-------------------------------------------------
84ep93xx_wdt:
85nowayout: Watchdog cannot be stopped once started
86timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=TBD)
87-------------------------------------------------
88eurotechwdt:
89nowayout: Watchdog cannot be stopped once started
90 (default=kernel config parameter)
91io: Eurotech WDT io port (default=0x3f0)
92irq: Eurotech WDT irq (default=10)
93ev: Eurotech WDT event type (default is `int')
94-------------------------------------------------
95gef_wdt:
96nowayout: Watchdog cannot be stopped once started
97 (default=kernel config parameter)
98-------------------------------------------------
99geodewdt:
100timeout: Watchdog timeout in seconds. 1<= timeout <=131, default=60.
101nowayout: Watchdog cannot be stopped once started
102 (default=kernel config parameter)
103-------------------------------------------------
104i6300esb:
105heartbeat: Watchdog heartbeat in seconds. (1<heartbeat<2046, default=30)
106nowayout: Watchdog cannot be stopped once started
107 (default=kernel config parameter)
108-------------------------------------------------
109iTCO_wdt:
110heartbeat: Watchdog heartbeat in seconds.
111 (2<heartbeat<39 (TCO v1) or 613 (TCO v2), default=30)
112nowayout: Watchdog cannot be stopped once started
113 (default=kernel config parameter)
114-------------------------------------------------
115iTCO_vendor_support:
116vendorsupport: iTCO vendor specific support mode, default=0 (none),
117 1=SuperMicro Pent3, 2=SuperMicro Pent4+, 911=Broken SMI BIOS
118-------------------------------------------------
119ib700wdt:
120timeout: Watchdog timeout in seconds. 0<= timeout <=30, default=30.
121nowayout: Watchdog cannot be stopped once started
122 (default=kernel config parameter)
123-------------------------------------------------
124ibmasr:
125nowayout: Watchdog cannot be stopped once started
126 (default=kernel config parameter)
127-------------------------------------------------
128indydog:
129nowayout: Watchdog cannot be stopped once started
130 (default=kernel config parameter)
131-------------------------------------------------
132iop_wdt:
133nowayout: Watchdog cannot be stopped once started
134 (default=kernel config parameter)
135-------------------------------------------------
136it8712f_wdt:
137margin: Watchdog margin in seconds (default 60)
138nowayout: Disable watchdog shutdown on close
139 (default=kernel config parameter)
140-------------------------------------------------
141it87_wdt:
142nogameport: Forbid the activation of game port, default=0
143exclusive: Watchdog exclusive device open, default=1
144timeout: Watchdog timeout in seconds, default=60
145testmode: Watchdog test mode (1 = no reboot), default=0
146nowayout: Watchdog cannot be stopped once started
147 (default=kernel config parameter)
148-------------------------------------------------
149ixp2000_wdt:
150heartbeat: Watchdog heartbeat in seconds (default 60s)
151nowayout: Watchdog cannot be stopped once started
152 (default=kernel config parameter)
153-------------------------------------------------
154ixp4xx_wdt:
155heartbeat: Watchdog heartbeat in seconds (default 60s)
156nowayout: Watchdog cannot be stopped once started
157 (default=kernel config parameter)
158-------------------------------------------------
159ks8695_wdt:
160wdt_time: Watchdog time in seconds. (default=5)
161nowayout: Watchdog cannot be stopped once started
162 (default=kernel config parameter)
163-------------------------------------------------
164machzwd:
165nowayout: Watchdog cannot be stopped once started
166 (default=kernel config parameter)
167action: after watchdog resets, generate:
168 0 = RESET(*) 1 = SMI 2 = NMI 3 = SCI
169-------------------------------------------------
170max63xx_wdt:
171heartbeat: Watchdog heartbeat period in seconds from 1 to 60, default 60
172nowayout: Watchdog cannot be stopped once started
173 (default=kernel config parameter)
174nodelay: Force selection of a timeout setting without initial delay
175 (max6373/74 only, default=0)
176-------------------------------------------------
177mixcomwd:
178nowayout: Watchdog cannot be stopped once started
179 (default=kernel config parameter)
180-------------------------------------------------
181mpc8xxx_wdt:
182timeout: Watchdog timeout in ticks. (0<timeout<65536, default=65535)
183reset: Watchdog Interrupt/Reset Mode. 0 = interrupt, 1 = reset
184nowayout: Watchdog cannot be stopped once started
185 (default=kernel config parameter)
186-------------------------------------------------
187mpcore_wdt:
188mpcore_margin: MPcore timer margin in seconds.
189 (0 < mpcore_margin < 65536, default=60)
190nowayout: Watchdog cannot be stopped once started
191 (default=kernel config parameter)
192mpcore_noboot: MPcore watchdog action, set to 1 to ignore reboots,
193 0 to reboot (default=0
194-------------------------------------------------
195mv64x60_wdt:
196nowayout: Watchdog cannot be stopped once started
197 (default=kernel config parameter)
198-------------------------------------------------
199nuc900_wdt:
200heartbeat: Watchdog heartbeats in seconds.
201 (default = 15)
202nowayout: Watchdog cannot be stopped once started
203 (default=kernel config parameter)
204-------------------------------------------------
205omap_wdt:
206timer_margin: initial watchdog timeout (in seconds)
207-------------------------------------------------
208orion_wdt:
209heartbeat: Initial watchdog heartbeat in seconds
210nowayout: Watchdog cannot be stopped once started
211 (default=kernel config parameter)
212-------------------------------------------------
213pc87413_wdt:
214io: pc87413 WDT I/O port (default: io).
215timeout: Watchdog timeout in minutes (default=timeout).
216nowayout: Watchdog cannot be stopped once started
217 (default=kernel config parameter)
218-------------------------------------------------
219pika_wdt:
220heartbeat: Watchdog heartbeats in seconds. (default = 15)
221nowayout: Watchdog cannot be stopped once started
222 (default=kernel config parameter)
223-------------------------------------------------
224pnx4008_wdt:
225heartbeat: Watchdog heartbeat period in seconds from 1 to 60, default 19
226nowayout: Set to 1 to keep watchdog running after device release
227-------------------------------------------------
228pnx833x_wdt:
229timeout: Watchdog timeout in Mhz. (68Mhz clock), default=2040000000 (30 seconds)
230nowayout: Watchdog cannot be stopped once started
231 (default=kernel config parameter)
232start_enabled: Watchdog is started on module insertion (default=1)
233-------------------------------------------------
234rc32434_wdt:
235timeout: Watchdog timeout value, in seconds (default=20)
236nowayout: Watchdog cannot be stopped once started
237 (default=kernel config parameter)
238-------------------------------------------------
239riowd:
240riowd_timeout: Watchdog timeout in minutes (default=1)
241-------------------------------------------------
242s3c2410_wdt:
243tmr_margin: Watchdog tmr_margin in seconds. (default=15)
244tmr_atboot: Watchdog is started at boot time if set to 1, default=0
245nowayout: Watchdog cannot be stopped once started
246 (default=kernel config parameter)
247soft_noboot: Watchdog action, set to 1 to ignore reboots, 0 to reboot
248debug: Watchdog debug, set to >1 for debug, (default 0)
249-------------------------------------------------
250sa1100_wdt:
251margin: Watchdog margin in seconds (default 60s)
252-------------------------------------------------
253sb_wdog:
254timeout: Watchdog timeout in microseconds (max/default 8388607 or 8.3ish secs)
255-------------------------------------------------
256sbc60xxwdt:
257wdt_stop: SBC60xx WDT 'stop' io port (default 0x45)
258wdt_start: SBC60xx WDT 'start' io port (default 0x443)
259timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30)
260nowayout: Watchdog cannot be stopped once started
261 (default=kernel config parameter)
262-------------------------------------------------
263sbc7240_wdt:
264timeout: Watchdog timeout in seconds. (1<=timeout<=255, default=30)
265nowayout: Disable watchdog when closing device file
266-------------------------------------------------
267sbc8360:
268timeout: Index into timeout table (0-63) (default=27 (60s))
269nowayout: Watchdog cannot be stopped once started
270 (default=kernel config parameter)
271-------------------------------------------------
272sbc_epx_c3:
273nowayout: Watchdog cannot be stopped once started
274 (default=kernel config parameter)
275-------------------------------------------------
276sbc_fitpc2_wdt:
277margin: Watchdog margin in seconds (default 60s)
278nowayout: Watchdog cannot be stopped once started
279-------------------------------------------------
280sc1200wdt:
281isapnp: When set to 0 driver ISA PnP support will be disabled (default=1)
282io: io port
283timeout: range is 0-255 minutes, default is 1
284nowayout: Watchdog cannot be stopped once started
285 (default=kernel config parameter)
286-------------------------------------------------
287sc520_wdt:
288timeout: Watchdog timeout in seconds. (1 <= timeout <= 3600, default=30)
289nowayout: Watchdog cannot be stopped once started
290 (default=kernel config parameter)
291-------------------------------------------------
292sch311x_wdt:
293force_id: Override the detected device ID
294therm_trip: Should a ThermTrip trigger the reset generator
295timeout: Watchdog timeout in seconds. 1<= timeout <=15300, default=60
296nowayout: Watchdog cannot be stopped once started
297 (default=kernel config parameter)
298-------------------------------------------------
299scx200_wdt:
300margin: Watchdog margin in seconds
301nowayout: Disable watchdog shutdown on close
302-------------------------------------------------
303shwdt:
304clock_division_ratio: Clock division ratio. Valid ranges are from 0x5 (1.31ms)
305 to 0x7 (5.25ms). (default=7)
306heartbeat: Watchdog heartbeat in seconds. (1 <= heartbeat <= 3600, default=30
307nowayout: Watchdog cannot be stopped once started
308 (default=kernel config parameter)
309-------------------------------------------------
310smsc37b787_wdt:
311timeout: range is 1-255 units, default is 60
312nowayout: Watchdog cannot be stopped once started
313 (default=kernel config parameter)
314-------------------------------------------------
315softdog:
316soft_margin: Watchdog soft_margin in seconds.
317 (0 < soft_margin < 65536, default=60)
318nowayout: Watchdog cannot be stopped once started
319 (default=kernel config parameter)
320soft_noboot: Softdog action, set to 1 to ignore reboots, 0 to reboot
321 (default=0)
322-------------------------------------------------
323stmp3xxx_wdt:
324heartbeat: Watchdog heartbeat period in seconds from 1 to 4194304, default 19
325-------------------------------------------------
326ts72xx_wdt:
327timeout: Watchdog timeout in seconds. (1 <= timeout <= 8, default=8)
328nowayout: Disable watchdog shutdown on close
329-------------------------------------------------
330twl4030_wdt:
331nowayout: Watchdog cannot be stopped once started
332 (default=kernel config parameter)
333-------------------------------------------------
334txx9wdt:
335timeout: Watchdog timeout in seconds. (0<timeout<N, default=60)
336nowayout: Watchdog cannot be stopped once started
337 (default=kernel config parameter)
338-------------------------------------------------
339w83627hf_wdt:
340wdt_io: w83627hf/thf WDT io port (default 0x2E)
341timeout: Watchdog timeout in seconds. 1 <= timeout <= 255, default=60.
342nowayout: Watchdog cannot be stopped once started
343 (default=kernel config parameter)
344-------------------------------------------------
345w83697hf_wdt:
346wdt_io: w83697hf/hg WDT io port (default 0x2e, 0 = autodetect)
347timeout: Watchdog timeout in seconds. 1<= timeout <=255 (default=60)
348nowayout: Watchdog cannot be stopped once started
349 (default=kernel config parameter)
350early_disable: Watchdog gets disabled at boot time (default=1)
351-------------------------------------------------
352w83697ug_wdt:
353wdt_io: w83697ug/uf WDT io port (default 0x2e)
354timeout: Watchdog timeout in seconds. 1<= timeout <=255 (default=60)
355nowayout: Watchdog cannot be stopped once started
356 (default=kernel config parameter)
357-------------------------------------------------
358w83877f_wdt:
359timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30)
360nowayout: Watchdog cannot be stopped once started
361 (default=kernel config parameter)
362-------------------------------------------------
363w83977f_wdt:
364timeout: Watchdog timeout in seconds (15..7635), default=45)
365testmode: Watchdog testmode (1 = no reboot), default=0
366nowayout: Watchdog cannot be stopped once started
367 (default=kernel config parameter)
368-------------------------------------------------
369wafer5823wdt:
370timeout: Watchdog timeout in seconds. 1 <= timeout <= 255, default=60.
371nowayout: Watchdog cannot be stopped once started
372 (default=kernel config parameter)
373-------------------------------------------------
374wdt285:
375soft_margin: Watchdog timeout in seconds (default=60)
376-------------------------------------------------
377wdt977:
378timeout: Watchdog timeout in seconds (60..15300, default=60)
379testmode: Watchdog testmode (1 = no reboot), default=0
380nowayout: Watchdog cannot be stopped once started
381 (default=kernel config parameter)
382-------------------------------------------------
383wm831x_wdt:
384nowayout: Watchdog cannot be stopped once started
385 (default=kernel config parameter)
386-------------------------------------------------
387wm8350_wdt:
388nowayout: Watchdog cannot be stopped once started
389 (default=kernel config parameter)
390-------------------------------------------------
diff --git a/Documentation/watchdog/wdt.txt b/Documentation/watchdog/wdt.txt
index 03fd756d976d..061c2e35384f 100644
--- a/Documentation/watchdog/wdt.txt
+++ b/Documentation/watchdog/wdt.txt
@@ -14,14 +14,22 @@ reboot will depend on the state of the machines and interrupts. The hardware
14boards physically pull the machine down off their own onboard timers and 14boards physically pull the machine down off their own onboard timers and
15will reboot from almost anything. 15will reboot from almost anything.
16 16
17A second temperature monitoring interface is available on the WDT501P cards 17A second temperature monitoring interface is available on the WDT501P cards.
18This provides /dev/temperature. This is the machine internal temperature in 18This provides /dev/temperature. This is the machine internal temperature in
19degrees Fahrenheit. Each read returns a single byte giving the temperature. 19degrees Fahrenheit. Each read returns a single byte giving the temperature.
20 20
21The third interface logs kernel messages on additional alert events. 21The third interface logs kernel messages on additional alert events.
22 22
23The wdt card cannot be safely probed for. Instead you need to pass 23The ICS ISA-bus wdt card cannot be safely probed for. Instead you need to
24wdt=ioaddr,irq as a boot parameter - eg "wdt=0x240,11". 24pass IO address and IRQ boot parameters. E.g.:
25 wdt.io=0x240 wdt.irq=11
26
27Other "wdt" driver parameters are:
28 heartbeat Watchdog heartbeat in seconds (default 60)
29 nowayout Watchdog cannot be stopped once started (kernel
30 build parameter)
31 tachometer WDT501-P Fan Tachometer support (0=disable, default=0)
32 type WDT501-P Card type (500 or 501, default=500)
25 33
26Features 34Features
27-------- 35--------
@@ -40,4 +48,3 @@ Minor numbers are however allocated for it.
40 48
41 49
42Example Watchdog Driver: see Documentation/watchdog/src/watchdog-simple.c 50Example Watchdog Driver: see Documentation/watchdog/src/watchdog-simple.c
43
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 29a6ff8bc7d3..7fbbaf85f5b7 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -166,19 +166,13 @@ NUMA
166 166
167 numa=noacpi Don't parse the SRAT table for NUMA setup 167 numa=noacpi Don't parse the SRAT table for NUMA setup
168 168
169 numa=fake=CMDLINE 169 numa=fake=<size>[MG]
170 If a number, fakes CMDLINE nodes and ignores NUMA setup of the 170 If given as a memory unit, fills all system RAM with nodes of
171 actual machine. Otherwise, system memory is configured 171 size interleaved over physical nodes.
172 depending on the sizes and coefficients listed. For example: 172
173 numa=fake=2*512,1024,4*256,*128 173 numa=fake=<N>
174 gives two 512M nodes, a 1024M node, four 256M nodes, and the 174 If given as an integer, fills all system RAM with N fake nodes
175 rest split into 128M chunks. If the last character of CMDLINE 175 interleaved over physical nodes.
176 is a *, the remaining memory is divided up equally among its
177 coefficient:
178 numa=fake=2*512,2*
179 gives two 512M nodes and the rest split into two nodes.
180 Otherwise, the remaining system RAM is allocated to an
181 additional node.
182 176
183ACPI 177ACPI
184 178