aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/00-INDEX4
-rw-r--r--Documentation/ABI/testing/debugfs-kmemtrace71
-rw-r--r--Documentation/ABI/testing/debugfs-pktcdvd6
-rw-r--r--Documentation/ABI/testing/ima_policy61
-rw-r--r--Documentation/ABI/testing/sysfs-bus-pci70
-rw-r--r--Documentation/ABI/testing/sysfs-class-regulator57
-rw-r--r--Documentation/ABI/testing/sysfs-firmware-acpi8
-rw-r--r--Documentation/ABI/testing/sysfs-fs-ext481
-rw-r--r--Documentation/DMA-API.txt106
-rw-r--r--Documentation/DMA-mapping.txt18
-rw-r--r--Documentation/DocBook/.gitignore4
-rw-r--r--Documentation/DocBook/Makefile19
-rw-r--r--Documentation/DocBook/alsa-driver-api.tmpl (renamed from Documentation/sound/alsa/DocBook/alsa-driver-api.tmpl)17
-rw-r--r--Documentation/DocBook/genericirq.tmpl1
-rw-r--r--Documentation/DocBook/kernel-api.tmpl9
-rw-r--r--Documentation/DocBook/mac80211.tmpl18
-rw-r--r--Documentation/DocBook/procfs_example.c9
-rw-r--r--Documentation/DocBook/uio-howto.tmpl29
-rw-r--r--Documentation/DocBook/writing-an-alsa-driver.tmpl (renamed from Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl)60
-rw-r--r--Documentation/PCI/MSI-HOWTO.txt814
-rw-r--r--Documentation/PCI/pci-iov-howto.txt99
-rw-r--r--Documentation/RCU/checklist.txt12
-rw-r--r--Documentation/RCU/listRCU.txt6
-rw-r--r--Documentation/RCU/rcu.txt2
-rw-r--r--Documentation/RCU/rculist_nulls.txt4
-rw-r--r--Documentation/Smack.txt42
-rw-r--r--Documentation/arm/Samsung-S3C24XX/Suspend.txt8
-rw-r--r--Documentation/arm/memory.txt9
-rw-r--r--Documentation/block/biodoc.txt19
-rw-r--r--Documentation/block/switching-sched.txt6
-rw-r--r--Documentation/blockdev/00-INDEX2
-rw-r--r--Documentation/blockdev/mflash.txt84
-rw-r--r--Documentation/cgroups/00-INDEX18
-rw-r--r--Documentation/cgroups/cgroups.txt36
-rw-r--r--Documentation/cgroups/cpuacct.txt18
-rw-r--r--Documentation/cgroups/cpusets.txt12
-rw-r--r--Documentation/cgroups/devices.txt2
-rw-r--r--Documentation/cgroups/memcg_test.txt22
-rw-r--r--Documentation/cgroups/memory.txt55
-rw-r--r--Documentation/cgroups/resource_counter.txt27
-rw-r--r--Documentation/cpu-freq/governors.txt26
-rw-r--r--Documentation/cpu-freq/user-guide.txt12
-rw-r--r--Documentation/cputopology.txt6
-rw-r--r--Documentation/devices.txt133
-rw-r--r--Documentation/dontdiff1
-rw-r--r--Documentation/driver-model/platform.txt59
-rw-r--r--Documentation/dvb/get_dvb_firmware85
-rw-r--r--Documentation/dynamic-debug-howto.txt240
-rw-r--r--Documentation/fb/00-INDEX2
-rw-r--r--Documentation/fb/cyblafb/bugs13
-rw-r--r--Documentation/fb/cyblafb/credits7
-rw-r--r--Documentation/fb/cyblafb/documentation17
-rw-r--r--Documentation/fb/cyblafb/fb.modes154
-rw-r--r--Documentation/fb/cyblafb/performance79
-rw-r--r--Documentation/fb/cyblafb/todo31
-rw-r--r--Documentation/fb/cyblafb/usage217
-rw-r--r--Documentation/fb/cyblafb/whatsnew29
-rw-r--r--Documentation/fb/cyblafb/whycyblafb85
-rw-r--r--Documentation/fb/uvesafb.txt7
-rw-r--r--Documentation/feature-removal-schedule.txt158
-rw-r--r--Documentation/filesystems/00-INDEX2
-rw-r--r--Documentation/filesystems/Locking33
-rw-r--r--Documentation/filesystems/caching/backend-api.txt658
-rw-r--r--Documentation/filesystems/caching/cachefiles.txt501
-rw-r--r--Documentation/filesystems/caching/fscache.txt333
-rw-r--r--Documentation/filesystems/caching/netfs-api.txt778
-rw-r--r--Documentation/filesystems/caching/object.txt313
-rw-r--r--Documentation/filesystems/caching/operations.txt213
-rw-r--r--Documentation/filesystems/exofs.txt176
-rw-r--r--Documentation/filesystems/ext2.txt9
-rw-r--r--Documentation/filesystems/ext3.txt18
-rw-r--r--Documentation/filesystems/ext4.txt30
-rw-r--r--Documentation/filesystems/knfsd-stats.txt159
-rw-r--r--Documentation/filesystems/nfs41-server.txt161
-rw-r--r--Documentation/filesystems/nilfs2.txt200
-rw-r--r--Documentation/filesystems/pohmelfs/design_notes.txt71
-rw-r--r--Documentation/filesystems/pohmelfs/info.txt99
-rw-r--r--Documentation/filesystems/pohmelfs/network_protocol.txt227
-rw-r--r--Documentation/filesystems/proc.txt1111
-rw-r--r--Documentation/filesystems/squashfs.txt2
-rw-r--r--Documentation/filesystems/sysfs-pci.txt10
-rw-r--r--Documentation/filesystems/udf.txt2
-rw-r--r--Documentation/filesystems/vfs.txt3
-rw-r--r--Documentation/gpio.txt23
-rw-r--r--Documentation/hwmon/ds162151
-rw-r--r--Documentation/hwmon/g760a36
-rw-r--r--Documentation/hwmon/lis3lv02d20
-rw-r--r--Documentation/hwmon/lm9010
-rw-r--r--Documentation/hwmon/ltc421550
-rw-r--r--Documentation/hwmon/pcf8591 (renamed from Documentation/i2c/chips/pcf8591)0
-rw-r--r--Documentation/hwmon/sysfs-interface22
-rw-r--r--Documentation/hwmon/w83627ehf29
-rw-r--r--Documentation/i2c/busses/i2c-nforce212
-rw-r--r--Documentation/i2c/busses/i2c-piix42
-rw-r--r--Documentation/i2c/instantiating-devices167
-rw-r--r--Documentation/i2c/writing-clients19
-rw-r--r--Documentation/ia64/kvm.txt2
-rw-r--r--Documentation/infiniband/ipoib.txt45
-rw-r--r--Documentation/input/bcm5974.txt65
-rw-r--r--Documentation/input/multi-touch-protocol.txt140
-rw-r--r--Documentation/ioctl/ioctl-number.txt2
-rw-r--r--Documentation/isdn/00-INDEX17
-rw-r--r--Documentation/isdn/INTERFACE.CAPI213
-rw-r--r--Documentation/isdn/README.gigaset52
-rw-r--r--Documentation/kbuild/makefiles.txt93
-rw-r--r--Documentation/kernel-doc-nano-HOWTO.txt7
-rw-r--r--Documentation/kernel-parameters.txt629
-rw-r--r--Documentation/kprobes.txt38
-rw-r--r--Documentation/laptops/acer-wmi.txt10
-rw-r--r--Documentation/laptops/thinkpad-acpi.txt148
-rw-r--r--Documentation/lguest/.gitignore1
-rw-r--r--Documentation/lguest/lguest.c7
-rw-r--r--Documentation/lguest/lguest.txt11
-rw-r--r--Documentation/lockdep-design.txt36
-rw-r--r--Documentation/md.txt37
-rw-r--r--Documentation/misc-devices/isl2900362
-rw-r--r--Documentation/networking/bonding.txt2
-rw-r--r--Documentation/networking/dccp.txt3
-rw-r--r--Documentation/networking/ip-sysctl.txt148
-rw-r--r--Documentation/networking/ipv6.txt35
-rw-r--r--Documentation/networking/ixgbe.txt199
-rw-r--r--Documentation/networking/rds.txt356
-rw-r--r--Documentation/networking/timestamping.txt180
-rw-r--r--Documentation/networking/timestamping/.gitignore1
-rw-r--r--Documentation/networking/timestamping/Makefile6
-rw-r--r--Documentation/networking/timestamping/timestamping.c533
-rw-r--r--Documentation/networking/vxge.txt100
-rw-r--r--Documentation/powerpc/booting-without-of.txt89
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt2
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/dma.txt34
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/esdhc.txt24
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/i2c.txt46
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/ssi.txt68
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/tsec.txt6
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/upm-nand.txt39
-rw-r--r--Documentation/powerpc/dts-bindings/gpio/led.txt46
-rw-r--r--Documentation/powerpc/dts-bindings/mmc-spi-slot.txt23
-rw-r--r--Documentation/powerpc/dts-bindings/mtd-physmap.txt80
-rw-r--r--Documentation/scheduler/00-INDEX2
-rw-r--r--Documentation/scheduler/sched-coding.txt126
-rw-r--r--Documentation/scheduler/sched-rt-group.txt2
-rw-r--r--Documentation/scsi/aacraid.txt15
-rw-r--r--Documentation/scsi/osd.txt198
-rw-r--r--Documentation/slow-work.txt174
-rw-r--r--Documentation/sound/alsa/ALSA-Configuration.txt87
-rw-r--r--Documentation/sound/alsa/HD-Audio-Models.txt21
-rw-r--r--Documentation/sound/alsa/HD-Audio.txt51
-rw-r--r--Documentation/sound/alsa/soc/dapm.txt3
-rw-r--r--Documentation/sound/alsa/soc/jack.txt71
-rw-r--r--Documentation/sound/oss/CS423223
-rw-r--r--Documentation/sound/oss/Introduction2
-rw-r--r--Documentation/sparse.txt8
-rw-r--r--Documentation/spi/spi-summary6
-rw-r--r--Documentation/sysctl/00-INDEX2
-rw-r--r--Documentation/sysctl/fs.txt74
-rw-r--r--Documentation/sysctl/kernel.txt53
-rw-r--r--Documentation/sysctl/net.txt175
-rw-r--r--Documentation/sysctl/vm.txt32
-rw-r--r--Documentation/sysrq.txt7
-rw-r--r--Documentation/tomoyo.txt55
-rw-r--r--Documentation/trace/ftrace.txt (renamed from Documentation/ftrace.txt)1134
-rw-r--r--Documentation/trace/kmemtrace.txt126
-rw-r--r--Documentation/trace/mmiotrace.txt (renamed from Documentation/tracers/mmiotrace.txt)0
-rw-r--r--Documentation/trace/tracepoints.txt (renamed from Documentation/tracepoints.txt)21
-rw-r--r--Documentation/usb/usbmon.txt27
-rw-r--r--Documentation/video4linux/CARDLIST.bttv6
-rw-r--r--Documentation/video4linux/CARDLIST.cx238854
-rw-r--r--Documentation/video4linux/CARDLIST.cx881
-rw-r--r--Documentation/video4linux/CARDLIST.em28xx9
-rw-r--r--Documentation/video4linux/CARDLIST.saa71342
-rw-r--r--Documentation/video4linux/Zoran3
-rw-r--r--Documentation/video4linux/bttv/Insmod-options10
-rw-r--r--Documentation/video4linux/bttv/README4
-rw-r--r--Documentation/video4linux/cx2341x/README.hm124
-rw-r--r--Documentation/video4linux/gspca.txt4
-rw-r--r--Documentation/video4linux/pxa_camera.txt125
-rw-r--r--Documentation/video4linux/si470x.txt11
-rw-r--r--Documentation/video4linux/v4l2-framework.txt202
-rw-r--r--Documentation/video4linux/v4lgrab.c4
-rw-r--r--Documentation/video4linux/zr364xx.txt1
-rw-r--r--Documentation/vm/00-INDEX2
-rw-r--r--Documentation/vm/active_mm.txt83
-rw-r--r--Documentation/vm/numa_memory_policy.txt3
-rw-r--r--Documentation/vm/page_migration3
-rw-r--r--Documentation/vm/unevictable-lru.txt1041
-rw-r--r--Documentation/x86/boot.txt18
-rw-r--r--Documentation/x86/earlyprintk.txt101
-rw-r--r--Documentation/x86/x86_64/fake-numa-for-cpusets5
188 files changed, 12323 insertions, 4309 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 2a39aeba1464..d05737aaa84b 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -86,6 +86,8 @@ cachetlb.txt
86 - describes the cache/TLB flushing interfaces Linux uses. 86 - describes the cache/TLB flushing interfaces Linux uses.
87cdrom/ 87cdrom/
88 - directory with information on the CD-ROM drivers that Linux has. 88 - directory with information on the CD-ROM drivers that Linux has.
89cgroups/
90 - cgroups features, including cpusets and memory controller.
89connector/ 91connector/
90 - docs on the netlink based userspace<->kernel space communication mod. 92 - docs on the netlink based userspace<->kernel space communication mod.
91console/ 93console/
@@ -98,8 +100,6 @@ cpu-load.txt
98 - document describing how CPU load statistics are collected. 100 - document describing how CPU load statistics are collected.
99cpuidle/ 101cpuidle/
100 - info on CPU_IDLE, CPU idle state management subsystem. 102 - info on CPU_IDLE, CPU idle state management subsystem.
101cpusets.txt
102 - documents the cpusets feature; assign CPUs and Mem to a set of tasks.
103cputopology.txt 103cputopology.txt
104 - documentation on how CPU topology info is exported via sysfs. 104 - documentation on how CPU topology info is exported via sysfs.
105cris/ 105cris/
diff --git a/Documentation/ABI/testing/debugfs-kmemtrace b/Documentation/ABI/testing/debugfs-kmemtrace
new file mode 100644
index 000000000000..5e6a92a02d85
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-kmemtrace
@@ -0,0 +1,71 @@
1What: /sys/kernel/debug/kmemtrace/
2Date: July 2008
3Contact: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
4Description:
5
6In kmemtrace-enabled kernels, the following files are created:
7
8/sys/kernel/debug/kmemtrace/
9 cpu<n> (0400) Per-CPU tracing data, see below. (binary)
10 total_overruns (0400) Total number of bytes which were dropped from
11 cpu<n> files because of full buffer condition,
12 non-binary. (text)
13 abi_version (0400) Kernel's kmemtrace ABI version. (text)
14
15Each per-CPU file should be read according to the relay interface. That is,
16the reader should set affinity to that specific CPU and, as currently done by
17the userspace application (though there are other methods), use poll() with
18an infinite timeout before every read(). Otherwise, erroneous data may be
19read. The binary data has the following _core_ format:
20
21 Event ID (1 byte) Unsigned integer, one of:
22 0 - represents an allocation (KMEMTRACE_EVENT_ALLOC)
23 1 - represents a freeing of previously allocated memory
24 (KMEMTRACE_EVENT_FREE)
25 Type ID (1 byte) Unsigned integer, one of:
26 0 - this is a kmalloc() / kfree()
27 1 - this is a kmem_cache_alloc() / kmem_cache_free()
28 2 - this is a __get_free_pages() et al.
29 Event size (2 bytes) Unsigned integer representing the
30 size of this event. Used to extend
31 kmemtrace. Discard the bytes you
32 don't know about.
33 Sequence number (4 bytes) Signed integer used to reorder data
34 logged on SMP machines. Wraparound
35 must be taken into account, although
36 it is unlikely.
37 Caller address (8 bytes) Return address to the caller.
38 Pointer to mem (8 bytes) Pointer to target memory area. Can be
39 NULL, but not all such calls might be
40 recorded.
41
42In case of KMEMTRACE_EVENT_ALLOC events, the next fields follow:
43
44 Requested bytes (8 bytes) Total number of requested bytes,
45 unsigned, must not be zero.
46 Allocated bytes (8 bytes) Total number of actually allocated
47 bytes, unsigned, must not be lower
48 than requested bytes.
49 Requested flags (4 bytes) GFP flags supplied by the caller.
50 Target CPU (4 bytes) Signed integer, valid for event id 1.
51 If equal to -1, target CPU is the same
52 as origin CPU, but the reverse might
53 not be true.
54
55The data is made available in the same endianness the machine has.
56
57Other event ids and type ids may be defined and added. Other fields may be
58added by increasing event size, but see below for details.
59Every modification to the ABI, including new id definitions, are followed
60by bumping the ABI version by one.
61
62Adding new data to the packet (features) is done at the end of the mandatory
63data:
64 Feature size (2 byte)
65 Feature ID (1 byte)
66 Feature data (Feature size - 3 bytes)
67
68
69Users:
70 kmemtrace-user - git://repo.or.cz/kmemtrace-user.git
71
diff --git a/Documentation/ABI/testing/debugfs-pktcdvd b/Documentation/ABI/testing/debugfs-pktcdvd
index bf9c16b64c34..cf11736acb76 100644
--- a/Documentation/ABI/testing/debugfs-pktcdvd
+++ b/Documentation/ABI/testing/debugfs-pktcdvd
@@ -1,4 +1,4 @@
1What: /debug/pktcdvd/pktcdvd[0-7] 1What: /sys/kernel/debug/pktcdvd/pktcdvd[0-7]
2Date: Oct. 2006 2Date: Oct. 2006
3KernelVersion: 2.6.20 3KernelVersion: 2.6.20
4Contact: Thomas Maier <balagi@justmail.de> 4Contact: Thomas Maier <balagi@justmail.de>
@@ -10,10 +10,10 @@ debugfs interface
10The pktcdvd module (packet writing driver) creates 10The pktcdvd module (packet writing driver) creates
11these files in debugfs: 11these files in debugfs:
12 12
13/debug/pktcdvd/pktcdvd[0-7]/ 13/sys/kernel/debug/pktcdvd/pktcdvd[0-7]/
14 info (0444) Lots of driver statistics and infos. 14 info (0444) Lots of driver statistics and infos.
15 15
16Example: 16Example:
17------- 17-------
18 18
19cat /debug/pktcdvd/pktcdvd0/info 19cat /sys/kernel/debug/pktcdvd/pktcdvd0/info
diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy
new file mode 100644
index 000000000000..6434f0df012e
--- /dev/null
+++ b/Documentation/ABI/testing/ima_policy
@@ -0,0 +1,61 @@
1What: security/ima/policy
2Date: May 2008
3Contact: Mimi Zohar <zohar@us.ibm.com>
4Description:
5 The Trusted Computing Group(TCG) runtime Integrity
6 Measurement Architecture(IMA) maintains a list of hash
7 values of executables and other sensitive system files
8 loaded into the run-time of this system. At runtime,
9 the policy can be constrained based on LSM specific data.
10 Policies are loaded into the securityfs file ima/policy
11 by opening the file, writing the rules one at a time and
12 then closing the file. The new policy takes effect after
13 the file ima/policy is closed.
14
15 rule format: action [condition ...]
16
17 action: measure | dont_measure
18 condition:= base | lsm
19 base: [[func=] [mask=] [fsmagic=] [uid=]]
20 lsm: [[subj_user=] [subj_role=] [subj_type=]
21 [obj_user=] [obj_role=] [obj_type=]]
22
23 base: func:= [BPRM_CHECK][FILE_MMAP][INODE_PERMISSION]
24 mask:= [MAY_READ] [MAY_WRITE] [MAY_APPEND] [MAY_EXEC]
25 fsmagic:= hex value
26 uid:= decimal value
27 lsm: are LSM specific
28
29 default policy:
30 # PROC_SUPER_MAGIC
31 dont_measure fsmagic=0x9fa0
32 # SYSFS_MAGIC
33 dont_measure fsmagic=0x62656572
34 # DEBUGFS_MAGIC
35 dont_measure fsmagic=0x64626720
36 # TMPFS_MAGIC
37 dont_measure fsmagic=0x01021994
38 # SECURITYFS_MAGIC
39 dont_measure fsmagic=0x73636673
40
41 measure func=BPRM_CHECK
42 measure func=FILE_MMAP mask=MAY_EXEC
43 measure func=INODE_PERM mask=MAY_READ uid=0
44
45 The default policy measures all executables in bprm_check,
46 all files mmapped executable in file_mmap, and all files
47 open for read by root in inode_permission.
48
49 Examples of LSM specific definitions:
50
51 SELinux:
52 # SELINUX_MAGIC
53 dont_measure fsmagic=0xF97CFF8C
54
55 dont_measure obj_type=var_log_t
56 dont_measure obj_type=auditd_log_t
57 measure subj_user=system_u func=INODE_PERM mask=MAY_READ
58 measure subj_role=system_r func=INODE_PERM mask=MAY_READ
59
60 Smack:
61 measure subj_user=_ func=INODE_PERM mask=MAY_READ
diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index e638e15a8895..97ad190e13af 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -41,6 +41,49 @@ Description:
41 for the device and attempt to bind to it. For example: 41 for the device and attempt to bind to it. For example:
42 # echo "8086 10f5" > /sys/bus/pci/drivers/foo/new_id 42 # echo "8086 10f5" > /sys/bus/pci/drivers/foo/new_id
43 43
44What: /sys/bus/pci/drivers/.../remove_id
45Date: February 2009
46Contact: Chris Wright <chrisw@sous-sol.org>
47Description:
48 Writing a device ID to this file will remove an ID
49 that was dynamically added via the new_id sysfs entry.
50 The format for the device ID is:
51 VVVV DDDD SVVV SDDD CCCC MMMM. That is Vendor ID, Device
52 ID, Subsystem Vendor ID, Subsystem Device ID, Class,
53 and Class Mask. The Vendor ID and Device ID fields are
54 required, the rest are optional. After successfully
55 removing an ID, the driver will no longer support the
56 device. This is useful to ensure auto probing won't
57 match the driver to the device. For example:
58 # echo "8086 10f5" > /sys/bus/pci/drivers/foo/remove_id
59
60What: /sys/bus/pci/rescan
61Date: January 2009
62Contact: Linux PCI developers <linux-pci@vger.kernel.org>
63Description:
64 Writing a non-zero value to this attribute will
65 force a rescan of all PCI buses in the system, and
66 re-discover previously removed devices.
67 Depends on CONFIG_HOTPLUG.
68
69What: /sys/bus/pci/devices/.../remove
70Date: January 2009
71Contact: Linux PCI developers <linux-pci@vger.kernel.org>
72Description:
73 Writing a non-zero value to this attribute will
74 hot-remove the PCI device and any of its children.
75 Depends on CONFIG_HOTPLUG.
76
77What: /sys/bus/pci/devices/.../rescan
78Date: January 2009
79Contact: Linux PCI developers <linux-pci@vger.kernel.org>
80Description:
81 Writing a non-zero value to this attribute will
82 force a rescan of the device's parent bus and all
83 child buses, and re-discover devices removed earlier
84 from this part of the device tree.
85 Depends on CONFIG_HOTPLUG.
86
44What: /sys/bus/pci/devices/.../vpd 87What: /sys/bus/pci/devices/.../vpd
45Date: February 2008 88Date: February 2008
46Contact: Ben Hutchings <bhutchings@solarflare.com> 89Contact: Ben Hutchings <bhutchings@solarflare.com>
@@ -52,3 +95,30 @@ Description:
52 that some devices may have malformatted data. If the 95 that some devices may have malformatted data. If the
53 underlying VPD has a writable section then the 96 underlying VPD has a writable section then the
54 corresponding section of this file will be writable. 97 corresponding section of this file will be writable.
98
99What: /sys/bus/pci/devices/.../virtfnN
100Date: March 2009
101Contact: Yu Zhao <yu.zhao@intel.com>
102Description:
103 This symbolic link appears when hardware supports the SR-IOV
104 capability and the Physical Function driver has enabled it.
105 The symbolic link points to the PCI device sysfs entry of the
106 Virtual Function whose index is N (0...MaxVFs-1).
107
108What: /sys/bus/pci/devices/.../dep_link
109Date: March 2009
110Contact: Yu Zhao <yu.zhao@intel.com>
111Description:
112 This symbolic link appears when hardware supports the SR-IOV
113 capability and the Physical Function driver has enabled it,
114 and this device has vendor specific dependencies with others.
115 The symbolic link points to the PCI device sysfs entry of
116 Physical Function this device depends on.
117
118What: /sys/bus/pci/devices/.../physfn
119Date: March 2009
120Contact: Yu Zhao <yu.zhao@intel.com>
121Description:
122 This symbolic link appears when a device is a Virtual Function.
123 The symbolic link points to the PCI device sysfs entry of the
124 Physical Function this device associates with.
diff --git a/Documentation/ABI/testing/sysfs-class-regulator b/Documentation/ABI/testing/sysfs-class-regulator
index 873ef1fc1569..e091fa873792 100644
--- a/Documentation/ABI/testing/sysfs-class-regulator
+++ b/Documentation/ABI/testing/sysfs-class-regulator
@@ -4,8 +4,8 @@ KernelVersion: 2.6.26
4Contact: Liam Girdwood <lrg@slimlogic.co.uk> 4Contact: Liam Girdwood <lrg@slimlogic.co.uk>
5Description: 5Description:
6 Some regulator directories will contain a field called 6 Some regulator directories will contain a field called
7 state. This reports the regulator enable status, for 7 state. This reports the regulator enable control, for
8 regulators which can report that value. 8 regulators which can report that input value.
9 9
10 This will be one of the following strings: 10 This will be one of the following strings:
11 11
@@ -14,16 +14,54 @@ Description:
14 'unknown' 14 'unknown'
15 15
16 'enabled' means the regulator output is ON and is supplying 16 'enabled' means the regulator output is ON and is supplying
17 power to the system. 17 power to the system (assuming no error prevents it).
18 18
19 'disabled' means the regulator output is OFF and is not 19 'disabled' means the regulator output is OFF and is not
20 supplying power to the system.. 20 supplying power to the system (unless some non-Linux
21 control has enabled it).
21 22
22 'unknown' means software cannot determine the state, or 23 'unknown' means software cannot determine the state, or
23 the reported state is invalid. 24 the reported state is invalid.
24 25
25 NOTE: this field can be used in conjunction with microvolts 26 NOTE: this field can be used in conjunction with microvolts
26 and microamps to determine regulator output levels. 27 or microamps to determine configured regulator output levels.
28
29
30What: /sys/class/regulator/.../status
31Description:
32 Some regulator directories will contain a field called
33 "status". This reports the current regulator status, for
34 regulators which can report that output value.
35
36 This will be one of the following strings:
37
38 off
39 on
40 error
41 fast
42 normal
43 idle
44 standby
45
46 "off" means the regulator is not supplying power to the
47 system.
48
49 "on" means the regulator is supplying power to the system,
50 and the regulator can't report a detailed operation mode.
51
52 "error" indicates an out-of-regulation status such as being
53 disabled due to thermal shutdown, or voltage being unstable
54 because of problems with the input power supply.
55
56 "fast", "normal", "idle", and "standby" are all detailed
57 regulator operation modes (described elsewhere). They
58 imply "on", but provide more detail.
59
60 Note that regulator status is a function of many inputs,
61 not limited to control inputs from Linux. For example,
62 the actual load presented may trigger "error" status; or
63 a regulator may be enabled by another user, even though
64 Linux did not enable it.
27 65
28 66
29What: /sys/class/regulator/.../type 67What: /sys/class/regulator/.../type
@@ -58,7 +96,7 @@ Description:
58 Some regulator directories will contain a field called 96 Some regulator directories will contain a field called
59 microvolts. This holds the regulator output voltage setting 97 microvolts. This holds the regulator output voltage setting
60 measured in microvolts (i.e. E-6 Volts), for regulators 98 measured in microvolts (i.e. E-6 Volts), for regulators
61 which can report that voltage. 99 which can report the control input for voltage.
62 100
63 NOTE: This value should not be used to determine the regulator 101 NOTE: This value should not be used to determine the regulator
64 output voltage level as this value is the same regardless of 102 output voltage level as this value is the same regardless of
@@ -73,7 +111,7 @@ Description:
73 Some regulator directories will contain a field called 111 Some regulator directories will contain a field called
74 microamps. This holds the regulator output current limit 112 microamps. This holds the regulator output current limit
75 setting measured in microamps (i.e. E-6 Amps), for regulators 113 setting measured in microamps (i.e. E-6 Amps), for regulators
76 which can report that current. 114 which can report the control input for a current limit.
77 115
78 NOTE: This value should not be used to determine the regulator 116 NOTE: This value should not be used to determine the regulator
79 output current level as this value is the same regardless of 117 output current level as this value is the same regardless of
@@ -87,7 +125,7 @@ Contact: Liam Girdwood <lrg@slimlogic.co.uk>
87Description: 125Description:
88 Some regulator directories will contain a field called 126 Some regulator directories will contain a field called
89 opmode. This holds the current regulator operating mode, 127 opmode. This holds the current regulator operating mode,
90 for regulators which can report it. 128 for regulators which can report that control input value.
91 129
92 The opmode value can be one of the following strings: 130 The opmode value can be one of the following strings:
93 131
@@ -101,7 +139,8 @@ Description:
101 139
102 NOTE: This value should not be used to determine the regulator 140 NOTE: This value should not be used to determine the regulator
103 output operating mode as this value is the same regardless of 141 output operating mode as this value is the same regardless of
104 whether the regulator is enabled or disabled. 142 whether the regulator is enabled or disabled. A "status"
143 attribute may be available to determine the actual mode.
105 144
106 145
107What: /sys/class/regulator/.../min_microvolts 146What: /sys/class/regulator/.../min_microvolts
diff --git a/Documentation/ABI/testing/sysfs-firmware-acpi b/Documentation/ABI/testing/sysfs-firmware-acpi
index e8ffc70ffe12..4f9ba3c2fca7 100644
--- a/Documentation/ABI/testing/sysfs-firmware-acpi
+++ b/Documentation/ABI/testing/sysfs-firmware-acpi
@@ -69,9 +69,13 @@ Description:
69 gpe1F: 0 invalid 69 gpe1F: 0 invalid
70 gpe_all: 1192 70 gpe_all: 1192
71 sci: 1194 71 sci: 1194
72 sci_not: 0
72 73
73 sci - The total number of times the ACPI SCI 74 sci - The number of times the ACPI SCI
74 has claimed an interrupt. 75 has been called and claimed an interrupt.
76
77 sci_not - The number of times the ACPI SCI
78 has been called and NOT claimed an interrupt.
75 79
76 gpe_all - count of SCI caused by GPEs. 80 gpe_all - count of SCI caused by GPEs.
77 81
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
new file mode 100644
index 000000000000..4e79074de282
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -0,0 +1,81 @@
1What: /sys/fs/ext4/<disk>/mb_stats
2Date: March 2008
3Contact: "Theodore Ts'o" <tytso@mit.edu>
4Description:
5 Controls whether the multiblock allocator should
6 collect statistics, which are shown during the unmount.
7 1 means to collect statistics, 0 means not to collect
8 statistics
9
10What: /sys/fs/ext4/<disk>/mb_group_prealloc
11Date: March 2008
12Contact: "Theodore Ts'o" <tytso@mit.edu>
13Description:
14 The multiblock allocator will round up allocation
15 requests to a multiple of this tuning parameter if the
16 stripe size is not set in the ext4 superblock
17
18What: /sys/fs/ext4/<disk>/mb_max_to_scan
19Date: March 2008
20Contact: "Theodore Ts'o" <tytso@mit.edu>
21Description:
22 The maximum number of extents the multiblock allocator
23 will search to find the best extent
24
25What: /sys/fs/ext4/<disk>/mb_min_to_scan
26Date: March 2008
27Contact: "Theodore Ts'o" <tytso@mit.edu>
28Description:
29 The minimum number of extents the multiblock allocator
30 will search to find the best extent
31
32What: /sys/fs/ext4/<disk>/mb_order2_req
33Date: March 2008
34Contact: "Theodore Ts'o" <tytso@mit.edu>
35Description:
36 Tuning parameter which controls the minimum size for
37 requests (as a power of 2) where the buddy cache is
38 used
39
40What: /sys/fs/ext4/<disk>/mb_stream_req
41Date: March 2008
42Contact: "Theodore Ts'o" <tytso@mit.edu>
43Description:
44 Files which have fewer blocks than this tunable
45 parameter will have their blocks allocated out of a
46 block group specific preallocation pool, so that small
47 files are packed closely together. Each large file
48 will have its blocks allocated out of its own unique
49 preallocation pool.
50
51What: /sys/fs/ext4/<disk>/inode_readahead
52Date: March 2008
53Contact: "Theodore Ts'o" <tytso@mit.edu>
54Description:
55 Tuning parameter which controls the maximum number of
56 inode table blocks that ext4's inode table readahead
57 algorithm will pre-read into the buffer cache
58
59What: /sys/fs/ext4/<disk>/delayed_allocation_blocks
60Date: March 2008
61Contact: "Theodore Ts'o" <tytso@mit.edu>
62Description:
63 This file is read-only and shows the number of blocks
64 that are dirty in the page cache, but which do not
65 have their location in the filesystem allocated yet.
66
67What: /sys/fs/ext4/<disk>/lifetime_write_kbytes
68Date: March 2008
69Contact: "Theodore Ts'o" <tytso@mit.edu>
70Description:
71 This file is read-only and shows the number of kilobytes
72 of data that have been written to this filesystem since it was
73 created.
74
75What: /sys/fs/ext4/<disk>/session_write_kbytes
76Date: March 2008
77Contact: "Theodore Ts'o" <tytso@mit.edu>
78Description:
79 This file is read-only and shows the number of
80 kilobytes of data that have been written to this
81 filesystem since it was mounted.
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 2a3fcc55e981..d9aa43d78bcc 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -609,3 +609,109 @@ size is the size (and should be a page-sized multiple).
609The return value will be either a pointer to the processor virtual 609The return value will be either a pointer to the processor virtual
610address of the memory, or an error (via PTR_ERR()) if any part of the 610address of the memory, or an error (via PTR_ERR()) if any part of the
611region is occupied. 611region is occupied.
612
613Part III - Debug drivers use of the DMA-API
614-------------------------------------------
615
616The DMA-API as described above as some constraints. DMA addresses must be
617released with the corresponding function with the same size for example. With
618the advent of hardware IOMMUs it becomes more and more important that drivers
619do not violate those constraints. In the worst case such a violation can
620result in data corruption up to destroyed filesystems.
621
622To debug drivers and find bugs in the usage of the DMA-API checking code can
623be compiled into the kernel which will tell the developer about those
624violations. If your architecture supports it you can select the "Enable
625debugging of DMA-API usage" option in your kernel configuration. Enabling this
626option has a performance impact. Do not enable it in production kernels.
627
628If you boot the resulting kernel will contain code which does some bookkeeping
629about what DMA memory was allocated for which device. If this code detects an
630error it prints a warning message with some details into your kernel log. An
631example warning message may look like this:
632
633------------[ cut here ]------------
634WARNING: at /data2/repos/linux-2.6-iommu/lib/dma-debug.c:448
635 check_unmap+0x203/0x490()
636Hardware name:
637forcedeth 0000:00:08.0: DMA-API: device driver frees DMA memory with wrong
638 function [device address=0x00000000640444be] [size=66 bytes] [mapped as
639single] [unmapped as page]
640Modules linked in: nfsd exportfs bridge stp llc r8169
641Pid: 0, comm: swapper Tainted: G W 2.6.28-dmatest-09289-g8bb99c0 #1
642Call Trace:
643 <IRQ> [<ffffffff80240b22>] warn_slowpath+0xf2/0x130
644 [<ffffffff80647b70>] _spin_unlock+0x10/0x30
645 [<ffffffff80537e75>] usb_hcd_link_urb_to_ep+0x75/0xc0
646 [<ffffffff80647c22>] _spin_unlock_irqrestore+0x12/0x40
647 [<ffffffff8055347f>] ohci_urb_enqueue+0x19f/0x7c0
648 [<ffffffff80252f96>] queue_work+0x56/0x60
649 [<ffffffff80237e10>] enqueue_task_fair+0x20/0x50
650 [<ffffffff80539279>] usb_hcd_submit_urb+0x379/0xbc0
651 [<ffffffff803b78c3>] cpumask_next_and+0x23/0x40
652 [<ffffffff80235177>] find_busiest_group+0x207/0x8a0
653 [<ffffffff8064784f>] _spin_lock_irqsave+0x1f/0x50
654 [<ffffffff803c7ea3>] check_unmap+0x203/0x490
655 [<ffffffff803c8259>] debug_dma_unmap_page+0x49/0x50
656 [<ffffffff80485f26>] nv_tx_done_optimized+0xc6/0x2c0
657 [<ffffffff80486c13>] nv_nic_irq_optimized+0x73/0x2b0
658 [<ffffffff8026df84>] handle_IRQ_event+0x34/0x70
659 [<ffffffff8026ffe9>] handle_edge_irq+0xc9/0x150
660 [<ffffffff8020e3ab>] do_IRQ+0xcb/0x1c0
661 [<ffffffff8020c093>] ret_from_intr+0x0/0xa
662 <EOI> <4>---[ end trace f6435a98e2a38c0e ]---
663
664The driver developer can find the driver and the device including a stacktrace
665of the DMA-API call which caused this warning.
666
667Per default only the first error will result in a warning message. All other
668errors will only silently counted. This limitation exist to prevent the code
669from flooding your kernel log. To support debugging a device driver this can
670be disabled via debugfs. See the debugfs interface documentation below for
671details.
672
673The debugfs directory for the DMA-API debugging code is called dma-api/. In
674this directory the following files can currently be found:
675
676 dma-api/all_errors This file contains a numeric value. If this
677 value is not equal to zero the debugging code
678 will print a warning for every error it finds
679 into the kernel log. Be carefull with this
680 option. It can easily flood your logs.
681
682 dma-api/disabled This read-only file contains the character 'Y'
683 if the debugging code is disabled. This can
684 happen when it runs out of memory or if it was
685 disabled at boot time
686
687 dma-api/error_count This file is read-only and shows the total
688 numbers of errors found.
689
690 dma-api/num_errors The number in this file shows how many
691 warnings will be printed to the kernel log
692 before it stops. This number is initialized to
693 one at system boot and be set by writing into
694 this file
695
696 dma-api/min_free_entries
697 This read-only file can be read to get the
698 minimum number of free dma_debug_entries the
699 allocator has ever seen. If this value goes
700 down to zero the code will disable itself
701 because it is not longer reliable.
702
703 dma-api/num_free_entries
704 The current number of free dma_debug_entries
705 in the allocator.
706
707If you have this code compiled into your kernel it will be enabled by default.
708If you want to boot without the bookkeeping anyway you can provide
709'dma_debug=off' as a boot parameter. This will disable DMA-API debugging.
710Notice that you can not enable it again at runtime. You have to reboot to do
711so.
712
713When the code disables itself at runtime this is most likely because it ran
714out of dma_debug_entries. These entries are preallocated at boot. The number
715of preallocated entries is defined per architecture. If it is too low for you
716boot with 'dma_debug_entries=<your_desired_number>' to overwrite the
717architectural default.
diff --git a/Documentation/DMA-mapping.txt b/Documentation/DMA-mapping.txt
index b2a4d6d244d9..01f24e94bdb6 100644
--- a/Documentation/DMA-mapping.txt
+++ b/Documentation/DMA-mapping.txt
@@ -136,7 +136,7 @@ exactly why.
136The standard 32-bit addressing PCI device would do something like 136The standard 32-bit addressing PCI device would do something like
137this: 137this:
138 138
139 if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { 139 if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
140 printk(KERN_WARNING 140 printk(KERN_WARNING
141 "mydev: No suitable DMA available.\n"); 141 "mydev: No suitable DMA available.\n");
142 goto ignore_this_device; 142 goto ignore_this_device;
@@ -155,9 +155,9 @@ all 64-bits when accessing streaming DMA:
155 155
156 int using_dac; 156 int using_dac;
157 157
158 if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK)) { 158 if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
159 using_dac = 1; 159 using_dac = 1;
160 } else if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { 160 } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
161 using_dac = 0; 161 using_dac = 0;
162 } else { 162 } else {
163 printk(KERN_WARNING 163 printk(KERN_WARNING
@@ -170,14 +170,14 @@ the case would look like this:
170 170
171 int using_dac, consistent_using_dac; 171 int using_dac, consistent_using_dac;
172 172
173 if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK)) { 173 if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
174 using_dac = 1; 174 using_dac = 1;
175 consistent_using_dac = 1; 175 consistent_using_dac = 1;
176 pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK); 176 pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
177 } else if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { 177 } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
178 using_dac = 0; 178 using_dac = 0;
179 consistent_using_dac = 0; 179 consistent_using_dac = 0;
180 pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); 180 pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
181 } else { 181 } else {
182 printk(KERN_WARNING 182 printk(KERN_WARNING
183 "mydev: No suitable DMA available.\n"); 183 "mydev: No suitable DMA available.\n");
@@ -192,7 +192,7 @@ check the return value from pci_set_consistent_dma_mask().
192Finally, if your device can only drive the low 24-bits of 192Finally, if your device can only drive the low 24-bits of
193address during PCI bus mastering you might do something like: 193address during PCI bus mastering you might do something like:
194 194
195 if (pci_set_dma_mask(pdev, DMA_24BIT_MASK)) { 195 if (pci_set_dma_mask(pdev, DMA_BIT_MASK(24))) {
196 printk(KERN_WARNING 196 printk(KERN_WARNING
197 "mydev: 24-bit DMA addressing not available.\n"); 197 "mydev: 24-bit DMA addressing not available.\n");
198 goto ignore_this_device; 198 goto ignore_this_device;
@@ -213,7 +213,7 @@ most specific mask.
213 213
214Here is pseudo-code showing how this might be done: 214Here is pseudo-code showing how this might be done:
215 215
216 #define PLAYBACK_ADDRESS_BITS DMA_32BIT_MASK 216 #define PLAYBACK_ADDRESS_BITS DMA_BIT_MASK(32)
217 #define RECORD_ADDRESS_BITS 0x00ffffff 217 #define RECORD_ADDRESS_BITS 0x00ffffff
218 218
219 struct my_sound_card *card; 219 struct my_sound_card *card;
diff --git a/Documentation/DocBook/.gitignore b/Documentation/DocBook/.gitignore
index c102c02ecf89..c6def352fe39 100644
--- a/Documentation/DocBook/.gitignore
+++ b/Documentation/DocBook/.gitignore
@@ -4,3 +4,7 @@
4*.html 4*.html
5*.9.gz 5*.9.gz
6*.9 6*.9
7*.aux
8*.dvi
9*.log
10*.out
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index 1462ed86d40a..b1eb661e6302 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -12,7 +12,8 @@ DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \
12 kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml \ 12 kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml \
13 gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ 13 gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
14 genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ 14 genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \
15 mac80211.xml debugobjects.xml sh.xml regulator.xml 15 mac80211.xml debugobjects.xml sh.xml regulator.xml \
16 alsa-driver-api.xml writing-an-alsa-driver.xml
16 17
17### 18###
18# The build process is as follows (targets): 19# The build process is as follows (targets):
@@ -30,7 +31,7 @@ PS_METHOD = $(prefer-db2x)
30 31
31### 32###
32# The targets that may be used. 33# The targets that may be used.
33PHONY += xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs 34PHONY += xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs cleandocs
34 35
35BOOKS := $(addprefix $(obj)/,$(DOCBOOKS)) 36BOOKS := $(addprefix $(obj)/,$(DOCBOOKS))
36xmldocs: $(BOOKS) 37xmldocs: $(BOOKS)
@@ -142,7 +143,8 @@ quiet_cmd_db2pdf = PDF $@
142 $(call cmd,db2pdf) 143 $(call cmd,db2pdf)
143 144
144 145
145main_idx = Documentation/DocBook/index.html 146index = index.html
147main_idx = Documentation/DocBook/$(index)
146build_main_index = rm -rf $(main_idx) && \ 148build_main_index = rm -rf $(main_idx) && \
147 echo '<h1>Linux Kernel HTML Documentation</h1>' >> $(main_idx) && \ 149 echo '<h1>Linux Kernel HTML Documentation</h1>' >> $(main_idx) && \
148 echo '<h2>Kernel Version: $(KERNELVERSION)</h2>' >> $(main_idx) && \ 150 echo '<h2>Kernel Version: $(KERNELVERSION)</h2>' >> $(main_idx) && \
@@ -212,11 +214,12 @@ silent_gen_xml = :
212dochelp: 214dochelp:
213 @echo ' Linux kernel internal documentation in different formats:' 215 @echo ' Linux kernel internal documentation in different formats:'
214 @echo ' htmldocs - HTML' 216 @echo ' htmldocs - HTML'
215 @echo ' installmandocs - install man pages generated by mandocs'
216 @echo ' mandocs - man pages'
217 @echo ' pdfdocs - PDF' 217 @echo ' pdfdocs - PDF'
218 @echo ' psdocs - Postscript' 218 @echo ' psdocs - Postscript'
219 @echo ' xmldocs - XML DocBook' 219 @echo ' xmldocs - XML DocBook'
220 @echo ' mandocs - man pages'
221 @echo ' installmandocs - install man pages generated by mandocs'
222 @echo ' cleandocs - clean all generated DocBook files'
220 223
221### 224###
222# Temporary files left by various tools 225# Temporary files left by various tools
@@ -230,10 +233,14 @@ clean-files := $(DOCBOOKS) \
230 $(patsubst %.xml, %.pdf, $(DOCBOOKS)) \ 233 $(patsubst %.xml, %.pdf, $(DOCBOOKS)) \
231 $(patsubst %.xml, %.html, $(DOCBOOKS)) \ 234 $(patsubst %.xml, %.html, $(DOCBOOKS)) \
232 $(patsubst %.xml, %.9, $(DOCBOOKS)) \ 235 $(patsubst %.xml, %.9, $(DOCBOOKS)) \
233 $(C-procfs-example) 236 $(C-procfs-example) $(index)
234 237
235clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man 238clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man
236 239
240cleandocs:
241 $(Q)rm -f $(call objectify, $(clean-files))
242 $(Q)rm -rf $(call objectify, $(clean-dirs))
243
237# Declare the contents of the .PHONY variable as phony. We keep that 244# Declare the contents of the .PHONY variable as phony. We keep that
238# information in a variable se we can use it in if_changed and friends. 245# information in a variable se we can use it in if_changed and friends.
239 246
diff --git a/Documentation/sound/alsa/DocBook/alsa-driver-api.tmpl b/Documentation/DocBook/alsa-driver-api.tmpl
index 9d644f7e241e..0230a96f0564 100644
--- a/Documentation/sound/alsa/DocBook/alsa-driver-api.tmpl
+++ b/Documentation/DocBook/alsa-driver-api.tmpl
@@ -1,11 +1,11 @@
1<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook V4.1//EN"> 1<?xml version="1.0" encoding="UTF-8"?>
2 2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3<book> 3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4<?dbhtml filename="index.html">
5 4
6<!-- ****************************************************** --> 5<!-- ****************************************************** -->
7<!-- Header --> 6<!-- Header -->
8<!-- ****************************************************** --> 7<!-- ****************************************************** -->
8<book id="ALSA-Driver-API">
9 <bookinfo> 9 <bookinfo>
10 <title>The ALSA Driver API</title> 10 <title>The ALSA Driver API</title>
11 11
@@ -35,6 +35,8 @@
35 35
36 </bookinfo> 36 </bookinfo>
37 37
38<toc></toc>
39
38 <chapter><title>Management of Cards and Devices</title> 40 <chapter><title>Management of Cards and Devices</title>
39 <sect1><title>Card Management</title> 41 <sect1><title>Card Management</title>
40!Esound/core/init.c 42!Esound/core/init.c
@@ -71,6 +73,10 @@
71!Esound/pci/ac97/ac97_codec.c 73!Esound/pci/ac97/ac97_codec.c
72!Esound/pci/ac97/ac97_pcm.c 74!Esound/pci/ac97/ac97_pcm.c
73 </sect1> 75 </sect1>
76 <sect1><title>Virtual Master Control API</title>
77!Esound/core/vmaster.c
78!Iinclude/sound/control.h
79 </sect1>
74 </chapter> 80 </chapter>
75 <chapter><title>MIDI API</title> 81 <chapter><title>MIDI API</title>
76 <sect1><title>Raw MIDI API</title> 82 <sect1><title>Raw MIDI API</title>
@@ -89,6 +95,9 @@
89 <sect1><title>Hardware-Dependent Devices API</title> 95 <sect1><title>Hardware-Dependent Devices API</title>
90!Esound/core/hwdep.c 96!Esound/core/hwdep.c
91 </sect1> 97 </sect1>
98 <sect1><title>Jack Abstraction Layer API</title>
99!Esound/core/jack.c
100 </sect1>
92 <sect1><title>ISA DMA Helpers</title> 101 <sect1><title>ISA DMA Helpers</title>
93!Esound/core/isadma.c 102!Esound/core/isadma.c
94 </sect1> 103 </sect1>
diff --git a/Documentation/DocBook/genericirq.tmpl b/Documentation/DocBook/genericirq.tmpl
index 3a882d9a90a9..c671a0168096 100644
--- a/Documentation/DocBook/genericirq.tmpl
+++ b/Documentation/DocBook/genericirq.tmpl
@@ -440,6 +440,7 @@ desc->chip->end();
440 used in the generic IRQ layer. 440 used in the generic IRQ layer.
441 </para> 441 </para>
442!Iinclude/linux/irq.h 442!Iinclude/linux/irq.h
443!Iinclude/linux/interrupt.h
443 </chapter> 444 </chapter>
444 445
445 <chapter id="pubfunctions"> 446 <chapter id="pubfunctions">
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index bc962cda6504..44b3def961a2 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -190,15 +190,20 @@ X!Ekernel/module.c
190!Edrivers/pci/pci.c 190!Edrivers/pci/pci.c
191!Edrivers/pci/pci-driver.c 191!Edrivers/pci/pci-driver.c
192!Edrivers/pci/remove.c 192!Edrivers/pci/remove.c
193!Edrivers/pci/pci-acpi.c
194!Edrivers/pci/search.c 193!Edrivers/pci/search.c
195!Edrivers/pci/msi.c 194!Edrivers/pci/msi.c
196!Edrivers/pci/bus.c 195!Edrivers/pci/bus.c
196!Edrivers/pci/access.c
197!Edrivers/pci/irq.c
198!Edrivers/pci/htirq.c
197<!-- FIXME: Removed for now since no structured comments in source 199<!-- FIXME: Removed for now since no structured comments in source
198X!Edrivers/pci/hotplug.c 200X!Edrivers/pci/hotplug.c
199--> 201-->
200!Edrivers/pci/probe.c 202!Edrivers/pci/probe.c
203!Edrivers/pci/slot.c
201!Edrivers/pci/rom.c 204!Edrivers/pci/rom.c
205!Edrivers/pci/iov.c
206!Idrivers/pci/pci-sysfs.c
202 </sect1> 207 </sect1>
203 <sect1><title>PCI Hotplug Support Library</title> 208 <sect1><title>PCI Hotplug Support Library</title>
204!Edrivers/pci/hotplug/pci_hotplug_core.c 209!Edrivers/pci/hotplug/pci_hotplug_core.c
@@ -258,7 +263,7 @@ X!Earch/x86/kernel/mca_32.c
258!Eblock/blk-tag.c 263!Eblock/blk-tag.c
259!Iblock/blk-tag.c 264!Iblock/blk-tag.c
260!Eblock/blk-integrity.c 265!Eblock/blk-integrity.c
261!Iblock/blktrace.c 266!Ikernel/trace/blktrace.c
262!Iblock/genhd.c 267!Iblock/genhd.c
263!Eblock/genhd.c 268!Eblock/genhd.c
264 </chapter> 269 </chapter>
diff --git a/Documentation/DocBook/mac80211.tmpl b/Documentation/DocBook/mac80211.tmpl
index 77c3c202991b..fbeaffc1dcc3 100644
--- a/Documentation/DocBook/mac80211.tmpl
+++ b/Documentation/DocBook/mac80211.tmpl
@@ -17,8 +17,7 @@
17 </authorgroup> 17 </authorgroup>
18 18
19 <copyright> 19 <copyright>
20 <year>2007</year> 20 <year>2007-2009</year>
21 <year>2008</year>
22 <holder>Johannes Berg</holder> 21 <holder>Johannes Berg</holder>
23 </copyright> 22 </copyright>
24 23
@@ -165,8 +164,8 @@ usage should require reading the full document.
165!Pinclude/net/mac80211.h Frame format 164!Pinclude/net/mac80211.h Frame format
166 </sect1> 165 </sect1>
167 <sect1> 166 <sect1>
168 <title>Alignment issues</title> 167 <title>Packet alignment</title>
169 <para>TBD</para> 168!Pnet/mac80211/rx.c Packet alignment
170 </sect1> 169 </sect1>
171 <sect1> 170 <sect1>
172 <title>Calling into mac80211 from interrupts</title> 171 <title>Calling into mac80211 from interrupts</title>
@@ -223,6 +222,17 @@ usage should require reading the full document.
223!Finclude/net/mac80211.h ieee80211_key_flags 222!Finclude/net/mac80211.h ieee80211_key_flags
224 </chapter> 223 </chapter>
225 224
225 <chapter id="powersave">
226 <title>Powersave support</title>
227!Pinclude/net/mac80211.h Powersave support
228 </chapter>
229
230 <chapter id="beacon-filter">
231 <title>Beacon filter support</title>
232!Pinclude/net/mac80211.h Beacon filter support
233!Finclude/net/mac80211.h ieee80211_beacon_loss
234 </chapter>
235
226 <chapter id="qos"> 236 <chapter id="qos">
227 <title>Multiple queues and QoS support</title> 237 <title>Multiple queues and QoS support</title>
228 <para>TBD</para> 238 <para>TBD</para>
diff --git a/Documentation/DocBook/procfs_example.c b/Documentation/DocBook/procfs_example.c
index 8c6396e4bf31..a5b11793b1e0 100644
--- a/Documentation/DocBook/procfs_example.c
+++ b/Documentation/DocBook/procfs_example.c
@@ -117,9 +117,6 @@ static int __init init_procfs_example(void)
117 rv = -ENOMEM; 117 rv = -ENOMEM;
118 goto out; 118 goto out;
119 } 119 }
120
121 example_dir->owner = THIS_MODULE;
122
123 /* create jiffies using convenience function */ 120 /* create jiffies using convenience function */
124 jiffies_file = create_proc_read_entry("jiffies", 121 jiffies_file = create_proc_read_entry("jiffies",
125 0444, example_dir, 122 0444, example_dir,
@@ -130,8 +127,6 @@ static int __init init_procfs_example(void)
130 goto no_jiffies; 127 goto no_jiffies;
131 } 128 }
132 129
133 jiffies_file->owner = THIS_MODULE;
134
135 /* create foo and bar files using same callback 130 /* create foo and bar files using same callback
136 * functions 131 * functions
137 */ 132 */
@@ -146,7 +141,6 @@ static int __init init_procfs_example(void)
146 foo_file->data = &foo_data; 141 foo_file->data = &foo_data;
147 foo_file->read_proc = proc_read_foobar; 142 foo_file->read_proc = proc_read_foobar;
148 foo_file->write_proc = proc_write_foobar; 143 foo_file->write_proc = proc_write_foobar;
149 foo_file->owner = THIS_MODULE;
150 144
151 bar_file = create_proc_entry("bar", 0644, example_dir); 145 bar_file = create_proc_entry("bar", 0644, example_dir);
152 if(bar_file == NULL) { 146 if(bar_file == NULL) {
@@ -159,7 +153,6 @@ static int __init init_procfs_example(void)
159 bar_file->data = &bar_data; 153 bar_file->data = &bar_data;
160 bar_file->read_proc = proc_read_foobar; 154 bar_file->read_proc = proc_read_foobar;
161 bar_file->write_proc = proc_write_foobar; 155 bar_file->write_proc = proc_write_foobar;
162 bar_file->owner = THIS_MODULE;
163 156
164 /* create symlink */ 157 /* create symlink */
165 symlink = proc_symlink("jiffies_too", example_dir, 158 symlink = proc_symlink("jiffies_too", example_dir,
@@ -169,8 +162,6 @@ static int __init init_procfs_example(void)
169 goto no_symlink; 162 goto no_symlink;
170 } 163 }
171 164
172 symlink->owner = THIS_MODULE;
173
174 /* everything OK */ 165 /* everything OK */
175 printk(KERN_INFO "%s %s initialised\n", 166 printk(KERN_INFO "%s %s initialised\n",
176 MODULE_NAME, MODULE_VERS); 167 MODULE_NAME, MODULE_VERS);
diff --git a/Documentation/DocBook/uio-howto.tmpl b/Documentation/DocBook/uio-howto.tmpl
index 52e1b79ce0e6..8f6e3b2403c7 100644
--- a/Documentation/DocBook/uio-howto.tmpl
+++ b/Documentation/DocBook/uio-howto.tmpl
@@ -42,6 +42,13 @@ GPL version 2.
42 42
43<revhistory> 43<revhistory>
44 <revision> 44 <revision>
45 <revnumber>0.8</revnumber>
46 <date>2008-12-24</date>
47 <authorinitials>hjk</authorinitials>
48 <revremark>Added name attributes in mem and portio sysfs directories.
49 </revremark>
50 </revision>
51 <revision>
45 <revnumber>0.7</revnumber> 52 <revnumber>0.7</revnumber>
46 <date>2008-12-23</date> 53 <date>2008-12-23</date>
47 <authorinitials>hjk</authorinitials> 54 <authorinitials>hjk</authorinitials>
@@ -303,12 +310,19 @@ interested in translating it, please email me
303 appear if the size of the mapping is not 0. 310 appear if the size of the mapping is not 0.
304</para> 311</para>
305<para> 312<para>
306 Each <filename>mapX/</filename> directory contains two read-only files 313 Each <filename>mapX/</filename> directory contains four read-only files
307 that show start address and size of the memory: 314 that show attributes of the memory:
308</para> 315</para>
309<itemizedlist> 316<itemizedlist>
310<listitem> 317<listitem>
311 <para> 318 <para>
319 <filename>name</filename>: A string identifier for this mapping. This
320 is optional, the string can be empty. Drivers can set this to make it
321 easier for userspace to find the correct mapping.
322 </para>
323</listitem>
324<listitem>
325 <para>
312 <filename>addr</filename>: The address of memory that can be mapped. 326 <filename>addr</filename>: The address of memory that can be mapped.
313 </para> 327 </para>
314</listitem> 328</listitem>
@@ -366,12 +380,19 @@ offset = N * getpagesize();
366 <filename>/sys/class/uio/uioX/portio/</filename>. 380 <filename>/sys/class/uio/uioX/portio/</filename>.
367</para> 381</para>
368<para> 382<para>
369 Each <filename>portX/</filename> directory contains three read-only 383 Each <filename>portX/</filename> directory contains four read-only
370 files that show start, size, and type of the port region: 384 files that show name, start, size, and type of the port region:
371</para> 385</para>
372<itemizedlist> 386<itemizedlist>
373<listitem> 387<listitem>
374 <para> 388 <para>
389 <filename>name</filename>: A string identifier for this port region.
390 The string is optional and can be empty. Drivers can set it to make it
391 easier for userspace to find a certain port region.
392 </para>
393</listitem>
394<listitem>
395 <para>
375 <filename>start</filename>: The first port of this region. 396 <filename>start</filename>: The first port of this region.
376 </para> 397 </para>
377</listitem> 398</listitem>
diff --git a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl b/Documentation/DocBook/writing-an-alsa-driver.tmpl
index 87a7c07ab658..7a2e0e98986a 100644
--- a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
+++ b/Documentation/DocBook/writing-an-alsa-driver.tmpl
@@ -1,11 +1,11 @@
1<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook V4.1//EN"> 1<?xml version="1.0" encoding="UTF-8"?>
2 2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3<book> 3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4<?dbhtml filename="index.html">
5 4
6<!-- ****************************************************** --> 5<!-- ****************************************************** -->
7<!-- Header --> 6<!-- Header -->
8<!-- ****************************************************** --> 7<!-- ****************************************************** -->
8<book id="Writing-an-ALSA-Driver">
9 <bookinfo> 9 <bookinfo>
10 <title>Writing an ALSA Driver</title> 10 <title>Writing an ALSA Driver</title>
11 <author> 11 <author>
@@ -492,9 +492,9 @@
492 } 492 }
493 493
494 /* (2) */ 494 /* (2) */
495 card = snd_card_new(index[dev], id[dev], THIS_MODULE, 0); 495 err = snd_card_create(index[dev], id[dev], THIS_MODULE, 0, &card);
496 if (card == NULL) 496 if (err < 0)
497 return -ENOMEM; 497 return err;
498 498
499 /* (3) */ 499 /* (3) */
500 err = snd_mychip_create(card, pci, &chip); 500 err = snd_mychip_create(card, pci, &chip);
@@ -590,8 +590,9 @@
590 <programlisting> 590 <programlisting>
591<![CDATA[ 591<![CDATA[
592 struct snd_card *card; 592 struct snd_card *card;
593 int err;
593 .... 594 ....
594 card = snd_card_new(index[dev], id[dev], THIS_MODULE, 0); 595 err = snd_card_create(index[dev], id[dev], THIS_MODULE, 0, &card);
595]]> 596]]>
596 </programlisting> 597 </programlisting>
597 </informalexample> 598 </informalexample>
@@ -809,26 +810,28 @@
809 810
810 <para> 811 <para>
811 As mentioned above, to create a card instance, call 812 As mentioned above, to create a card instance, call
812 <function>snd_card_new()</function>. 813 <function>snd_card_create()</function>.
813 814
814 <informalexample> 815 <informalexample>
815 <programlisting> 816 <programlisting>
816<![CDATA[ 817<![CDATA[
817 struct snd_card *card; 818 struct snd_card *card;
818 card = snd_card_new(index, id, module, extra_size); 819 int err;
820 err = snd_card_create(index, id, module, extra_size, &card);
819]]> 821]]>
820 </programlisting> 822 </programlisting>
821 </informalexample> 823 </informalexample>
822 </para> 824 </para>
823 825
824 <para> 826 <para>
825 The function takes four arguments, the card-index number, the 827 The function takes five arguments, the card-index number, the
826 id string, the module pointer (usually 828 id string, the module pointer (usually
827 <constant>THIS_MODULE</constant>), 829 <constant>THIS_MODULE</constant>),
828 and the size of extra-data space. The last argument is used to 830 the size of extra-data space, and the pointer to return the
831 card instance. The extra_size argument is used to
829 allocate card-&gt;private_data for the 832 allocate card-&gt;private_data for the
830 chip-specific data. Note that these data 833 chip-specific data. Note that these data
831 are allocated by <function>snd_card_new()</function>. 834 are allocated by <function>snd_card_create()</function>.
832 </para> 835 </para>
833 </section> 836 </section>
834 837
@@ -915,15 +918,16 @@
915 </para> 918 </para>
916 919
917 <section id="card-management-chip-specific-snd-card-new"> 920 <section id="card-management-chip-specific-snd-card-new">
918 <title>1. Allocating via <function>snd_card_new()</function>.</title> 921 <title>1. Allocating via <function>snd_card_create()</function>.</title>
919 <para> 922 <para>
920 As mentioned above, you can pass the extra-data-length 923 As mentioned above, you can pass the extra-data-length
921 to the 4th argument of <function>snd_card_new()</function>, i.e. 924 to the 4th argument of <function>snd_card_create()</function>, i.e.
922 925
923 <informalexample> 926 <informalexample>
924 <programlisting> 927 <programlisting>
925<![CDATA[ 928<![CDATA[
926 card = snd_card_new(index[dev], id[dev], THIS_MODULE, sizeof(struct mychip)); 929 err = snd_card_create(index[dev], id[dev], THIS_MODULE,
930 sizeof(struct mychip), &card);
927]]> 931]]>
928 </programlisting> 932 </programlisting>
929 </informalexample> 933 </informalexample>
@@ -952,8 +956,8 @@
952 956
953 <para> 957 <para>
954 After allocating a card instance via 958 After allocating a card instance via
955 <function>snd_card_new()</function> (with 959 <function>snd_card_create()</function> (with
956 <constant>NULL</constant> on the 4th arg), call 960 <constant>0</constant> on the 4th arg), call
957 <function>kzalloc()</function>. 961 <function>kzalloc()</function>.
958 962
959 <informalexample> 963 <informalexample>
@@ -961,7 +965,7 @@
961<![CDATA[ 965<![CDATA[
962 struct snd_card *card; 966 struct snd_card *card;
963 struct mychip *chip; 967 struct mychip *chip;
964 card = snd_card_new(index[dev], id[dev], THIS_MODULE, NULL); 968 err = snd_card_create(index[dev], id[dev], THIS_MODULE, 0, &card);
965 ..... 969 .....
966 chip = kzalloc(sizeof(*chip), GFP_KERNEL); 970 chip = kzalloc(sizeof(*chip), GFP_KERNEL);
967]]> 971]]>
@@ -1133,8 +1137,8 @@
1133 if (err < 0) 1137 if (err < 0)
1134 return err; 1138 return err;
1135 /* check PCI availability (28bit DMA) */ 1139 /* check PCI availability (28bit DMA) */
1136 if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 || 1140 if (pci_set_dma_mask(pci, DMA_BIT_MASK(28)) < 0 ||
1137 pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) { 1141 pci_set_consistent_dma_mask(pci, DMA_BIT_MASK(28)) < 0) {
1138 printk(KERN_ERR "error to set 28bit mask DMA\n"); 1142 printk(KERN_ERR "error to set 28bit mask DMA\n");
1139 pci_disable_device(pci); 1143 pci_disable_device(pci);
1140 return -ENXIO; 1144 return -ENXIO;
@@ -1248,8 +1252,8 @@
1248 err = pci_enable_device(pci); 1252 err = pci_enable_device(pci);
1249 if (err < 0) 1253 if (err < 0)
1250 return err; 1254 return err;
1251 if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 || 1255 if (pci_set_dma_mask(pci, DMA_BIT_MASK(28)) < 0 ||
1252 pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) { 1256 pci_set_consistent_dma_mask(pci, DMA_BIT_MASK(28)) < 0) {
1253 printk(KERN_ERR "error to set 28bit mask DMA\n"); 1257 printk(KERN_ERR "error to set 28bit mask DMA\n");
1254 pci_disable_device(pci); 1258 pci_disable_device(pci);
1255 return -ENXIO; 1259 return -ENXIO;
@@ -5750,8 +5754,9 @@ struct _snd_pcm_runtime {
5750 .... 5754 ....
5751 struct snd_card *card; 5755 struct snd_card *card;
5752 struct mychip *chip; 5756 struct mychip *chip;
5757 int err;
5753 .... 5758 ....
5754 card = snd_card_new(index[dev], id[dev], THIS_MODULE, NULL); 5759 err = snd_card_create(index[dev], id[dev], THIS_MODULE, 0, &card);
5755 .... 5760 ....
5756 chip = kzalloc(sizeof(*chip), GFP_KERNEL); 5761 chip = kzalloc(sizeof(*chip), GFP_KERNEL);
5757 .... 5762 ....
@@ -5763,7 +5768,7 @@ struct _snd_pcm_runtime {
5763 </informalexample> 5768 </informalexample>
5764 5769
5765 When you created the chip data with 5770 When you created the chip data with
5766 <function>snd_card_new()</function>, it's anyway accessible 5771 <function>snd_card_create()</function>, it's anyway accessible
5767 via <structfield>private_data</structfield> field. 5772 via <structfield>private_data</structfield> field.
5768 5773
5769 <informalexample> 5774 <informalexample>
@@ -5775,9 +5780,10 @@ struct _snd_pcm_runtime {
5775 .... 5780 ....
5776 struct snd_card *card; 5781 struct snd_card *card;
5777 struct mychip *chip; 5782 struct mychip *chip;
5783 int err;
5778 .... 5784 ....
5779 card = snd_card_new(index[dev], id[dev], THIS_MODULE, 5785 err = snd_card_create(index[dev], id[dev], THIS_MODULE,
5780 sizeof(struct mychip)); 5786 sizeof(struct mychip), &card);
5781 .... 5787 ....
5782 chip = card->private_data; 5788 chip = card->private_data;
5783 .... 5789 ....
diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt
index 256defd7e174..dcf7acc720e1 100644
--- a/Documentation/PCI/MSI-HOWTO.txt
+++ b/Documentation/PCI/MSI-HOWTO.txt
@@ -4,506 +4,356 @@
4 Revised Feb 12, 2004 by Martine Silbermann 4 Revised Feb 12, 2004 by Martine Silbermann
5 email: Martine.Silbermann@hp.com 5 email: Martine.Silbermann@hp.com
6 Revised Jun 25, 2004 by Tom L Nguyen 6 Revised Jun 25, 2004 by Tom L Nguyen
7 Revised Jul 9, 2008 by Matthew Wilcox <willy@linux.intel.com>
8 Copyright 2003, 2008 Intel Corporation
7 9
81. About this guide 101. About this guide
9 11
10This guide describes the basics of Message Signaled Interrupts (MSI), 12This guide describes the basics of Message Signaled Interrupts (MSIs),
11the advantages of using MSI over traditional interrupt mechanisms, 13the advantages of using MSI over traditional interrupt mechanisms, how
12and how to enable your driver to use MSI or MSI-X. Also included is 14to change your driver to use MSI or MSI-X and some basic diagnostics to
13a Frequently Asked Questions (FAQ) section. 15try if a device doesn't support MSIs.
14
151.1 Terminology
16
17PCI devices can be single-function or multi-function. In either case,
18when this text talks about enabling or disabling MSI on a "device
19function," it is referring to one specific PCI device and function and
20not to all functions on a PCI device (unless the PCI device has only
21one function).
22
232. Copyright 2003 Intel Corporation
24
253. What is MSI/MSI-X?
26
27Message Signaled Interrupt (MSI), as described in the PCI Local Bus
28Specification Revision 2.3 or later, is an optional feature, and a
29required feature for PCI Express devices. MSI enables a device function
30to request service by sending an Inbound Memory Write on its PCI bus to
31the FSB as a Message Signal Interrupt transaction. Because MSI is
32generated in the form of a Memory Write, all transaction conditions,
33such as a Retry, Master-Abort, Target-Abort or normal completion, are
34supported.
35
36A PCI device that supports MSI must also support pin IRQ assertion
37interrupt mechanism to provide backward compatibility for systems that
38do not support MSI. In systems which support MSI, the bus driver is
39responsible for initializing the message address and message data of
40the device function's MSI/MSI-X capability structure during device
41initial configuration.
42
43An MSI capable device function indicates MSI support by implementing
44the MSI/MSI-X capability structure in its PCI capability list. The
45device function may implement both the MSI capability structure and
46the MSI-X capability structure; however, the bus driver should not
47enable both.
48
49The MSI capability structure contains Message Control register,
50Message Address register and Message Data register. These registers
51provide the bus driver control over MSI. The Message Control register
52indicates the MSI capability supported by the device. The Message
53Address register specifies the target address and the Message Data
54register specifies the characteristics of the message. To request
55service, the device function writes the content of the Message Data
56register to the target address. The device and its software driver
57are prohibited from writing to these registers.
58
59The MSI-X capability structure is an optional extension to MSI. It
60uses an independent and separate capability structure. There are
61some key advantages to implementing the MSI-X capability structure
62over the MSI capability structure as described below.
63
64 - Support a larger maximum number of vectors per function.
65
66 - Provide the ability for system software to configure
67 each vector with an independent message address and message
68 data, specified by a table that resides in Memory Space.
69
70 - MSI and MSI-X both support per-vector masking. Per-vector
71 masking is an optional extension of MSI but a required
72 feature for MSI-X. Per-vector masking provides the kernel the
73 ability to mask/unmask a single MSI while running its
74 interrupt service routine. If per-vector masking is
75 not supported, then the device driver should provide the
76 hardware/software synchronization to ensure that the device
77 generates MSI when the driver wants it to do so.
78
794. Why use MSI?
80
81As a benefit to the simplification of board design, MSI allows board
82designers to remove out-of-band interrupt routing. MSI is another
83step towards a legacy-free environment.
84
85Due to increasing pressure on chipset and processor packages to
86reduce pin count, the need for interrupt pins is expected to
87diminish over time. Devices, due to pin constraints, may implement
88messages to increase performance.
89
90PCI Express endpoints uses INTx emulation (in-band messages) instead
91of IRQ pin assertion. Using INTx emulation requires interrupt
92sharing among devices connected to the same node (PCI bridge) while
93MSI is unique (non-shared) and does not require BIOS configuration
94support. As a result, the PCI Express technology requires MSI
95support for better interrupt performance.
96
97Using MSI enables the device functions to support two or more
98vectors, which can be configured to target different CPUs to
99increase scalability.
100
1015. Configuring a driver to use MSI/MSI-X
102
103By default, the kernel will not enable MSI/MSI-X on all devices that
104support this capability. The CONFIG_PCI_MSI kernel option
105must be selected to enable MSI/MSI-X support.
106
1075.1 Including MSI/MSI-X support into the kernel
108
109To allow MSI/MSI-X capable device drivers to selectively enable
110MSI/MSI-X (using pci_enable_msi()/pci_enable_msix() as described
111below), the VECTOR based scheme needs to be enabled by setting
112CONFIG_PCI_MSI during kernel config.
113
114Since the target of the inbound message is the local APIC, providing
115CONFIG_X86_LOCAL_APIC must be enabled as well as CONFIG_PCI_MSI.
116
1175.2 Configuring for MSI support
118
119Due to the non-contiguous fashion in vector assignment of the
120existing Linux kernel, this version does not support multiple
121messages regardless of a device function is capable of supporting
122more than one vector. To enable MSI on a device function's MSI
123capability structure requires a device driver to call the function
124pci_enable_msi() explicitly.
125
1265.2.1 API pci_enable_msi
127 16
128int pci_enable_msi(struct pci_dev *dev)
129 17
130With this new API, a device driver that wants to have MSI 182. What are MSIs?
131enabled on its device function must call this API to enable MSI.
132A successful call will initialize the MSI capability structure
133with ONE vector, regardless of whether a device function is
134capable of supporting multiple messages. This vector replaces the
135pre-assigned dev->irq with a new MSI vector. To avoid a conflict
136of the new assigned vector with existing pre-assigned vector requires
137a device driver to call this API before calling request_irq().
138 19
1395.2.2 API pci_disable_msi 20A Message Signaled Interrupt is a write from the device to a special
21address which causes an interrupt to be received by the CPU.
140 22
141void pci_disable_msi(struct pci_dev *dev) 23The MSI capability was first specified in PCI 2.2 and was later enhanced
24in PCI 3.0 to allow each interrupt to be masked individually. The MSI-X
25capability was also introduced with PCI 3.0. It supports more interrupts
26per device than MSI and allows interrupts to be independently configured.
142 27
143This API should always be used to undo the effect of pci_enable_msi() 28Devices may support both MSI and MSI-X, but only one can be enabled at
144when a device driver is unloading. This API restores dev->irq with 29a time.
145the pre-assigned IOAPIC vector and switches a device's interrupt
146mode to PCI pin-irq assertion/INTx emulation mode.
147
148Note that a device driver should always call free_irq() on the MSI vector
149that it has done request_irq() on before calling this API. Failure to do
150so results in a BUG_ON() and a device will be left with MSI enabled and
151leaks its vector.
152
1535.2.3 MSI mode vs. legacy mode diagram
154
155The below diagram shows the events which switch the interrupt
156mode on the MSI-capable device function between MSI mode and
157PIN-IRQ assertion mode.
158
159 ------------ pci_enable_msi ------------------------
160 | | <=============== | |
161 | MSI MODE | | PIN-IRQ ASSERTION MODE |
162 | | ===============> | |
163 ------------ pci_disable_msi ------------------------
164
165
166Figure 1. MSI Mode vs. Legacy Mode
167
168In Figure 1, a device operates by default in legacy mode. Legacy
169in this context means PCI pin-irq assertion or PCI-Express INTx
170emulation. A successful MSI request (using pci_enable_msi()) switches
171a device's interrupt mode to MSI mode. A pre-assigned IOAPIC vector
172stored in dev->irq will be saved by the PCI subsystem and a new
173assigned MSI vector will replace dev->irq.
174
175To return back to its default mode, a device driver should always call
176pci_disable_msi() to undo the effect of pci_enable_msi(). Note that a
177device driver should always call free_irq() on the MSI vector it has
178done request_irq() on before calling pci_disable_msi(). Failure to do
179so results in a BUG_ON() and a device will be left with MSI enabled and
180leaks its vector. Otherwise, the PCI subsystem restores a device's
181dev->irq with a pre-assigned IOAPIC vector and marks the released
182MSI vector as unused.
183
184Once being marked as unused, there is no guarantee that the PCI
185subsystem will reserve this MSI vector for a device. Depending on
186the availability of current PCI vector resources and the number of
187MSI/MSI-X requests from other drivers, this MSI may be re-assigned.
188
189For the case where the PCI subsystem re-assigns this MSI vector to
190another driver, a request to switch back to MSI mode may result
191in being assigned a different MSI vector or a failure if no more
192vectors are available.
193
1945.3 Configuring for MSI-X support
195
196Due to the ability of the system software to configure each vector of
197the MSI-X capability structure with an independent message address
198and message data, the non-contiguous fashion in vector assignment of
199the existing Linux kernel has no impact on supporting multiple
200messages on an MSI-X capable device functions. To enable MSI-X on
201a device function's MSI-X capability structure requires its device
202driver to call the function pci_enable_msix() explicitly.
203
204The function pci_enable_msix(), once invoked, enables either
205all or nothing, depending on the current availability of PCI vector
206resources. If the PCI vector resources are available for the number
207of vectors requested by a device driver, this function will configure
208the MSI-X table of the MSI-X capability structure of a device with
209requested messages. To emphasize this reason, for example, a device
210may be capable for supporting the maximum of 32 vectors while its
211software driver usually may request 4 vectors. It is recommended
212that the device driver should call this function once during the
213initialization phase of the device driver.
214
215Unlike the function pci_enable_msi(), the function pci_enable_msix()
216does not replace the pre-assigned IOAPIC dev->irq with a new MSI
217vector because the PCI subsystem writes the 1:1 vector-to-entry mapping
218into the field vector of each element contained in a second argument.
219Note that the pre-assigned IOAPIC dev->irq is valid only if the device
220operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt at
221using dev->irq by the device driver to request for interrupt service
222may result in unpredictable behavior.
223
224For each MSI-X vector granted, a device driver is responsible for calling
225other functions like request_irq(), enable_irq(), etc. to enable
226this vector with its corresponding interrupt service handler. It is
227a device driver's choice to assign all vectors with the same
228interrupt service handler or each vector with a unique interrupt
229service handler.
230
2315.3.1 Handling MMIO address space of MSI-X Table
232
233The PCI 3.0 specification has implementation notes that MMIO address
234space for a device's MSI-X structure should be isolated so that the
235software system can set different pages for controlling accesses to the
236MSI-X structure. The implementation of MSI support requires the PCI
237subsystem, not a device driver, to maintain full control of the MSI-X
238table/MSI-X PBA (Pending Bit Array) and MMIO address space of the MSI-X
239table/MSI-X PBA. A device driver should not access the MMIO address
240space of the MSI-X table/MSI-X PBA.
241
2425.3.2 API pci_enable_msix
243 30
244int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
245 31
246This API enables a device driver to request the PCI subsystem 323. Why use MSIs?
247to enable MSI-X messages on its hardware device. Depending on 33
248the availability of PCI vectors resources, the PCI subsystem enables 34There are three reasons why using MSIs can give an advantage over
249either all or none of the requested vectors. 35traditional pin-based interrupts.
36
37Pin-based PCI interrupts are often shared amongst several devices.
38To support this, the kernel must call each interrupt handler associated
39with an interrupt, which leads to reduced performance for the system as
40a whole. MSIs are never shared, so this problem cannot arise.
41
42When a device writes data to memory, then raises a pin-based interrupt,
43it is possible that the interrupt may arrive before all the data has
44arrived in memory (this becomes more likely with devices behind PCI-PCI
45bridges). In order to ensure that all the data has arrived in memory,
46the interrupt handler must read a register on the device which raised
47the interrupt. PCI transaction ordering rules require that all the data
48arrives in memory before the value can be returned from the register.
49Using MSIs avoids this problem as the interrupt-generating write cannot
50pass the data writes, so by the time the interrupt is raised, the driver
51knows that all the data has arrived in memory.
52
53PCI devices can only support a single pin-based interrupt per function.
54Often drivers have to query the device to find out what event has
55occurred, slowing down interrupt handling for the common case. With
56MSIs, a device can support more interrupts, allowing each interrupt
57to be specialised to a different purpose. One possible design gives
58infrequent conditions (such as errors) their own interrupt which allows
59the driver to handle the normal interrupt handling path more efficiently.
60Other possible designs include giving one interrupt to each packet queue
61in a network card or each port in a storage controller.
62
63
644. How to use MSIs
65
66PCI devices are initialised to use pin-based interrupts. The device
67driver has to set up the device to use MSI or MSI-X. Not all machines
68support MSIs correctly, and for those machines, the APIs described below
69will simply fail and the device will continue to use pin-based interrupts.
70
714.1 Include kernel support for MSIs
72
73To support MSI or MSI-X, the kernel must be built with the CONFIG_PCI_MSI
74option enabled. This option is only available on some architectures,
75and it may depend on some other options also being set. For example,
76on x86, you must also enable X86_UP_APIC or SMP in order to see the
77CONFIG_PCI_MSI option.
78
794.2 Using MSI
80
81Most of the hard work is done for the driver in the PCI layer. It simply
82has to request that the PCI layer set up the MSI capability for this
83device.
84
854.2.1 pci_enable_msi
86
87int pci_enable_msi(struct pci_dev *dev)
88
89A successful call will allocate ONE interrupt to the device, regardless
90of how many MSIs the device supports. The device will be switched from
91pin-based interrupt mode to MSI mode. The dev->irq number is changed
92to a new number which represents the message signaled interrupt.
93This function should be called before the driver calls request_irq()
94since enabling MSIs disables the pin-based IRQ and the driver will not
95receive interrupts on the old interrupt.
96
974.2.2 pci_enable_msi_block
98
99int pci_enable_msi_block(struct pci_dev *dev, int count)
100
101This variation on the above call allows a device driver to request multiple
102MSIs. The MSI specification only allows interrupts to be allocated in
103powers of two, up to a maximum of 2^5 (32).
104
105If this function returns 0, it has succeeded in allocating at least as many
106interrupts as the driver requested (it may have allocated more in order
107to satisfy the power-of-two requirement). In this case, the function
108enables MSI on this device and updates dev->irq to be the lowest of
109the new interrupts assigned to it. The other interrupts assigned to
110the device are in the range dev->irq to dev->irq + count - 1.
111
112If this function returns a negative number, it indicates an error and
113the driver should not attempt to request any more MSI interrupts for
114this device. If this function returns a positive number, it will be
115less than 'count' and indicate the number of interrupts that could have
116been allocated. In neither case will the irq value have been
117updated, nor will the device have been switched into MSI mode.
118
119The device driver must decide what action to take if
120pci_enable_msi_block() returns a value less than the number asked for.
121Some devices can make use of fewer interrupts than the maximum they
122request; in this case the driver should call pci_enable_msi_block()
123again. Note that it is not guaranteed to succeed, even when the
124'count' has been reduced to the value returned from a previous call to
125pci_enable_msi_block(). This is because there are multiple constraints
126on the number of vectors that can be allocated; pci_enable_msi_block()
127will return as soon as it finds any constraint that doesn't allow the
128call to succeed.
129
1304.2.3 pci_disable_msi
131
132void pci_disable_msi(struct pci_dev *dev)
250 133
251Argument 'dev' points to the device (pci_dev) structure. 134This function should be used to undo the effect of pci_enable_msi() or
135pci_enable_msi_block(). Calling it restores dev->irq to the pin-based
136interrupt number and frees the previously allocated message signaled
137interrupt(s). The interrupt may subsequently be assigned to another
138device, so drivers should not cache the value of dev->irq.
252 139
253Argument 'entries' is a pointer to an array of msix_entry structs. 140A device driver must always call free_irq() on the interrupt(s)
254The number of entries is indicated in argument 'nvec'. 141for which it has called request_irq() before calling this function.
255struct msix_entry is defined in /driver/pci/msi.h: 142Failure to do so will result in a BUG_ON(), the device will be left with
143MSI enabled and will leak its vector.
144
1454.3 Using MSI-X
146
147The MSI-X capability is much more flexible than the MSI capability.
148It supports up to 2048 interrupts, each of which can be controlled
149independently. To support this flexibility, drivers must use an array of
150`struct msix_entry':
256 151
257struct msix_entry { 152struct msix_entry {
258 u16 vector; /* kernel uses to write alloc vector */ 153 u16 vector; /* kernel uses to write alloc vector */
259 u16 entry; /* driver uses to specify entry */ 154 u16 entry; /* driver uses to specify entry */
260}; 155};
261 156
262A device driver is responsible for initializing the field 'entry' of 157This allows for the device to use these interrupts in a sparse fashion;
263each element with a unique entry supported by MSI-X table. Otherwise, 158for example it could use interrupts 3 and 1027 and allocate only a
264-EINVAL will be returned as a result. A successful return of zero 159two-element array. The driver is expected to fill in the 'entry' value
265indicates the PCI subsystem completed initializing each of the requested 160in each element of the array to indicate which entries it wants the kernel
266entries of the MSI-X table with message address and message data. 161to assign interrupts for. It is invalid to fill in two entries with the
267Last but not least, the PCI subsystem will write the 1:1 162same number.
268vector-to-entry mapping into the field 'vector' of each element. A 163
269device driver is responsible for keeping track of allocated MSI-X 1644.3.1 pci_enable_msix
270vectors in its internal data structure. 165
271 166int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
272A return of zero indicates that the number of MSI-X vectors was 167
273successfully allocated. A return of greater than zero indicates 168Calling this function asks the PCI subsystem to allocate 'nvec' MSIs.
274MSI-X vector shortage. Or a return of less than zero indicates 169The 'entries' argument is a pointer to an array of msix_entry structs
275a failure. This failure may be a result of duplicate entries 170which should be at least 'nvec' entries in size. On success, the
276specified in second argument, or a result of no available vector, 171function will return 0 and the device will have been switched into
277or a result of failing to initialize MSI-X table entries. 172MSI-X interrupt mode. The 'vector' elements in each entry will have
278 173been filled in with the interrupt number. The driver should then call
2795.3.3 API pci_disable_msix 174request_irq() for each 'vector' that it decides to use.
175
176If this function returns a negative number, it indicates an error and
177the driver should not attempt to allocate any more MSI-X interrupts for
178this device. If it returns a positive number, it indicates the maximum
179number of interrupt vectors that could have been allocated. See example
180below.
181
182This function, in contrast with pci_enable_msi(), does not adjust
183dev->irq. The device will not generate interrupts for this interrupt
184number once MSI-X is enabled. The device driver is responsible for
185keeping track of the interrupts assigned to the MSI-X vectors so it can
186free them again later.
187
188Device drivers should normally call this function once per device
189during the initialization phase.
190
191It is ideal if drivers can cope with a variable number of MSI-X interrupts,
192there are many reasons why the platform may not be able to provide the
193exact number a driver asks for.
194
195A request loop to achieve that might look like:
196
197static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec)
198{
199 while (nvec >= FOO_DRIVER_MINIMUM_NVEC) {
200 rc = pci_enable_msix(adapter->pdev,
201 adapter->msix_entries, nvec);
202 if (rc > 0)
203 nvec = rc;
204 else
205 return rc;
206 }
207
208 return -ENOSPC;
209}
210
2114.3.2 pci_disable_msix
280 212
281void pci_disable_msix(struct pci_dev *dev) 213void pci_disable_msix(struct pci_dev *dev)
282 214
283This API should always be used to undo the effect of pci_enable_msix() 215This API should be used to undo the effect of pci_enable_msix(). It frees
284when a device driver is unloading. Note that a device driver should 216the previously allocated message signaled interrupts. The interrupts may
285always call free_irq() on all MSI-X vectors it has done request_irq() 217subsequently be assigned to another device, so drivers should not cache
286on before calling this API. Failure to do so results in a BUG_ON() and 218the value of the 'vector' elements over a call to pci_disable_msix().
287a device will be left with MSI-X enabled and leaks its vectors. 219
288 220A device driver must always call free_irq() on the interrupt(s)
2895.3.4 MSI-X mode vs. legacy mode diagram 221for which it has called request_irq() before calling this function.
290 222Failure to do so will result in a BUG_ON(), the device will be left with
291The below diagram shows the events which switch the interrupt 223MSI enabled and will leak its vector.
292mode on the MSI-X capable device function between MSI-X mode and 224
293PIN-IRQ assertion mode (legacy). 2254.3.3 The MSI-X Table
294 226
295 ------------ pci_enable_msix(,,n) ------------------------ 227The MSI-X capability specifies a BAR and offset within that BAR for the
296 | | <=============== | | 228MSI-X Table. This address is mapped by the PCI subsystem, and should not
297 | MSI-X MODE | | PIN-IRQ ASSERTION MODE | 229be accessed directly by the device driver. If the driver wishes to
298 | | ===============> | | 230mask or unmask an interrupt, it should call disable_irq() / enable_irq().
299 ------------ pci_disable_msix ------------------------ 231
300 2324.4 Handling devices implementing both MSI and MSI-X capabilities
301Figure 2. MSI-X Mode vs. Legacy Mode 233
302 234If a device implements both MSI and MSI-X capabilities, it can
303In Figure 2, a device operates by default in legacy mode. A 235run in either MSI mode or MSI-X mode but not both simultaneously.
304successful MSI-X request (using pci_enable_msix()) switches a 236This is a requirement of the PCI spec, and it is enforced by the
305device's interrupt mode to MSI-X mode. A pre-assigned IOAPIC vector 237PCI layer. Calling pci_enable_msi() when MSI-X is already enabled or
306stored in dev->irq will be saved by the PCI subsystem; however, 238pci_enable_msix() when MSI is already enabled will result in an error.
307unlike MSI mode, the PCI subsystem will not replace dev->irq with 239If a device driver wishes to switch between MSI and MSI-X at runtime,
308assigned MSI-X vector because the PCI subsystem already writes the 1:1 240it must first quiesce the device, then switch it back to pin-interrupt
309vector-to-entry mapping into the field 'vector' of each element 241mode, before calling pci_enable_msi() or pci_enable_msix() and resuming
310specified in second argument. 242operation. This is not expected to be a common operation but may be
311 243useful for debugging or testing during development.
312To return back to its default mode, a device driver should always call 244
313pci_disable_msix() to undo the effect of pci_enable_msix(). Note that 2454.5 Considerations when using MSIs
314a device driver should always call free_irq() on all MSI-X vectors it 246
315has done request_irq() on before calling pci_disable_msix(). Failure 2474.5.1 Choosing between MSI-X and MSI
316to do so results in a BUG_ON() and a device will be left with MSI-X 248
317enabled and leaks its vectors. Otherwise, the PCI subsystem switches a 249If your device supports both MSI-X and MSI capabilities, you should use
318device function's interrupt mode from MSI-X mode to legacy mode and 250the MSI-X facilities in preference to the MSI facilities. As mentioned
319marks all allocated MSI-X vectors as unused. 251above, MSI-X supports any number of interrupts between 1 and 2048.
320 252In constrast, MSI is restricted to a maximum of 32 interrupts (and
321Once being marked as unused, there is no guarantee that the PCI 253must be a power of two). In addition, the MSI interrupt vectors must
322subsystem will reserve these MSI-X vectors for a device. Depending on 254be allocated consecutively, so the system may not be able to allocate
323the availability of current PCI vector resources and the number of 255as many vectors for MSI as it could for MSI-X. On some platforms, MSI
324MSI/MSI-X requests from other drivers, these MSI-X vectors may be 256interrupts must all be targetted at the same set of CPUs whereas MSI-X
325re-assigned. 257interrupts can all be targetted at different CPUs.
326 258
327For the case where the PCI subsystem re-assigned these MSI-X vectors 2594.5.2 Spinlocks
328to other drivers, a request to switch back to MSI-X mode may result 260
329being assigned with another set of MSI-X vectors or a failure if no 261Most device drivers have a per-device spinlock which is taken in the
330more vectors are available. 262interrupt handler. With pin-based interrupts or a single MSI, it is not
331 263necessary to disable interrupts (Linux guarantees the same interrupt will
3325.4 Handling function implementing both MSI and MSI-X capabilities 264not be re-entered). If a device uses multiple interrupts, the driver
333 265must disable interrupts while the lock is held. If the device sends
334For the case where a function implements both MSI and MSI-X 266a different interrupt, the driver will deadlock trying to recursively
335capabilities, the PCI subsystem enables a device to run either in MSI 267acquire the spinlock.
336mode or MSI-X mode but not both. A device driver determines whether it 268
337wants MSI or MSI-X enabled on its hardware device. Once a device 269There are two solutions. The first is to take the lock with
338driver requests for MSI, for example, it is prohibited from requesting 270spin_lock_irqsave() or spin_lock_irq() (see
339MSI-X; in other words, a device driver is not permitted to ping-pong 271Documentation/DocBook/kernel-locking). The second is to specify
340between MSI mod MSI-X mode during a run-time. 272IRQF_DISABLED to request_irq() so that the kernel runs the entire
341 273interrupt routine with interrupts disabled.
3425.5 Hardware requirements for MSI/MSI-X support 274
343 275If your MSI interrupt routine does not hold the lock for the whole time
344MSI/MSI-X support requires support from both system hardware and 276it is running, the first solution may be best. The second solution is
345individual hardware device functions. 277normally preferred as it avoids making two transitions from interrupt
346 278disabled to enabled and back again.
3475.5.1 Required x86 hardware support 279
348 2804.6 How to tell whether MSI/MSI-X is enabled on a device
349Since the target of MSI address is the local APIC CPU, enabling 281
350MSI/MSI-X support in the Linux kernel is dependent on whether existing 282Using 'lspci -v' (as root) may show some devices with "MSI", "Message
351system hardware supports local APIC. Users should verify that their 283Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities
352system supports local APIC operation by testing that it runs when 284has an 'Enable' flag which will be followed with either "+" (enabled)
353CONFIG_X86_LOCAL_APIC=y. 285or "-" (disabled).
354 286
355In SMP environment, CONFIG_X86_LOCAL_APIC is automatically set; 287
356however, in UP environment, users must manually set 2885. MSI quirks
357CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting 289
358CONFIG_PCI_MSI enables the VECTOR based scheme and the option for 290Several PCI chipsets or devices are known not to support MSIs.
359MSI-capable device drivers to selectively enable MSI/MSI-X. 291The PCI stack provides three ways to disable MSIs:
360 292
361Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI/MSI-X 2931. globally
362vector is allocated new during runtime and MSI/MSI-X support does not 2942. on all devices behind a specific bridge
363depend on BIOS support. This key independency enables MSI/MSI-X 2953. on a single device
364support on future IOxAPIC free platforms. 296
365 2975.1. Disabling MSIs globally
3665.5.2 Device hardware support 298
367 299Some host chipsets simply don't support MSIs properly. If we're
368The hardware device function supports MSI by indicating the 300lucky, the manufacturer knows this and has indicated it in the ACPI
369MSI/MSI-X capability structure on its PCI capability list. By 301FADT table. In this case, Linux will automatically disable MSIs.
370default, this capability structure will not be initialized by 302Some boards don't include this information in the table and so we have
371the kernel to enable MSI during the system boot. In other words, 303to detect them ourselves. The complete list of these is found near the
372the device function is running on its default pin assertion mode. 304quirk_disable_all_msi() function in drivers/pci/quirks.c.
373Note that in many cases the hardware supporting MSI have bugs, 305
374which may result in system hangs. The software driver of specific 306If you have a board which has problems with MSIs, you can pass pci=nomsi
375MSI-capable hardware is responsible for deciding whether to call 307on the kernel command line to disable MSIs on all devices. It would be
376pci_enable_msi or not. A return of zero indicates the kernel 308in your best interests to report the problem to linux-pci@vger.kernel.org
377successfully initialized the MSI/MSI-X capability structure of the 309including a full 'lspci -v' so we can add the quirks to the kernel.
378device function. The device function is now running on MSI/MSI-X mode. 310
379 3115.2. Disabling MSIs below a bridge
3805.6 How to tell whether MSI/MSI-X is enabled on device function 312
381 313Some PCI bridges are not able to route MSIs between busses properly.
382At the driver level, a return of zero from the function call of 314In this case, MSIs must be disabled on all devices behind the bridge.
383pci_enable_msi()/pci_enable_msix() indicates to a device driver that 315
384its device function is initialized successfully and ready to run in 316Some bridges allow you to enable MSIs by changing some bits in their
385MSI/MSI-X mode. 317PCI configuration space (especially the Hypertransport chipsets such
386 318as the nVidia nForce and Serverworks HT2000). As with host chipsets,
387At the user level, users can use the command 'cat /proc/interrupts' 319Linux mostly knows about them and automatically enables MSIs if it can.
388to display the vectors allocated for devices and their interrupt 320If you have a bridge which Linux doesn't yet know about, you can enable
389MSI/MSI-X modes ("PCI-MSI"/"PCI-MSI-X"). Below shows MSI mode is 321MSIs in configuration space using whatever method you know works, then
390enabled on a SCSI Adaptec 39320D Ultra320 controller. 322enable MSIs on that bridge by doing:
391 323
392 CPU0 CPU1 324 echo 1 > /sys/bus/pci/devices/$bridge/msi_bus
393 0: 324639 0 IO-APIC-edge timer 325
394 1: 1186 0 IO-APIC-edge i8042 326where $bridge is the PCI address of the bridge you've enabled (eg
395 2: 0 0 XT-PIC cascade 3270000:00:0e.0).
396 12: 2797 0 IO-APIC-edge i8042 328
397 14: 6543 0 IO-APIC-edge ide0 329To disable MSIs, echo 0 instead of 1. Changing this value should be
398 15: 1 0 IO-APIC-edge ide1 330done with caution as it can break interrupt handling for all devices
399169: 0 0 IO-APIC-level uhci-hcd 331below this bridge.
400185: 0 0 IO-APIC-level uhci-hcd 332
401193: 138 10 PCI-MSI aic79xx 333Again, please notify linux-pci@vger.kernel.org of any bridges that need
402201: 30 0 PCI-MSI aic79xx 334special handling.
403225: 30 0 IO-APIC-level aic7xxx 335
404233: 30 0 IO-APIC-level aic7xxx 3365.3. Disabling MSIs on a single device
405NMI: 0 0 337
406LOC: 324553 325068 338Some devices are known to have faulty MSI implementations. Usually this
407ERR: 0 339is handled in the individual device driver but occasionally it's necessary
408MIS: 0 340to handle this with a quirk. Some drivers have an option to disable use
409 341of MSI. While this is a convenient workaround for the driver author,
4106. MSI quirks 342it is not good practise, and should not be emulated.
411 343
412Several PCI chipsets or devices are known to not support MSI. 3445.4. Finding why MSIs are disabled on a device
413The PCI stack provides 3 possible levels of MSI disabling: 345
414* on a single device 346From the above three sections, you can see that there are many reasons
415* on all devices behind a specific bridge 347why MSIs may not be enabled for a given device. Your first step should
416* globally 348be to examine your dmesg carefully to determine whether MSIs are enabled
417 349for your machine. You should also check your .config to be sure you
4186.1. Disabling MSI on a single device 350have enabled CONFIG_PCI_MSI.
419 351
420Under some circumstances it might be required to disable MSI on a 352Then, 'lspci -t' gives the list of bridges above a device. Reading
421single device. This may be achieved by either not calling pci_enable_msi() 353/sys/bus/pci/devices/*/msi_bus will tell you whether MSI are enabled (1)
422or all, or setting the pci_dev->no_msi flag before (most of the time 354or disabled (0). If 0 is found in any of the msi_bus files belonging
423in a quirk). 355to bridges between the PCI root and the device, MSIs are disabled.
424 356
4256.2. Disabling MSI below a bridge 357It is also worth checking the device driver to see whether it supports MSIs.
426 358For example, it may contain calls to pci_enable_msi(), pci_enable_msix() or
427The vast majority of MSI quirks are required by PCI bridges not 359pci_enable_msi_block().
428being able to route MSI between busses. In this case, MSI have to be
429disabled on all devices behind this bridge. It is achieves by setting
430the PCI_BUS_FLAGS_NO_MSI flag in the pci_bus->bus_flags of the bridge
431subordinate bus. There is no need to set the same flag on bridges that
432are below the broken bridge. When pci_enable_msi() is called to enable
433MSI on a device, pci_msi_supported() takes care of checking the NO_MSI
434flag in all parent busses of the device.
435
436Some bridges actually support dynamic MSI support enabling/disabling
437by changing some bits in their PCI configuration space (especially
438the Hypertransport chipsets such as the nVidia nForce and Serverworks
439HT2000). It may then be required to update the NO_MSI flag on the
440corresponding devices in the sysfs hierarchy. To enable MSI support
441on device "0000:00:0e", do:
442
443 echo 1 > /sys/bus/pci/devices/0000:00:0e/msi_bus
444
445To disable MSI support, echo 0 instead of 1. Note that it should be
446used with caution since changing this value might break interrupts.
447
4486.3. Disabling MSI globally
449
450Some extreme cases may require to disable MSI globally on the system.
451For now, the only known case is a Serverworks PCI-X chipsets (MSI are
452not supported on several busses that are not all connected to the
453chipset in the Linux PCI hierarchy). In the vast majority of other
454cases, disabling only behind a specific bridge is enough.
455
456For debugging purpose, the user may also pass pci=nomsi on the kernel
457command-line to explicitly disable MSI globally. But, once the appro-
458priate quirks are added to the kernel, this option should not be
459required anymore.
460
4616.4. Finding why MSI cannot be enabled on a device
462
463Assuming that MSI are not enabled on a device, you should look at
464dmesg to find messages that quirks may output when disabling MSI
465on some devices, some bridges or even globally.
466Then, lspci -t gives the list of bridges above a device. Reading
467/sys/bus/pci/devices/0000:00:0e/msi_bus will tell you whether MSI
468are enabled (1) or disabled (0). In 0 is found in a single bridge
469msi_bus file above the device, MSI cannot be enabled.
470
4717. FAQ
472
473Q1. Are there any limitations on using the MSI?
474
475A1. If the PCI device supports MSI and conforms to the
476specification and the platform supports the APIC local bus,
477then using MSI should work.
478
479Q2. Will it work on all the Pentium processors (P3, P4, Xeon,
480AMD processors)? In P3 IPI's are transmitted on the APIC local
481bus and in P4 and Xeon they are transmitted on the system
482bus. Are there any implications with this?
483
484A2. MSI support enables a PCI device sending an inbound
485memory write (0xfeexxxxx as target address) on its PCI bus
486directly to the FSB. Since the message address has a
487redirection hint bit cleared, it should work.
488
489Q3. The target address 0xfeexxxxx will be translated by the
490Host Bridge into an interrupt message. Are there any
491limitations on the chipsets such as Intel 8xx, Intel e7xxx,
492or VIA?
493
494A3. If these chipsets support an inbound memory write with
495target address set as 0xfeexxxxx, as conformed to PCI
496specification 2.3 or latest, then it should work.
497
498Q4. From the driver point of view, if the MSI is lost because
499of errors occurring during inbound memory write, then it may
500wait forever. Is there a mechanism for it to recover?
501
502A4. Since the target of the transaction is an inbound memory
503write, all transaction termination conditions (Retry,
504Master-Abort, Target-Abort, or normal completion) are
505supported. A device sending an MSI must abide by all the PCI
506rules and conditions regarding that inbound memory write. So,
507if a retry is signaled it must retry, etc... We believe that
508the recommendation for Abort is also a retry (refer to PCI
509specification 2.3 or latest).
diff --git a/Documentation/PCI/pci-iov-howto.txt b/Documentation/PCI/pci-iov-howto.txt
new file mode 100644
index 000000000000..fc73ef5d65b8
--- /dev/null
+++ b/Documentation/PCI/pci-iov-howto.txt
@@ -0,0 +1,99 @@
1 PCI Express I/O Virtualization Howto
2 Copyright (C) 2009 Intel Corporation
3 Yu Zhao <yu.zhao@intel.com>
4
5
61. Overview
7
81.1 What is SR-IOV
9
10Single Root I/O Virtualization (SR-IOV) is a PCI Express Extended
11capability which makes one physical device appear as multiple virtual
12devices. The physical device is referred to as Physical Function (PF)
13while the virtual devices are referred to as Virtual Functions (VF).
14Allocation of the VF can be dynamically controlled by the PF via
15registers encapsulated in the capability. By default, this feature is
16not enabled and the PF behaves as traditional PCIe device. Once it's
17turned on, each VF's PCI configuration space can be accessed by its own
18Bus, Device and Function Number (Routing ID). And each VF also has PCI
19Memory Space, which is used to map its register set. VF device driver
20operates on the register set so it can be functional and appear as a
21real existing PCI device.
22
232. User Guide
24
252.1 How can I enable SR-IOV capability
26
27The device driver (PF driver) will control the enabling and disabling
28of the capability via API provided by SR-IOV core. If the hardware
29has SR-IOV capability, loading its PF driver would enable it and all
30VFs associated with the PF.
31
322.2 How can I use the Virtual Functions
33
34The VF is treated as hot-plugged PCI devices in the kernel, so they
35should be able to work in the same way as real PCI devices. The VF
36requires device driver that is same as a normal PCI device's.
37
383. Developer Guide
39
403.1 SR-IOV API
41
42To enable SR-IOV capability:
43 int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
44 'nr_virtfn' is number of VFs to be enabled.
45
46To disable SR-IOV capability:
47 void pci_disable_sriov(struct pci_dev *dev);
48
49To notify SR-IOV core of Virtual Function Migration:
50 irqreturn_t pci_sriov_migration(struct pci_dev *dev);
51
523.2 Usage example
53
54Following piece of code illustrates the usage of the SR-IOV API.
55
56static int __devinit dev_probe(struct pci_dev *dev, const struct pci_device_id *id)
57{
58 pci_enable_sriov(dev, NR_VIRTFN);
59
60 ...
61
62 return 0;
63}
64
65static void __devexit dev_remove(struct pci_dev *dev)
66{
67 pci_disable_sriov(dev);
68
69 ...
70}
71
72static int dev_suspend(struct pci_dev *dev, pm_message_t state)
73{
74 ...
75
76 return 0;
77}
78
79static int dev_resume(struct pci_dev *dev)
80{
81 ...
82
83 return 0;
84}
85
86static void dev_shutdown(struct pci_dev *dev)
87{
88 ...
89}
90
91static struct pci_driver dev_driver = {
92 .name = "SR-IOV Physical Function driver",
93 .id_table = dev_id_table,
94 .probe = dev_probe,
95 .remove = __devexit_p(dev_remove),
96 .suspend = dev_suspend,
97 .resume = dev_resume,
98 .shutdown = dev_shutdown,
99};
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 6e253407b3dc..accfe2f5247d 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -298,3 +298,15 @@ over a rather long period of time, but improvements are always welcome!
298 298
299 Note that, rcu_assign_pointer() and rcu_dereference() relate to 299 Note that, rcu_assign_pointer() and rcu_dereference() relate to
300 SRCU just as they do to other forms of RCU. 300 SRCU just as they do to other forms of RCU.
301
30215. The whole point of call_rcu(), synchronize_rcu(), and friends
303 is to wait until all pre-existing readers have finished before
304 carrying out some otherwise-destructive operation. It is
305 therefore critically important to -first- remove any path
306 that readers can follow that could be affected by the
307 destructive operation, and -only- -then- invoke call_rcu(),
308 synchronize_rcu(), or friends.
309
310 Because these primitives only wait for pre-existing readers,
311 it is the caller's responsibility to guarantee safety to
312 any subsequent readers.
diff --git a/Documentation/RCU/listRCU.txt b/Documentation/RCU/listRCU.txt
index 1fd175368a87..4349c1487e91 100644
--- a/Documentation/RCU/listRCU.txt
+++ b/Documentation/RCU/listRCU.txt
@@ -118,7 +118,7 @@ Following are the RCU equivalents for these two functions:
118 list_for_each_entry(e, list, list) { 118 list_for_each_entry(e, list, list) {
119 if (!audit_compare_rule(rule, &e->rule)) { 119 if (!audit_compare_rule(rule, &e->rule)) {
120 list_del_rcu(&e->list); 120 list_del_rcu(&e->list);
121 call_rcu(&e->rcu, audit_free_rule, e); 121 call_rcu(&e->rcu, audit_free_rule);
122 return 0; 122 return 0;
123 } 123 }
124 } 124 }
@@ -206,7 +206,7 @@ RCU ("read-copy update") its name. The RCU code is as follows:
206 ne->rule.action = newaction; 206 ne->rule.action = newaction;
207 ne->rule.file_count = newfield_count; 207 ne->rule.file_count = newfield_count;
208 list_replace_rcu(e, ne); 208 list_replace_rcu(e, ne);
209 call_rcu(&e->rcu, audit_free_rule, e); 209 call_rcu(&e->rcu, audit_free_rule);
210 return 0; 210 return 0;
211 } 211 }
212 } 212 }
@@ -283,7 +283,7 @@ flag under the spinlock as follows:
283 list_del_rcu(&e->list); 283 list_del_rcu(&e->list);
284 e->deleted = 1; 284 e->deleted = 1;
285 spin_unlock(&e->lock); 285 spin_unlock(&e->lock);
286 call_rcu(&e->rcu, audit_free_rule, e); 286 call_rcu(&e->rcu, audit_free_rule);
287 return 0; 287 return 0;
288 } 288 }
289 } 289 }
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt
index 95821a29ae41..7aa2002ade77 100644
--- a/Documentation/RCU/rcu.txt
+++ b/Documentation/RCU/rcu.txt
@@ -81,7 +81,7 @@ o I hear that RCU needs work in order to support realtime kernels?
81 This work is largely completed. Realtime-friendly RCU can be 81 This work is largely completed. Realtime-friendly RCU can be
82 enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter. 82 enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter.
83 However, work is in progress for enabling priority boosting of 83 However, work is in progress for enabling priority boosting of
84 preempted RCU read-side critical sections.This is needed if you 84 preempted RCU read-side critical sections. This is needed if you
85 have CPU-bound realtime threads. 85 have CPU-bound realtime threads.
86 86
87o Where can I find more information on RCU? 87o Where can I find more information on RCU?
diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt
index 239f542d48ba..6389dec33459 100644
--- a/Documentation/RCU/rculist_nulls.txt
+++ b/Documentation/RCU/rculist_nulls.txt
@@ -21,7 +21,7 @@ if (obj) {
21 /* 21 /*
22 * Because a writer could delete object, and a writer could 22 * Because a writer could delete object, and a writer could
23 * reuse these object before the RCU grace period, we 23 * reuse these object before the RCU grace period, we
24 * must check key after geting the reference on object 24 * must check key after getting the reference on object
25 */ 25 */
26 if (obj->key != key) { // not the object we expected 26 if (obj->key != key) { // not the object we expected
27 put_ref(obj); 27 put_ref(obj);
@@ -117,7 +117,7 @@ a race (some writer did a delete and/or a move of an object
117to another chain) checking the final 'nulls' value if 117to another chain) checking the final 'nulls' value if
118the lookup met the end of chain. If final 'nulls' value 118the lookup met the end of chain. If final 'nulls' value
119is not the slot number, then we must restart the lookup at 119is not the slot number, then we must restart the lookup at
120the begining. If the object was moved to same chain, 120the beginning. If the object was moved to the same chain,
121then the reader doesnt care : It might eventually 121then the reader doesnt care : It might eventually
122scan the list again without harm. 122scan the list again without harm.
123 123
diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt
index 989c2fcd8111..629c92e99783 100644
--- a/Documentation/Smack.txt
+++ b/Documentation/Smack.txt
@@ -184,14 +184,16 @@ length. Single character labels using special characters, that being anything
184other than a letter or digit, are reserved for use by the Smack development 184other than a letter or digit, are reserved for use by the Smack development
185team. Smack labels are unstructured, case sensitive, and the only operation 185team. Smack labels are unstructured, case sensitive, and the only operation
186ever performed on them is comparison for equality. Smack labels cannot 186ever performed on them is comparison for equality. Smack labels cannot
187contain unprintable characters or the "/" (slash) character. 187contain unprintable characters or the "/" (slash) character. Smack labels
188cannot begin with a '-', which is reserved for special options.
188 189
189There are some predefined labels: 190There are some predefined labels:
190 191
191 _ Pronounced "floor", a single underscore character. 192 _ Pronounced "floor", a single underscore character.
192 ^ Pronounced "hat", a single circumflex character. 193 ^ Pronounced "hat", a single circumflex character.
193 * Pronounced "star", a single asterisk character. 194 * Pronounced "star", a single asterisk character.
194 ? Pronounced "huh", a single question mark character. 195 ? Pronounced "huh", a single question mark character.
196 @ Pronounced "Internet", a single at sign character.
195 197
196Every task on a Smack system is assigned a label. System tasks, such as 198Every task on a Smack system is assigned a label. System tasks, such as
197init(8) and systems daemons, are run with the floor ("_") label. User tasks 199init(8) and systems daemons, are run with the floor ("_") label. User tasks
@@ -412,6 +414,36 @@ sockets.
412 A privileged program may set this to match the label of another 414 A privileged program may set this to match the label of another
413 task with which it hopes to communicate. 415 task with which it hopes to communicate.
414 416
417Smack Netlabel Exceptions
418
419You will often find that your labeled application has to talk to the outside,
420unlabeled world. To do this there's a special file /smack/netlabel where you can
421add some exceptions in the form of :
422@IP1 LABEL1 or
423@IP2/MASK LABEL2
424
425It means that your application will have unlabeled access to @IP1 if it has
426write access on LABEL1, and access to the subnet @IP2/MASK if it has write
427access on LABEL2.
428
429Entries in the /smack/netlabel file are matched by longest mask first, like in
430classless IPv4 routing.
431
432A special label '@' and an option '-CIPSO' can be used there :
433@ means Internet, any application with any label has access to it
434-CIPSO means standard CIPSO networking
435
436If you don't know what CIPSO is and don't plan to use it, you can just do :
437echo 127.0.0.1 -CIPSO > /smack/netlabel
438echo 0.0.0.0/0 @ > /smack/netlabel
439
440If you use CIPSO on your 192.168.0.0/16 local network and need also unlabeled
441Internet access, you can have :
442echo 127.0.0.1 -CIPSO > /smack/netlabel
443echo 192.168.0.0/16 -CIPSO > /smack/netlabel
444echo 0.0.0.0/0 @ > /smack/netlabel
445
446
415Writing Applications for Smack 447Writing Applications for Smack
416 448
417There are three sorts of applications that will run on a Smack system. How an 449There are three sorts of applications that will run on a Smack system. How an
diff --git a/Documentation/arm/Samsung-S3C24XX/Suspend.txt b/Documentation/arm/Samsung-S3C24XX/Suspend.txt
index 0dab6e32c130..a30fe510572b 100644
--- a/Documentation/arm/Samsung-S3C24XX/Suspend.txt
+++ b/Documentation/arm/Samsung-S3C24XX/Suspend.txt
@@ -40,13 +40,13 @@ Resuming
40Machine Support 40Machine Support
41--------------- 41---------------
42 42
43 The machine specific functions must call the s3c2410_pm_init() function 43 The machine specific functions must call the s3c_pm_init() function
44 to say that its bootloader is capable of resuming. This can be as 44 to say that its bootloader is capable of resuming. This can be as
45 simple as adding the following to the machine's definition: 45 simple as adding the following to the machine's definition:
46 46
47 INITMACHINE(s3c2410_pm_init) 47 INITMACHINE(s3c_pm_init)
48 48
49 A board can do its own setup before calling s3c2410_pm_init, if it 49 A board can do its own setup before calling s3c_pm_init, if it
50 needs to setup anything else for power management support. 50 needs to setup anything else for power management support.
51 51
52 There is currently no support for over-riding the default method of 52 There is currently no support for over-riding the default method of
@@ -74,7 +74,7 @@ statuc void __init machine_init(void)
74 74
75 enable_irq_wake(IRQ_EINT0); 75 enable_irq_wake(IRQ_EINT0);
76 76
77 s3c2410_pm_init(); 77 s3c_pm_init();
78} 78}
79 79
80 80
diff --git a/Documentation/arm/memory.txt b/Documentation/arm/memory.txt
index dc6045577a8b..43cb1004d35f 100644
--- a/Documentation/arm/memory.txt
+++ b/Documentation/arm/memory.txt
@@ -29,7 +29,14 @@ ffff0000 ffff0fff CPU vector page.
29 CPU supports vector relocation (control 29 CPU supports vector relocation (control
30 register V bit.) 30 register V bit.)
31 31
32ffc00000 fffeffff DMA memory mapping region. Memory returned 32fffe0000 fffeffff XScale cache flush area. This is used
33 in proc-xscale.S to flush the whole data
34 cache. Free for other usage on non-XScale.
35
36fff00000 fffdffff Fixmap mapping region. Addresses provided
37 by fix_to_virt() will be located here.
38
39ffc00000 ffefffff DMA memory mapping region. Memory returned
33 by the dma_alloc_xxx functions will be 40 by the dma_alloc_xxx functions will be
34 dynamically mapped here. 41 dynamically mapped here.
35 42
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index ecad6ee75705..6fab97ea7e6b 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -1040,23 +1040,21 @@ Front merges are handled by the binary trees in AS and deadline schedulers.
1040iii. Plugging the queue to batch requests in anticipation of opportunities for 1040iii. Plugging the queue to batch requests in anticipation of opportunities for
1041 merge/sort optimizations 1041 merge/sort optimizations
1042 1042
1043This is just the same as in 2.4 so far, though per-device unplugging
1044support is anticipated for 2.5. Also with a priority-based i/o scheduler,
1045such decisions could be based on request priorities.
1046
1047Plugging is an approach that the current i/o scheduling algorithm resorts to so 1043Plugging is an approach that the current i/o scheduling algorithm resorts to so
1048that it collects up enough requests in the queue to be able to take 1044that it collects up enough requests in the queue to be able to take
1049advantage of the sorting/merging logic in the elevator. If the 1045advantage of the sorting/merging logic in the elevator. If the
1050queue is empty when a request comes in, then it plugs the request queue 1046queue is empty when a request comes in, then it plugs the request queue
1051(sort of like plugging the bottom of a vessel to get fluid to build up) 1047(sort of like plugging the bath tub of a vessel to get fluid to build up)
1052till it fills up with a few more requests, before starting to service 1048till it fills up with a few more requests, before starting to service
1053the requests. This provides an opportunity to merge/sort the requests before 1049the requests. This provides an opportunity to merge/sort the requests before
1054passing them down to the device. There are various conditions when the queue is 1050passing them down to the device. There are various conditions when the queue is
1055unplugged (to open up the flow again), either through a scheduled task or 1051unplugged (to open up the flow again), either through a scheduled task or
1056could be on demand. For example wait_on_buffer sets the unplugging going 1052could be on demand. For example wait_on_buffer sets the unplugging going
1057(by running tq_disk) so the read gets satisfied soon. So in the read case, 1053through sync_buffer() running blk_run_address_space(mapping). Or the caller
1058the queue gets explicitly unplugged as part of waiting for completion, 1054can do it explicity through blk_unplug(bdev). So in the read case,
1059in fact all queues get unplugged as a side-effect. 1055the queue gets explicitly unplugged as part of waiting for completion on that
1056buffer. For page driven IO, the address space ->sync_page() takes care of
1057doing the blk_run_address_space().
1060 1058
1061Aside: 1059Aside:
1062 This is kind of controversial territory, as it's not clear if plugging is 1060 This is kind of controversial territory, as it's not clear if plugging is
@@ -1067,11 +1065,6 @@ Aside:
1067 multi-page bios being queued in one shot, we may not need to wait to merge 1065 multi-page bios being queued in one shot, we may not need to wait to merge
1068 a big request from the broken up pieces coming by. 1066 a big request from the broken up pieces coming by.
1069 1067
1070 Per-queue granularity unplugging (still a Todo) may help reduce some of the
1071 concerns with just a single tq_disk flush approach. Something like
1072 blk_kick_queue() to unplug a specific queue (right away ?)
1073 or optionally, all queues, is in the plan.
1074
10754.4 I/O contexts 10684.4 I/O contexts
1076I/O contexts provide a dynamically allocated per process data area. They may 1069I/O contexts provide a dynamically allocated per process data area. They may
1077be used in I/O schedulers, and in the block layer (could be used for IO statis, 1070be used in I/O schedulers, and in the block layer (could be used for IO statis,
diff --git a/Documentation/block/switching-sched.txt b/Documentation/block/switching-sched.txt
index 634c952e1964..d5af3f630814 100644
--- a/Documentation/block/switching-sched.txt
+++ b/Documentation/block/switching-sched.txt
@@ -35,9 +35,3 @@ noop anticipatory deadline [cfq]
35# echo anticipatory > /sys/block/hda/queue/scheduler 35# echo anticipatory > /sys/block/hda/queue/scheduler
36# cat /sys/block/hda/queue/scheduler 36# cat /sys/block/hda/queue/scheduler
37noop [anticipatory] deadline cfq 37noop [anticipatory] deadline cfq
38
39Each io queue has a set of io scheduler tunables associated with it. These
40tunables control how the io scheduler works. You can find these entries
41in:
42
43/sys/block/<device>/queue/iosched
diff --git a/Documentation/blockdev/00-INDEX b/Documentation/blockdev/00-INDEX
index 86f054c47013..c08df56dd91b 100644
--- a/Documentation/blockdev/00-INDEX
+++ b/Documentation/blockdev/00-INDEX
@@ -8,6 +8,8 @@ cpqarray.txt
8 - info on using Compaq's SMART2 Intelligent Disk Array Controllers. 8 - info on using Compaq's SMART2 Intelligent Disk Array Controllers.
9floppy.txt 9floppy.txt
10 - notes and driver options for the floppy disk driver. 10 - notes and driver options for the floppy disk driver.
11mflash.txt
12 - info on mGine m(g)flash driver for linux.
11nbd.txt 13nbd.txt
12 - info on a TCP implementation of a network block device. 14 - info on a TCP implementation of a network block device.
13paride.txt 15paride.txt
diff --git a/Documentation/blockdev/mflash.txt b/Documentation/blockdev/mflash.txt
new file mode 100644
index 000000000000..1f610ecf698a
--- /dev/null
+++ b/Documentation/blockdev/mflash.txt
@@ -0,0 +1,84 @@
1This document describes m[g]flash support in linux.
2
3Contents
4 1. Overview
5 2. Reserved area configuration
6 3. Example of mflash platform driver registration
7
81. Overview
9
10Mflash and gflash are embedded flash drive. The only difference is mflash is
11MCP(Multi Chip Package) device. These two device operate exactly same way.
12So the rest mflash repersents mflash and gflash altogether.
13
14Internally, mflash has nand flash and other hardware logics and supports
152 different operation (ATA, IO) modes. ATA mode doesn't need any new
16driver and currently works well under standard IDE subsystem. Actually it's
17one chip SSD. IO mode is ATA-like custom mode for the host that doesn't have
18IDE interface.
19
20Followings are brief descriptions about IO mode.
21A. IO mode based on ATA protocol and uses some custom command. (read confirm,
22write confirm)
23B. IO mode uses SRAM bus interface.
24C. IO mode supports 4kB boot area, so host can boot from mflash.
25
262. Reserved area configuration
27If host boot from mflash, usually needs raw area for boot loader image. All of
28the mflash's block device operation will be taken this value as start offset.
29Note that boot loader's size of reserved area and kernel configuration value
30must be same.
31
323. Example of mflash platform driver registration
33Working mflash is very straight forward. Adding platform device stuff to board
34configuration file is all. Here is some pseudo example.
35
36static struct mg_drv_data mflash_drv_data = {
37 /* If you want to polling driver set to 1 */
38 .use_polling = 0,
39 /* device attribution */
40 .dev_attr = MG_BOOT_DEV
41};
42
43static struct resource mg_mflash_rsc[] = {
44 /* Base address of mflash */
45 [0] = {
46 .start = 0x08000000,
47 .end = 0x08000000 + SZ_64K - 1,
48 .flags = IORESOURCE_MEM
49 },
50 /* mflash interrupt pin */
51 [1] = {
52 .start = IRQ_GPIO(84),
53 .end = IRQ_GPIO(84),
54 .flags = IORESOURCE_IRQ
55 },
56 /* mflash reset pin */
57 [2] = {
58 .start = 43,
59 .end = 43,
60 .name = MG_RST_PIN,
61 .flags = IORESOURCE_IO
62 },
63 /* mflash reset-out pin
64 * If you use mflash as storage device (i.e. other than MG_BOOT_DEV),
65 * should assign this */
66 [3] = {
67 .start = 51,
68 .end = 51,
69 .name = MG_RSTOUT_PIN,
70 .flags = IORESOURCE_IO
71 }
72};
73
74static struct platform_device mflash_dev = {
75 .name = MG_DEV_NAME,
76 .id = -1,
77 .dev = {
78 .platform_data = &mflash_drv_data,
79 },
80 .num_resources = ARRAY_SIZE(mg_mflash_rsc),
81 .resource = mg_mflash_rsc
82};
83
84platform_device_register(&mflash_dev);
diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX
new file mode 100644
index 000000000000..3f58fa3d6d00
--- /dev/null
+++ b/Documentation/cgroups/00-INDEX
@@ -0,0 +1,18 @@
100-INDEX
2 - this file
3cgroups.txt
4 - Control Groups definition, implementation details, examples and API.
5cpuacct.txt
6 - CPU Accounting Controller; account CPU usage for groups of tasks.
7cpusets.txt
8 - documents the cpusets feature; assign CPUs and Mem to a set of tasks.
9devices.txt
10 - Device Whitelist Controller; description, interface and security.
11freezer-subsystem.txt
12 - checkpointing; rationale to not use signals, interface.
13memcg_test.txt
14 - Memory Resource Controller; implementation details.
15memory.txt
16 - Memory Resource Controller; design, accounting, interface, testing.
17resource_counter.txt
18 - Resource Counter API.
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 93feb8444489..6eb1a97e88ce 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -56,7 +56,7 @@ hierarchy, and a set of subsystems; each subsystem has system-specific
56state attached to each cgroup in the hierarchy. Each hierarchy has 56state attached to each cgroup in the hierarchy. Each hierarchy has
57an instance of the cgroup virtual filesystem associated with it. 57an instance of the cgroup virtual filesystem associated with it.
58 58
59At any one time there may be multiple active hierachies of task 59At any one time there may be multiple active hierarchies of task
60cgroups. Each hierarchy is a partition of all tasks in the system. 60cgroups. Each hierarchy is a partition of all tasks in the system.
61 61
62User level code may create and destroy cgroups by name in an 62User level code may create and destroy cgroups by name in an
@@ -124,10 +124,10 @@ following lines:
124 / \ 124 / \
125 Prof (15%) students (5%) 125 Prof (15%) students (5%)
126 126
127Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go 127Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd go
128into NFS network class. 128into NFS network class.
129 129
130At the same time firefox/lynx will share an appropriate CPU/Memory class 130At the same time Firefox/Lynx will share an appropriate CPU/Memory class
131depending on who launched it (prof/student). 131depending on who launched it (prof/student).
132 132
133With the ability to classify tasks differently for different resources 133With the ability to classify tasks differently for different resources
@@ -325,7 +325,7 @@ and then start a subshell 'sh' in that cgroup:
325Creating, modifying, using the cgroups can be done through the cgroup 325Creating, modifying, using the cgroups can be done through the cgroup
326virtual filesystem. 326virtual filesystem.
327 327
328To mount a cgroup hierarchy will all available subsystems, type: 328To mount a cgroup hierarchy with all available subsystems, type:
329# mount -t cgroup xxx /dev/cgroup 329# mount -t cgroup xxx /dev/cgroup
330 330
331The "xxx" is not interpreted by the cgroup code, but will appear in 331The "xxx" is not interpreted by the cgroup code, but will appear in
@@ -333,12 +333,23 @@ The "xxx" is not interpreted by the cgroup code, but will appear in
333 333
334To mount a cgroup hierarchy with just the cpuset and numtasks 334To mount a cgroup hierarchy with just the cpuset and numtasks
335subsystems, type: 335subsystems, type:
336# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup 336# mount -t cgroup -o cpuset,memory hier1 /dev/cgroup
337 337
338To change the set of subsystems bound to a mounted hierarchy, just 338To change the set of subsystems bound to a mounted hierarchy, just
339remount with different options: 339remount with different options:
340# mount -o remount,cpuset,ns hier1 /dev/cgroup
340 341
341# mount -o remount,cpuset,ns /dev/cgroup 342Now memory is removed from the hierarchy and ns is added.
343
344Note this will add ns to the hierarchy but won't remove memory or
345cpuset, because the new options are appended to the old ones:
346# mount -o remount,ns /dev/cgroup
347
348To Specify a hierarchy's release_agent:
349# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
350 xxx /dev/cgroup
351
352Note that specifying 'release_agent' more than once will return failure.
342 353
343Note that changing the set of subsystems is currently only supported 354Note that changing the set of subsystems is currently only supported
344when the hierarchy consists of a single (root) cgroup. Supporting 355when the hierarchy consists of a single (root) cgroup. Supporting
@@ -349,6 +360,11 @@ Then under /dev/cgroup you can find a tree that corresponds to the
349tree of the cgroups in the system. For instance, /dev/cgroup 360tree of the cgroups in the system. For instance, /dev/cgroup
350is the cgroup that holds the whole system. 361is the cgroup that holds the whole system.
351 362
363If you want to change the value of release_agent:
364# echo "/sbin/new_release_agent" > /dev/cgroup/release_agent
365
366It can also be changed via remount.
367
352If you want to create a new cgroup under /dev/cgroup: 368If you want to create a new cgroup under /dev/cgroup:
353# cd /dev/cgroup 369# cd /dev/cgroup
354# mkdir my_cgroup 370# mkdir my_cgroup
@@ -476,11 +492,13 @@ cgroup->parent is still valid. (Note - can also be called for a
476newly-created cgroup if an error occurs after this subsystem's 492newly-created cgroup if an error occurs after this subsystem's
477create() method has been called for the new cgroup). 493create() method has been called for the new cgroup).
478 494
479void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); 495int pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
480 496
481Called before checking the reference count on each subsystem. This may 497Called before checking the reference count on each subsystem. This may
482be useful for subsystems which have some extra references even if 498be useful for subsystems which have some extra references even if
483there are not tasks in the cgroup. 499there are not tasks in the cgroup. If pre_destroy() returns error code,
500rmdir() will fail with it. From this behavior, pre_destroy() can be
501called multiple times against a cgroup.
484 502
485int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 503int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
486 struct task_struct *task) 504 struct task_struct *task)
@@ -521,7 +539,7 @@ always handled well.
521void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp) 539void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp)
522(cgroup_mutex held by caller) 540(cgroup_mutex held by caller)
523 541
524Called at the end of cgroup_clone() to do any paramater 542Called at the end of cgroup_clone() to do any parameter
525initialization which might be required before a task could attach. For 543initialization which might be required before a task could attach. For
526example in cpusets, no task may attach before 'cpus' and 'mems' are set 544example in cpusets, no task may attach before 'cpus' and 'mems' are set
527up. 545up.
diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt
index bb775fbe43d7..8b930946c52a 100644
--- a/Documentation/cgroups/cpuacct.txt
+++ b/Documentation/cgroups/cpuacct.txt
@@ -30,3 +30,21 @@ The above steps create a new group g1 and move the current shell
30process (bash) into it. CPU time consumed by this bash and its children 30process (bash) into it. CPU time consumed by this bash and its children
31can be obtained from g1/cpuacct.usage and the same is accumulated in 31can be obtained from g1/cpuacct.usage and the same is accumulated in
32/cgroups/cpuacct.usage also. 32/cgroups/cpuacct.usage also.
33
34cpuacct.stat file lists a few statistics which further divide the
35CPU time obtained by the cgroup into user and system times. Currently
36the following statistics are supported:
37
38user: Time spent by tasks of the cgroup in user mode.
39system: Time spent by tasks of the cgroup in kernel mode.
40
41user and system are in USER_HZ unit.
42
43cpuacct controller uses percpu_counter interface to collect user and
44system times. This has two side effects:
45
46- It is theoretically possible to see wrong values for user and system times.
47 This is because percpu_counter_read() on 32bit systems isn't safe
48 against concurrent writes.
49- It is possible to see slightly outdated values for user and system times
50 due to the batch processing nature of percpu_counter.
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt
index 0611e9528c7c..f9ca389dddf4 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -131,7 +131,7 @@ Cpusets extends these two mechanisms as follows:
131 - The hierarchy of cpusets can be mounted at /dev/cpuset, for 131 - The hierarchy of cpusets can be mounted at /dev/cpuset, for
132 browsing and manipulation from user space. 132 browsing and manipulation from user space.
133 - A cpuset may be marked exclusive, which ensures that no other 133 - A cpuset may be marked exclusive, which ensures that no other
134 cpuset (except direct ancestors and descendents) may contain 134 cpuset (except direct ancestors and descendants) may contain
135 any overlapping CPUs or Memory Nodes. 135 any overlapping CPUs or Memory Nodes.
136 - You can list all the tasks (by pid) attached to any cpuset. 136 - You can list all the tasks (by pid) attached to any cpuset.
137 137
@@ -226,7 +226,7 @@ nodes with memory--using the cpuset_track_online_nodes() hook.
226-------------------------------- 226--------------------------------
227 227
228If a cpuset is cpu or mem exclusive, no other cpuset, other than 228If a cpuset is cpu or mem exclusive, no other cpuset, other than
229a direct ancestor or descendent, may share any of the same CPUs or 229a direct ancestor or descendant, may share any of the same CPUs or
230Memory Nodes. 230Memory Nodes.
231 231
232A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", 232A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled",
@@ -427,7 +427,7 @@ child cpusets have this flag enabled.
427When doing this, you don't usually want to leave any unpinned tasks in 427When doing this, you don't usually want to leave any unpinned tasks in
428the top cpuset that might use non-trivial amounts of CPU, as such tasks 428the top cpuset that might use non-trivial amounts of CPU, as such tasks
429may be artificially constrained to some subset of CPUs, depending on 429may be artificially constrained to some subset of CPUs, depending on
430the particulars of this flag setting in descendent cpusets. Even if 430the particulars of this flag setting in descendant cpusets. Even if
431such a task could use spare CPU cycles in some other CPUs, the kernel 431such a task could use spare CPU cycles in some other CPUs, the kernel
432scheduler might not consider the possibility of load balancing that 432scheduler might not consider the possibility of load balancing that
433task to that underused CPU. 433task to that underused CPU.
@@ -531,9 +531,9 @@ be idle.
531 531
532Of course it takes some searching cost to find movable tasks and/or 532Of course it takes some searching cost to find movable tasks and/or
533idle CPUs, the scheduler might not search all CPUs in the domain 533idle CPUs, the scheduler might not search all CPUs in the domain
534everytime. In fact, in some architectures, the searching ranges on 534every time. In fact, in some architectures, the searching ranges on
535events are limited in the same socket or node where the CPU locates, 535events are limited in the same socket or node where the CPU locates,
536while the load balance on tick searchs all. 536while the load balance on tick searches all.
537 537
538For example, assume CPU Z is relatively far from CPU X. Even if CPU Z 538For example, assume CPU Z is relatively far from CPU X. Even if CPU Z
539is idle while CPU X and the siblings are busy, scheduler can't migrate 539is idle while CPU X and the siblings are busy, scheduler can't migrate
@@ -601,7 +601,7 @@ its new cpuset, then the task will continue to use whatever subset
601of MPOL_BIND nodes are still allowed in the new cpuset. If the task 601of MPOL_BIND nodes are still allowed in the new cpuset. If the task
602was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed 602was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
603in the new cpuset, then the task will be essentially treated as if it 603in the new cpuset, then the task will be essentially treated as if it
604was MPOL_BIND bound to the new cpuset (even though its numa placement, 604was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
605as queried by get_mempolicy(), doesn't change). If a task is moved 605as queried by get_mempolicy(), doesn't change). If a task is moved
606from one cpuset to another, then the kernel will adjust the tasks 606from one cpuset to another, then the kernel will adjust the tasks
607memory placement, as above, the next time that the kernel attempts 607memory placement, as above, the next time that the kernel attempts
diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroups/devices.txt
index 7cc6e6a60672..57ca4c89fe5c 100644
--- a/Documentation/cgroups/devices.txt
+++ b/Documentation/cgroups/devices.txt
@@ -42,7 +42,7 @@ suffice, but we can decide the best way to adequately restrict
42movement as people get some experience with this. We may just want 42movement as people get some experience with this. We may just want
43to require CAP_SYS_ADMIN, which at least is a separate bit from 43to require CAP_SYS_ADMIN, which at least is a separate bit from
44CAP_MKNOD. We may want to just refuse moving to a cgroup which 44CAP_MKNOD. We may want to just refuse moving to a cgroup which
45isn't a descendent of the current one. Or we may want to use 45isn't a descendant of the current one. Or we may want to use
46CAP_MAC_ADMIN, since we really are trying to lock down root. 46CAP_MAC_ADMIN, since we really are trying to lock down root.
47 47
48CAP_SYS_ADMIN is needed to modify the whitelist or move another 48CAP_SYS_ADMIN is needed to modify the whitelist or move another
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt
index 523a9c16c400..72db89ed0609 100644
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -1,5 +1,5 @@
1Memory Resource Controller(Memcg) Implementation Memo. 1Memory Resource Controller(Memcg) Implementation Memo.
2Last Updated: 2009/1/19 2Last Updated: 2009/1/20
3Base Kernel Version: based on 2.6.29-rc2. 3Base Kernel Version: based on 2.6.29-rc2.
4 4
5Because VM is getting complex (one of reasons is memcg...), memcg's behavior 5Because VM is getting complex (one of reasons is memcg...), memcg's behavior
@@ -356,7 +356,25 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
356 (Shell-B) 356 (Shell-B)
357 # move all tasks in /cgroup/test to /cgroup 357 # move all tasks in /cgroup/test to /cgroup
358 # /sbin/swapoff -a 358 # /sbin/swapoff -a
359 # rmdir /test/cgroup 359 # rmdir /cgroup/test
360 # kill malloc task. 360 # kill malloc task.
361 361
362 Of course, tmpfs v.s. swapoff test should be tested, too. 362 Of course, tmpfs v.s. swapoff test should be tested, too.
363
364 9.8 OOM-Killer
365 Out-of-memory caused by memcg's limit will kill tasks under
366 the memcg. When hierarchy is used, a task under hierarchy
367 will be killed by the kernel.
368 In this case, panic_on_oom shouldn't be invoked and tasks
369 in other groups shouldn't be killed.
370
371 It's not difficult to cause OOM under memcg as following.
372 Case A) when you can swapoff
373 #swapoff -a
374 #echo 50M > /memory.limit_in_bytes
375 run 51M of malloc
376
377 Case B) when you use mem+swap limitation.
378 #echo 50M > memory.limit_in_bytes
379 #echo 50M > memory.memsw.limit_in_bytes
380 run 51M of malloc
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index e1501964df1e..1a608877b14e 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -6,15 +6,14 @@ used here with the memory controller that is used in hardware.
6 6
7Salient features 7Salient features
8 8
9a. Enable control of both RSS (mapped) and Page Cache (unmapped) pages 9a. Enable control of Anonymous, Page Cache (mapped and unmapped) and
10 Swap Cache memory pages.
10b. The infrastructure allows easy addition of other types of memory to control 11b. The infrastructure allows easy addition of other types of memory to control
11c. Provides *zero overhead* for non memory controller users 12c. Provides *zero overhead* for non memory controller users
12d. Provides a double LRU: global memory pressure causes reclaim from the 13d. Provides a double LRU: global memory pressure causes reclaim from the
13 global LRU; a cgroup on hitting a limit, reclaims from the per 14 global LRU; a cgroup on hitting a limit, reclaims from the per
14 cgroup LRU 15 cgroup LRU
15 16
16NOTE: Swap Cache (unmapped) is not accounted now.
17
18Benefits and Purpose of the memory controller 17Benefits and Purpose of the memory controller
19 18
20The memory controller isolates the memory behaviour of a group of tasks 19The memory controller isolates the memory behaviour of a group of tasks
@@ -290,34 +289,44 @@ will be charged as a new owner of it.
290 moved to the parent. If you want to avoid that, force_empty will be useful. 289 moved to the parent. If you want to avoid that, force_empty will be useful.
291 290
2925.2 stat file 2915.2 stat file
293 memory.stat file includes following statistics (now) 292
294 cache - # of pages from page-cache and shmem. 293memory.stat file includes following statistics
295 rss - # of pages from anonymous memory. 294
296 pgpgin - # of event of charging 295cache - # of bytes of page cache memory.
297 pgpgout - # of event of uncharging 296rss - # of bytes of anonymous and swap cache memory.
298 active_anon - # of pages on active lru of anon, shmem. 297pgpgin - # of pages paged in (equivalent to # of charging events).
299 inactive_anon - # of pages on active lru of anon, shmem 298pgpgout - # of pages paged out (equivalent to # of uncharging events).
300 active_file - # of pages on active lru of file-cache 299active_anon - # of bytes of anonymous and swap cache memory on active
301 inactive_file - # of pages on inactive lru of file cache 300 lru list.
302 unevictable - # of pages cannot be reclaimed.(mlocked etc) 301inactive_anon - # of bytes of anonymous memory and swap cache memory on
303 302 inactive lru list.
304 Below is depend on CONFIG_DEBUG_VM. 303active_file - # of bytes of file-backed memory on active lru list.
305 inactive_ratio - VM inernal parameter. (see mm/page_alloc.c) 304inactive_file - # of bytes of file-backed memory on inactive lru list.
306 recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) 305unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc).
307 recent_rotated_file - VM internal parameter. (see mm/vmscan.c) 306
308 recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) 307The following additional stats are dependent on CONFIG_DEBUG_VM.
309 recent_scanned_file - VM internal parameter. (see mm/vmscan.c) 308
310 309inactive_ratio - VM internal parameter. (see mm/page_alloc.c)
311 Memo: 310recent_rotated_anon - VM internal parameter. (see mm/vmscan.c)
311recent_rotated_file - VM internal parameter. (see mm/vmscan.c)
312recent_scanned_anon - VM internal parameter. (see mm/vmscan.c)
313recent_scanned_file - VM internal parameter. (see mm/vmscan.c)
314
315Memo:
312 recent_rotated means recent frequency of lru rotation. 316 recent_rotated means recent frequency of lru rotation.
313 recent_scanned means recent # of scans to lru. 317 recent_scanned means recent # of scans to lru.
314 showing for better debug please see the code for meanings. 318 showing for better debug please see the code for meanings.
315 319
320Note:
321 Only anonymous and swap cache memory is listed as part of 'rss' stat.
322 This should not be confused with the true 'resident set size' or the
323 amount of physical memory used by the cgroup. Per-cgroup rss
324 accounting is not done yet.
316 325
3175.3 swappiness 3265.3 swappiness
318 Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. 327 Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
319 328
320 Following cgroup's swapiness can't be changed. 329 Following cgroups' swapiness can't be changed.
321 - root cgroup (uses /proc/sys/vm/swappiness). 330 - root cgroup (uses /proc/sys/vm/swappiness).
322 - a cgroup which uses hierarchy and it has child cgroup. 331 - a cgroup which uses hierarchy and it has child cgroup.
323 - a cgroup which uses hierarchy and not the root of hierarchy. 332 - a cgroup which uses hierarchy and not the root of hierarchy.
diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt
index f196ac1d7d25..95b24d766eab 100644
--- a/Documentation/cgroups/resource_counter.txt
+++ b/Documentation/cgroups/resource_counter.txt
@@ -47,13 +47,18 @@ to work with it.
47 47
482. Basic accounting routines 482. Basic accounting routines
49 49
50 a. void res_counter_init(struct res_counter *rc) 50 a. void res_counter_init(struct res_counter *rc,
51 struct res_counter *rc_parent)
51 52
52 Initializes the resource counter. As usual, should be the first 53 Initializes the resource counter. As usual, should be the first
53 routine called for a new counter. 54 routine called for a new counter.
54 55
55 b. int res_counter_charge[_locked] 56 The struct res_counter *parent can be used to define a hierarchical
56 (struct res_counter *rc, unsigned long val) 57 child -> parent relationship directly in the res_counter structure,
58 NULL can be used to define no relationship.
59
60 c. int res_counter_charge(struct res_counter *rc, unsigned long val,
61 struct res_counter **limit_fail_at)
57 62
58 When a resource is about to be allocated it has to be accounted 63 When a resource is about to be allocated it has to be accounted
59 with the appropriate resource counter (controller should determine 64 with the appropriate resource counter (controller should determine
@@ -67,15 +72,25 @@ to work with it.
67 * if the charging is performed first, then it should be uncharged 72 * if the charging is performed first, then it should be uncharged
68 on error path (if the one is called). 73 on error path (if the one is called).
69 74
70 c. void res_counter_uncharge[_locked] 75 If the charging fails and a hierarchical dependency exists, the
76 limit_fail_at parameter is set to the particular res_counter element
77 where the charging failed.
78
79 d. int res_counter_charge_locked
80 (struct res_counter *rc, unsigned long val)
81
82 The same as res_counter_charge(), but it must not acquire/release the
83 res_counter->lock internally (it must be called with res_counter->lock
84 held).
85
86 e. void res_counter_uncharge[_locked]
71 (struct res_counter *rc, unsigned long val) 87 (struct res_counter *rc, unsigned long val)
72 88
73 When a resource is released (freed) it should be de-accounted 89 When a resource is released (freed) it should be de-accounted
74 from the resource counter it was accounted to. This is called 90 from the resource counter it was accounted to. This is called
75 "uncharging". 91 "uncharging".
76 92
77 The _locked routines imply that the res_counter->lock is taken. 93 The _locked routines imply that the res_counter->lock is taken.
78
79 94
80 2.1 Other accounting routines 95 2.1 Other accounting routines
81 96
diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt
index 5b0cfa67aff9..ce73f3eb5ddb 100644
--- a/Documentation/cpu-freq/governors.txt
+++ b/Documentation/cpu-freq/governors.txt
@@ -117,10 +117,28 @@ accessible parameters:
117sampling_rate: measured in uS (10^-6 seconds), this is how often you 117sampling_rate: measured in uS (10^-6 seconds), this is how often you
118want the kernel to look at the CPU usage and to make decisions on 118want the kernel to look at the CPU usage and to make decisions on
119what to do about the frequency. Typically this is set to values of 119what to do about the frequency. Typically this is set to values of
120around '10000' or more. 120around '10000' or more. It's default value is (cmp. with users-guide.txt):
121 121transition_latency * 1000
122show_sampling_rate_(min|max): the minimum and maximum sampling rates 122The lowest value you can set is:
123available that you may set 'sampling_rate' to. 123transition_latency * 100 or it may get restricted to a value where it
124makes not sense for the kernel anymore to poll that often which depends
125on your HZ config variable (HZ=1000: max=20000us, HZ=250: max=5000).
126Be aware that transition latency is in ns and sampling_rate is in us, so you
127get the same sysfs value by default.
128Sampling rate should always get adjusted considering the transition latency
129To set the sampling rate 750 times as high as the transition latency
130in the bash (as said, 1000 is default), do:
131echo `$(($(cat cpuinfo_transition_latency) * 750 / 1000)) \
132 >ondemand/sampling_rate
133
134show_sampling_rate_(min|max): THIS INTERFACE IS DEPRECATED, DON'T USE IT.
135You can use wider ranges now and the general
136cpuinfo_transition_latency variable (cmp. with user-guide.txt) can be
137used to obtain exactly the same info:
138show_sampling_rate_min = transtition_latency * 500 / 1000
139show_sampling_rate_max = transtition_latency * 500000 / 1000
140(divided by 1000 is to illustrate that sampling rate is in us and
141transition latency is exported ns).
124 142
125up_threshold: defines what the average CPU usage between the samplings 143up_threshold: defines what the average CPU usage between the samplings
126of 'sampling_rate' needs to be for the kernel to make a decision on 144of 'sampling_rate' needs to be for the kernel to make a decision on
diff --git a/Documentation/cpu-freq/user-guide.txt b/Documentation/cpu-freq/user-guide.txt
index 917918f84fc7..75f41193f3e1 100644
--- a/Documentation/cpu-freq/user-guide.txt
+++ b/Documentation/cpu-freq/user-guide.txt
@@ -152,6 +152,18 @@ cpuinfo_min_freq : this file shows the minimum operating
152 frequency the processor can run at(in kHz) 152 frequency the processor can run at(in kHz)
153cpuinfo_max_freq : this file shows the maximum operating 153cpuinfo_max_freq : this file shows the maximum operating
154 frequency the processor can run at(in kHz) 154 frequency the processor can run at(in kHz)
155cpuinfo_transition_latency The time it takes on this CPU to
156 switch between two frequencies in nano
157 seconds. If unknown or known to be
158 that high that the driver does not
159 work with the ondemand governor, -1
160 (CPUFREQ_ETERNAL) will be returned.
161 Using this information can be useful
162 to choose an appropriate polling
163 frequency for a kernel governor or
164 userspace daemon. Make sure to not
165 switch the frequency too often
166 resulting in performance loss.
155scaling_driver : this file shows what cpufreq driver is 167scaling_driver : this file shows what cpufreq driver is
156 used to set the frequency on this CPU 168 used to set the frequency on this CPU
157 169
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index 45932ec21cee..b41f3e58aefa 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -18,11 +18,11 @@ For an architecture to support this feature, it must define some of
18these macros in include/asm-XXX/topology.h: 18these macros in include/asm-XXX/topology.h:
19#define topology_physical_package_id(cpu) 19#define topology_physical_package_id(cpu)
20#define topology_core_id(cpu) 20#define topology_core_id(cpu)
21#define topology_thread_siblings(cpu) 21#define topology_thread_cpumask(cpu)
22#define topology_core_siblings(cpu) 22#define topology_core_cpumask(cpu)
23 23
24The type of **_id is int. 24The type of **_id is int.
25The type of siblings is cpumask_t. 25The type of siblings is (const) struct cpumask *.
26 26
27To be consistent on all architectures, include/linux/topology.h 27To be consistent on all architectures, include/linux/topology.h
28provides default definitions for any of the above macros that are 28provides default definitions for any of the above macros that are
diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index 2be08240ee80..53d64d382343 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -1,9 +1,9 @@
1 1
2 LINUX ALLOCATED DEVICES (2.6+ version) 2 LINUX ALLOCATED DEVICES (2.6+ version)
3 3
4 Maintained by Torben Mathiasen <device@lanana.org> 4 Maintained by Alan Cox <device@lanana.org>
5 5
6 Last revised: 29 November 2006 6 Last revised: 6th April 2009
7 7
8This list is the Linux Device List, the official registry of allocated 8This list is the Linux Device List, the official registry of allocated
9device numbers and /dev directory nodes for the Linux operating 9device numbers and /dev directory nodes for the Linux operating
@@ -67,6 +67,11 @@ up to date. Due to the number of registrations I have to maintain it
67in "batch mode", so there is likely additional registrations that 67in "batch mode", so there is likely additional registrations that
68haven't been listed yet. 68haven't been listed yet.
69 69
70Fourth, remember that Linux now has extensive support for dynamic allocation
71of device numbering and can use sysfs and udev to handle the naming needs.
72There are still some exceptions in the serial and boot device area. Before
73asking for a device number make sure you actually need one.
74
70Finally, sometimes I have to play "namespace police." Please don't be 75Finally, sometimes I have to play "namespace police." Please don't be
71offended. I often get submissions for /dev names that would be bound 76offended. I often get submissions for /dev names that would be bound
72to cause conflicts down the road. I am trying to avoid getting in a 77to cause conflicts down the road. I am trying to avoid getting in a
@@ -101,7 +106,7 @@ Your cooperation is appreciated.
101 0 = /dev/ram0 First RAM disk 106 0 = /dev/ram0 First RAM disk
102 1 = /dev/ram1 Second RAM disk 107 1 = /dev/ram1 Second RAM disk
103 ... 108 ...
104 250 = /dev/initrd Initial RAM disk {2.6} 109 250 = /dev/initrd Initial RAM disk
105 110
106 Older kernels had /dev/ramdisk (1, 1) here. 111 Older kernels had /dev/ramdisk (1, 1) here.
107 /dev/initrd refers to a RAM disk which was preloaded 112 /dev/initrd refers to a RAM disk which was preloaded
@@ -340,7 +345,7 @@ Your cooperation is appreciated.
340 14 = /dev/touchscreen/ucb1x00 UCB 1x00 touchscreen 345 14 = /dev/touchscreen/ucb1x00 UCB 1x00 touchscreen
341 15 = /dev/touchscreen/mk712 MK712 touchscreen 346 15 = /dev/touchscreen/mk712 MK712 touchscreen
342 128 = /dev/beep Fancy beep device 347 128 = /dev/beep Fancy beep device
343 129 = /dev/modreq Kernel module load request {2.6} 348 129 =
344 130 = /dev/watchdog Watchdog timer port 349 130 = /dev/watchdog Watchdog timer port
345 131 = /dev/temperature Machine internal temperature 350 131 = /dev/temperature Machine internal temperature
346 132 = /dev/hwtrap Hardware fault trap 351 132 = /dev/hwtrap Hardware fault trap
@@ -350,10 +355,10 @@ Your cooperation is appreciated.
350 139 = /dev/openprom SPARC OpenBoot PROM 355 139 = /dev/openprom SPARC OpenBoot PROM
351 140 = /dev/relay8 Berkshire Products Octal relay card 356 140 = /dev/relay8 Berkshire Products Octal relay card
352 141 = /dev/relay16 Berkshire Products ISO-16 relay card 357 141 = /dev/relay16 Berkshire Products ISO-16 relay card
353 142 = /dev/msr x86 model-specific registers {2.6} 358 142 =
354 143 = /dev/pciconf PCI configuration space 359 143 = /dev/pciconf PCI configuration space
355 144 = /dev/nvram Non-volatile configuration RAM 360 144 = /dev/nvram Non-volatile configuration RAM
356 145 = /dev/hfmodem Soundcard shortwave modem control {2.6} 361 145 = /dev/hfmodem Soundcard shortwave modem control
357 146 = /dev/graphics Linux/SGI graphics device 362 146 = /dev/graphics Linux/SGI graphics device
358 147 = /dev/opengl Linux/SGI OpenGL pipe 363 147 = /dev/opengl Linux/SGI OpenGL pipe
359 148 = /dev/gfx Linux/SGI graphics effects device 364 148 = /dev/gfx Linux/SGI graphics effects device
@@ -435,6 +440,9 @@ Your cooperation is appreciated.
435 228 = /dev/hpet HPET driver 440 228 = /dev/hpet HPET driver
436 229 = /dev/fuse Fuse (virtual filesystem in user-space) 441 229 = /dev/fuse Fuse (virtual filesystem in user-space)
437 230 = /dev/midishare MidiShare driver 442 230 = /dev/midishare MidiShare driver
443 231 = /dev/snapshot System memory snapshot device
444 232 = /dev/kvm Kernel-based virtual machine (hardware virtualization extensions)
445 233 = /dev/kmview View-OS A process with a view
438 240-254 Reserved for local use 446 240-254 Reserved for local use
439 255 Reserved for MISC_DYNAMIC_MINOR 447 255 Reserved for MISC_DYNAMIC_MINOR
440 448
@@ -466,10 +474,7 @@ Your cooperation is appreciated.
466 The device names specified are proposed -- if there 474 The device names specified are proposed -- if there
467 are "standard" names for these devices, please let me know. 475 are "standard" names for these devices, please let me know.
468 476
469 12 block MSCDEX CD-ROM callback support {2.6} 477 12 block
470 0 = /dev/dos_cd0 First MSCDEX CD-ROM
471 1 = /dev/dos_cd1 Second MSCDEX CD-ROM
472 ...
473 478
474 13 char Input core 479 13 char Input core
475 0 = /dev/input/js0 First joystick 480 0 = /dev/input/js0 First joystick
@@ -498,7 +503,7 @@ Your cooperation is appreciated.
498 2 = /dev/midi00 First MIDI port 503 2 = /dev/midi00 First MIDI port
499 3 = /dev/dsp Digital audio 504 3 = /dev/dsp Digital audio
500 4 = /dev/audio Sun-compatible digital audio 505 4 = /dev/audio Sun-compatible digital audio
501 6 = /dev/sndstat Sound card status information {2.6} 506 6 =
502 7 = /dev/audioctl SPARC audio control device 507 7 = /dev/audioctl SPARC audio control device
503 8 = /dev/sequencer2 Sequencer -- alternate device 508 8 = /dev/sequencer2 Sequencer -- alternate device
504 16 = /dev/mixer1 Second soundcard mixer control 509 16 = /dev/mixer1 Second soundcard mixer control
@@ -510,14 +515,7 @@ Your cooperation is appreciated.
510 34 = /dev/midi02 Third MIDI port 515 34 = /dev/midi02 Third MIDI port
511 50 = /dev/midi03 Fourth MIDI port 516 50 = /dev/midi03 Fourth MIDI port
512 517
513 14 block BIOS harddrive callback support {2.6} 518 14 block
514 0 = /dev/dos_hda First BIOS harddrive whole disk
515 64 = /dev/dos_hdb Second BIOS harddrive whole disk
516 128 = /dev/dos_hdc Third BIOS harddrive whole disk
517 192 = /dev/dos_hdd Fourth BIOS harddrive whole disk
518
519 Partitions are handled in the same way as IDE disks
520 (see major number 3).
521 519
522 15 char Joystick 520 15 char Joystick
523 0 = /dev/js0 First analog joystick 521 0 = /dev/js0 First analog joystick
@@ -535,14 +533,14 @@ Your cooperation is appreciated.
535 16 block GoldStar CD-ROM 533 16 block GoldStar CD-ROM
536 0 = /dev/gscd GoldStar CD-ROM 534 0 = /dev/gscd GoldStar CD-ROM
537 535
538 17 char Chase serial card 536 17 char OBSOLETE (was Chase serial card)
539 0 = /dev/ttyH0 First Chase port 537 0 = /dev/ttyH0 First Chase port
540 1 = /dev/ttyH1 Second Chase port 538 1 = /dev/ttyH1 Second Chase port
541 ... 539 ...
542 17 block Optics Storage CD-ROM 540 17 block Optics Storage CD-ROM
543 0 = /dev/optcd Optics Storage CD-ROM 541 0 = /dev/optcd Optics Storage CD-ROM
544 542
545 18 char Chase serial card - alternate devices 543 18 char OBSOLETE (was Chase serial card - alternate devices)
546 0 = /dev/cuh0 Callout device for ttyH0 544 0 = /dev/cuh0 Callout device for ttyH0
547 1 = /dev/cuh1 Callout device for ttyH1 545 1 = /dev/cuh1 Callout device for ttyH1
548 ... 546 ...
@@ -644,8 +642,7 @@ Your cooperation is appreciated.
644 2 = /dev/sbpcd2 Panasonic CD-ROM controller 0 unit 2 642 2 = /dev/sbpcd2 Panasonic CD-ROM controller 0 unit 2
645 3 = /dev/sbpcd3 Panasonic CD-ROM controller 0 unit 3 643 3 = /dev/sbpcd3 Panasonic CD-ROM controller 0 unit 3
646 644
647 26 char Quanta WinVision frame grabber {2.6} 645 26 char
648 0 = /dev/wvisfgrab Quanta WinVision frame grabber
649 646
650 26 block Second Matsushita (Panasonic/SoundBlaster) CD-ROM 647 26 block Second Matsushita (Panasonic/SoundBlaster) CD-ROM
651 0 = /dev/sbpcd4 Panasonic CD-ROM controller 1 unit 0 648 0 = /dev/sbpcd4 Panasonic CD-ROM controller 1 unit 0
@@ -872,7 +869,7 @@ Your cooperation is appreciated.
872 and "user level packet I/O." This board is also 869 and "user level packet I/O." This board is also
873 accessible as a standard networking "eth" device. 870 accessible as a standard networking "eth" device.
874 871
875 38 block Reserved for Linux/AP+ 872 38 block OBSOLETE (was Linux/AP+)
876 873
877 39 char ML-16P experimental I/O board 874 39 char ML-16P experimental I/O board
878 0 = /dev/ml16pa-a0 First card, first analog channel 875 0 = /dev/ml16pa-a0 First card, first analog channel
@@ -892,29 +889,16 @@ Your cooperation is appreciated.
892 50 = /dev/ml16pb-c1 Second card, second counter/timer 889 50 = /dev/ml16pb-c1 Second card, second counter/timer
893 51 = /dev/ml16pb-c2 Second card, third counter/timer 890 51 = /dev/ml16pb-c2 Second card, third counter/timer
894 ... 891 ...
895 39 block Reserved for Linux/AP+ 892 39 block
896 893
897 40 char Matrox Meteor frame grabber {2.6} 894 40 char
898 0 = /dev/mmetfgrab Matrox Meteor frame grabber
899 895
900 40 block Syquest EZ135 parallel port removable drive 896 40 block
901 0 = /dev/eza Parallel EZ135 drive, whole disk
902
903 This device is obsolete and will be removed in a
904 future version of Linux. It has been replaced with
905 the parallel port IDE disk driver at major number 45.
906 Partitions are handled in the same way as IDE disks
907 (see major number 3).
908 897
909 41 char Yet Another Micro Monitor 898 41 char Yet Another Micro Monitor
910 0 = /dev/yamm Yet Another Micro Monitor 899 0 = /dev/yamm Yet Another Micro Monitor
911 900
912 41 block MicroSolutions BackPack parallel port CD-ROM 901 41 block
913 0 = /dev/bpcd BackPack CD-ROM
914
915 This device is obsolete and will be removed in a
916 future version of Linux. It has been replaced with
917 the parallel port ATAPI CD-ROM driver at major number 46.
918 902
919 42 char Demo/sample use 903 42 char Demo/sample use
920 904
@@ -1681,13 +1665,7 @@ Your cooperation is appreciated.
1681 disks (see major number 3) except that the limit on 1665 disks (see major number 3) except that the limit on
1682 partitions is 15. 1666 partitions is 15.
1683 1667
1684 93 char IBM Smart Capture Card frame grabber {2.6} 1668 93 char
1685 0 = /dev/iscc0 First Smart Capture Card
1686 1 = /dev/iscc1 Second Smart Capture Card
1687 ...
1688 128 = /dev/isccctl0 First Smart Capture Card control
1689 129 = /dev/isccctl1 Second Smart Capture Card control
1690 ...
1691 1669
1692 93 block NAND Flash Translation Layer filesystem 1670 93 block NAND Flash Translation Layer filesystem
1693 0 = /dev/nftla First NFTL layer 1671 0 = /dev/nftla First NFTL layer
@@ -1695,10 +1673,7 @@ Your cooperation is appreciated.
1695 ... 1673 ...
1696 240 = /dev/nftlp 16th NTFL layer 1674 240 = /dev/nftlp 16th NTFL layer
1697 1675
1698 94 char miroVIDEO DC10/30 capture/playback device {2.6} 1676 94 char
1699 0 = /dev/dcxx0 First capture card
1700 1 = /dev/dcxx1 Second capture card
1701 ...
1702 1677
1703 94 block IBM S/390 DASD block storage 1678 94 block IBM S/390 DASD block storage
1704 0 = /dev/dasda First DASD device, major 1679 0 = /dev/dasda First DASD device, major
@@ -1791,11 +1766,7 @@ Your cooperation is appreciated.
1791 ... 1766 ...
1792 15 = /dev/amiraid/ar?p15 15th partition 1767 15 = /dev/amiraid/ar?p15 15th partition
1793 1768
1794102 char Philips SAA5249 Teletext signal decoder {2.6} 1769102 char
1795 0 = /dev/tlk0 First Teletext decoder
1796 1 = /dev/tlk1 Second Teletext decoder
1797 2 = /dev/tlk2 Third Teletext decoder
1798 3 = /dev/tlk3 Fourth Teletext decoder
1799 1770
1800102 block Compressed block device 1771102 block Compressed block device
1801 0 = /dev/cbd/a First compressed block device, whole device 1772 0 = /dev/cbd/a First compressed block device, whole device
@@ -1916,10 +1887,7 @@ Your cooperation is appreciated.
1916 DAC960 (see major number 48) except that the limit on 1887 DAC960 (see major number 48) except that the limit on
1917 partitions is 15. 1888 partitions is 15.
1918 1889
1919111 char Philips SAA7146-based audio/video card {2.6} 1890111 char
1920 0 = /dev/av0 First A/V card
1921 1 = /dev/av1 Second A/V card
1922 ...
1923 1891
1924111 block Compaq Next Generation Drive Array, eighth controller 1892111 block Compaq Next Generation Drive Array, eighth controller
1925 0 = /dev/cciss/c7d0 First logical drive, whole disk 1893 0 = /dev/cciss/c7d0 First logical drive, whole disk
@@ -2079,8 +2047,8 @@ Your cooperation is appreciated.
2079 ... 2047 ...
2080 2048
2081119 char VMware virtual network control 2049119 char VMware virtual network control
2082 0 = /dev/vmnet0 1st virtual network 2050 0 = /dev/vnet0 1st virtual network
2083 1 = /dev/vmnet1 2nd virtual network 2051 1 = /dev/vnet1 2nd virtual network
2084 ... 2052 ...
2085 2053
2086120-127 char LOCAL/EXPERIMENTAL USE 2054120-127 char LOCAL/EXPERIMENTAL USE
@@ -2450,7 +2418,7 @@ Your cooperation is appreciated.
2450 2 = /dev/raw/raw2 Second raw I/O device 2418 2 = /dev/raw/raw2 Second raw I/O device
2451 ... 2419 ...
2452 2420
2453163 char UNASSIGNED (was Radio Tech BIM-XXX-RS232 radio modem - see 51) 2421163 char
2454 2422
2455164 char Chase Research AT/PCI-Fast serial card 2423164 char Chase Research AT/PCI-Fast serial card
2456 0 = /dev/ttyCH0 AT/PCI-Fast board 0, port 0 2424 0 = /dev/ttyCH0 AT/PCI-Fast board 0, port 0
@@ -2542,6 +2510,12 @@ Your cooperation is appreciated.
2542 1 = /dev/clanvi1 Second cLAN adapter 2510 1 = /dev/clanvi1 Second cLAN adapter
2543 ... 2511 ...
2544 2512
2513179 block MMC block devices
2514 0 = /dev/mmcblk0 First SD/MMC card
2515 1 = /dev/mmcblk0p1 First partition on first MMC card
2516 8 = /dev/mmcblk1 Second SD/MMC card
2517 ...
2518
2545179 char CCube DVXChip-based PCI products 2519179 char CCube DVXChip-based PCI products
2546 0 = /dev/dvxirq0 First DVX device 2520 0 = /dev/dvxirq0 First DVX device
2547 1 = /dev/dvxirq1 Second DVX device 2521 1 = /dev/dvxirq1 Second DVX device
@@ -2560,6 +2534,9 @@ Your cooperation is appreciated.
2560 96 = /dev/usb/hiddev0 1st USB HID device 2534 96 = /dev/usb/hiddev0 1st USB HID device
2561 ... 2535 ...
2562 111 = /dev/usb/hiddev15 16th USB HID device 2536 111 = /dev/usb/hiddev15 16th USB HID device
2537 112 = /dev/usb/auer0 1st auerswald ISDN device
2538 ...
2539 127 = /dev/usb/auer15 16th auerswald ISDN device
2563 128 = /dev/usb/brlvgr0 First Braille Voyager device 2540 128 = /dev/usb/brlvgr0 First Braille Voyager device
2564 ... 2541 ...
2565 131 = /dev/usb/brlvgr3 Fourth Braille Voyager device 2542 131 = /dev/usb/brlvgr3 Fourth Braille Voyager device
@@ -2810,6 +2787,20 @@ Your cooperation is appreciated.
2810 ... 2787 ...
2811 190 = /dev/ttyUL3 Xilinx uartlite - port 3 2788 190 = /dev/ttyUL3 Xilinx uartlite - port 3
2812 191 = /dev/xvc0 Xen virtual console - port 0 2789 191 = /dev/xvc0 Xen virtual console - port 0
2790 192 = /dev/ttyPZ0 pmac_zilog - port 0
2791 ...
2792 195 = /dev/ttyPZ3 pmac_zilog - port 3
2793 196 = /dev/ttyTX0 TX39/49 serial port 0
2794 ...
2795 204 = /dev/ttyTX7 TX39/49 serial port 7
2796 205 = /dev/ttySC0 SC26xx serial port 0
2797 206 = /dev/ttySC1 SC26xx serial port 1
2798 207 = /dev/ttySC2 SC26xx serial port 2
2799 208 = /dev/ttySC3 SC26xx serial port 3
2800 209 = /dev/ttyMAX0 MAX3100 serial port 0
2801 210 = /dev/ttyMAX1 MAX3100 serial port 1
2802 211 = /dev/ttyMAX2 MAX3100 serial port 2
2803 212 = /dev/ttyMAX3 MAX3100 serial port 3
2813 2804
2814205 char Low-density serial ports (alternate device) 2805205 char Low-density serial ports (alternate device)
2815 0 = /dev/culu0 Callout device for ttyLU0 2806 0 = /dev/culu0 Callout device for ttyLU0
@@ -3145,6 +3136,20 @@ Your cooperation is appreciated.
3145 1 = /dev/blockrom1 Second ROM card's translation layer interface 3136 1 = /dev/blockrom1 Second ROM card's translation layer interface
3146 ... 3137 ...
3147 3138
3139259 block Block Extended Major
3140 Used dynamically to hold additional partition minor
3141 numbers and allow large numbers of partitions per device
3142
3143259 char FPGA configuration interfaces
3144 0 = /dev/icap0 First Xilinx internal configuration
3145 1 = /dev/icap1 Second Xilinx internal configuration
3146
3147260 char OSD (Object-based-device) SCSI Device
3148 0 = /dev/osd0 First OSD Device
3149 1 = /dev/osd1 Second OSD Device
3150 ...
3151 255 = /dev/osd255 256th OSD Device
3152
3148 **** ADDITIONAL /dev DIRECTORY ENTRIES 3153 **** ADDITIONAL /dev DIRECTORY ENTRIES
3149 3154
3150This section details additional entries that should or may exist in 3155This section details additional entries that should or may exist in
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 1e89a51ea49b..88519daab6e9 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -62,7 +62,6 @@ aic7*reg_print.c*
62aic7*seq.h* 62aic7*seq.h*
63aicasm 63aicasm
64aicdb.h* 64aicdb.h*
65asm
66asm-offsets.h 65asm-offsets.h
67asm_offsets.h 66asm_offsets.h
68autoconf.h* 67autoconf.h*
diff --git a/Documentation/driver-model/platform.txt b/Documentation/driver-model/platform.txt
index 83009fdcbbc8..2e2c2ea90ceb 100644
--- a/Documentation/driver-model/platform.txt
+++ b/Documentation/driver-model/platform.txt
@@ -169,3 +169,62 @@ three different ways to find such a match:
169 be probed later if another device registers. (Which is OK, since 169 be probed later if another device registers. (Which is OK, since
170 this interface is only for use with non-hotpluggable devices.) 170 this interface is only for use with non-hotpluggable devices.)
171 171
172
173Early Platform Devices and Drivers
174~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
175The early platform interfaces provide platform data to platform device
176drivers early on during the system boot. The code is built on top of the
177early_param() command line parsing and can be executed very early on.
178
179Example: "earlyprintk" class early serial console in 6 steps
180
1811. Registering early platform device data
182~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
183The architecture code registers platform device data using the function
184early_platform_add_devices(). In the case of early serial console this
185should be hardware configuration for the serial port. Devices registered
186at this point will later on be matched against early platform drivers.
187
1882. Parsing kernel command line
189~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
190The architecture code calls parse_early_param() to parse the kernel
191command line. This will execute all matching early_param() callbacks.
192User specified early platform devices will be registered at this point.
193For the early serial console case the user can specify port on the
194kernel command line as "earlyprintk=serial.0" where "earlyprintk" is
195the class string, "serial" is the name of the platfrom driver and
1960 is the platform device id. If the id is -1 then the dot and the
197id can be omitted.
198
1993. Installing early platform drivers belonging to a certain class
200~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
201The architecture code may optionally force registration of all early
202platform drivers belonging to a certain class using the function
203early_platform_driver_register_all(). User specified devices from
204step 2 have priority over these. This step is omitted by the serial
205driver example since the early serial driver code should be disabled
206unless the user has specified port on the kernel command line.
207
2084. Early platform driver registration
209~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
210Compiled-in platform drivers making use of early_platform_init() are
211automatically registered during step 2 or 3. The serial driver example
212should use early_platform_init("earlyprintk", &platform_driver).
213
2145. Probing of early platform drivers belonging to a certain class
215~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
216The architecture code calls early_platform_driver_probe() to match
217registered early platform devices associated with a certain class with
218registered early platform drivers. Matched devices will get probed().
219This step can be executed at any point during the early boot. As soon
220as possible may be good for the serial port case.
221
2226. Inside the early platform driver probe()
223~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
224The driver code needs to take special care during early boot, especially
225when it comes to memory allocation and interrupt registration. The code
226in the probe() function can use is_early_platform_device() to check if
227it is called at early platform device or at the regular platform device
228time. The early serial driver performs register_console() at this point.
229
230For further information, see <linux/platform_device.h>.
diff --git a/Documentation/dvb/get_dvb_firmware b/Documentation/dvb/get_dvb_firmware
index f2e908d7f90d..2f21ecd4c205 100644
--- a/Documentation/dvb/get_dvb_firmware
+++ b/Documentation/dvb/get_dvb_firmware
@@ -25,7 +25,7 @@ use IO::Handle;
25 "tda10046lifeview", "av7110", "dec2000t", "dec2540t", 25 "tda10046lifeview", "av7110", "dec2000t", "dec2540t",
26 "dec3000s", "vp7041", "dibusb", "nxt2002", "nxt2004", 26 "dec3000s", "vp7041", "dibusb", "nxt2002", "nxt2004",
27 "or51211", "or51132_qam", "or51132_vsb", "bluebird", 27 "or51211", "or51132_qam", "or51132_vsb", "bluebird",
28 "opera1"); 28 "opera1", "cx231xx", "cx18", "cx23885", "pvrusb2" );
29 29
30# Check args 30# Check args
31syntax() if (scalar(@ARGV) != 1); 31syntax() if (scalar(@ARGV) != 1);
@@ -37,8 +37,8 @@ for ($i=0; $i < scalar(@components); $i++) {
37 $outfile = eval($cid); 37 $outfile = eval($cid);
38 die $@ if $@; 38 die $@ if $@;
39 print STDERR <<EOF; 39 print STDERR <<EOF;
40Firmware $outfile extracted successfully. 40Firmware(s) $outfile extracted successfully.
41Now copy it to either /usr/lib/hotplug/firmware or /lib/firmware 41Now copy it(they) to either /usr/lib/hotplug/firmware or /lib/firmware
42(depending on configuration of firmware hotplug). 42(depending on configuration of firmware hotplug).
43EOF 43EOF
44 exit(0); 44 exit(0);
@@ -345,6 +345,85 @@ sub or51211 {
345 $fwfile; 345 $fwfile;
346} 346}
347 347
348sub cx231xx {
349 my $fwfile = "v4l-cx231xx-avcore-01.fw";
350 my $url = "http://linuxtv.org/downloads/firmware/$fwfile";
351 my $hash = "7d3bb956dc9df0eafded2b56ba57cc42";
352
353 checkstandard();
354
355 wgetfile($fwfile, $url);
356 verify($fwfile, $hash);
357
358 $fwfile;
359}
360
361sub cx18 {
362 my $url = "http://linuxtv.org/downloads/firmware/";
363
364 my %files = (
365 'v4l-cx23418-apu.fw' => '588f081b562f5c653a3db1ad8f65939a',
366 'v4l-cx23418-cpu.fw' => 'b6c7ed64bc44b1a6e0840adaeac39d79',
367 'v4l-cx23418-dig.fw' => '95bc688d3e7599fd5800161e9971cc55',
368 );
369
370 checkstandard();
371
372 my $allfiles;
373 foreach my $fwfile (keys %files) {
374 wgetfile($fwfile, "$url/$fwfile");
375 verify($fwfile, $files{$fwfile});
376 $allfiles .= " $fwfile";
377 }
378
379 $allfiles =~ s/^\s//;
380
381 $allfiles;
382}
383
384sub cx23885 {
385 my $url = "http://linuxtv.org/downloads/firmware/";
386
387 my %files = (
388 'v4l-cx23885-avcore-01.fw' => 'a9f8f5d901a7fb42f552e1ee6384f3bb',
389 'v4l-cx23885-enc.fw' => 'a9f8f5d901a7fb42f552e1ee6384f3bb',
390 );
391
392 checkstandard();
393
394 my $allfiles;
395 foreach my $fwfile (keys %files) {
396 wgetfile($fwfile, "$url/$fwfile");
397 verify($fwfile, $files{$fwfile});
398 $allfiles .= " $fwfile";
399 }
400
401 $allfiles =~ s/^\s//;
402
403 $allfiles;
404}
405
406sub pvrusb2 {
407 my $url = "http://linuxtv.org/downloads/firmware/";
408
409 my %files = (
410 'v4l-cx25840.fw' => 'dadb79e9904fc8af96e8111d9cb59320',
411 );
412
413 checkstandard();
414
415 my $allfiles;
416 foreach my $fwfile (keys %files) {
417 wgetfile($fwfile, "$url/$fwfile");
418 verify($fwfile, $files{$fwfile});
419 $allfiles .= " $fwfile";
420 }
421
422 $allfiles =~ s/^\s//;
423
424 $allfiles;
425}
426
348sub or51132_qam { 427sub or51132_qam {
349 my $fwfile = "dvb-fe-or51132-qam.fw"; 428 my $fwfile = "dvb-fe-or51132-qam.fw";
350 my $url = "http://linuxtv.org/downloads/firmware/$fwfile"; 429 my $url = "http://linuxtv.org/downloads/firmware/$fwfile";
diff --git a/Documentation/dynamic-debug-howto.txt b/Documentation/dynamic-debug-howto.txt
new file mode 100644
index 000000000000..674c5663d346
--- /dev/null
+++ b/Documentation/dynamic-debug-howto.txt
@@ -0,0 +1,240 @@
1
2Introduction
3============
4
5This document describes how to use the dynamic debug (ddebug) feature.
6
7Dynamic debug is designed to allow you to dynamically enable/disable kernel
8code to obtain additional kernel information. Currently, if
9CONFIG_DYNAMIC_DEBUG is set, then all pr_debug()/dev_debug() calls can be
10dynamically enabled per-callsite.
11
12Dynamic debug has even more useful features:
13
14 * Simple query language allows turning on and off debugging statements by
15 matching any combination of:
16
17 - source filename
18 - function name
19 - line number (including ranges of line numbers)
20 - module name
21 - format string
22
23 * Provides a debugfs control file: <debugfs>/dynamic_debug/control which can be
24 read to display the complete list of known debug statements, to help guide you
25
26Controlling dynamic debug Behaviour
27===============================
28
29The behaviour of pr_debug()/dev_debug()s are controlled via writing to a
30control file in the 'debugfs' filesystem. Thus, you must first mount the debugfs
31filesystem, in order to make use of this feature. Subsequently, we refer to the
32control file as: <debugfs>/dynamic_debug/control. For example, if you want to
33enable printing from source file 'svcsock.c', line 1603 you simply do:
34
35nullarbor:~ # echo 'file svcsock.c line 1603 +p' >
36 <debugfs>/dynamic_debug/control
37
38If you make a mistake with the syntax, the write will fail thus:
39
40nullarbor:~ # echo 'file svcsock.c wtf 1 +p' >
41 <debugfs>/dynamic_debug/control
42-bash: echo: write error: Invalid argument
43
44Viewing Dynamic Debug Behaviour
45===========================
46
47You can view the currently configured behaviour of all the debug statements
48via:
49
50nullarbor:~ # cat <debugfs>/dynamic_debug/control
51# filename:lineno [module]function flags format
52/usr/src/packages/BUILD/sgi-enhancednfs-1.4/default/net/sunrpc/svc_rdma.c:323 [svcxprt_rdma]svc_rdma_cleanup - "SVCRDMA Module Removed, deregister RPC RDMA transport\012"
53/usr/src/packages/BUILD/sgi-enhancednfs-1.4/default/net/sunrpc/svc_rdma.c:341 [svcxprt_rdma]svc_rdma_init - "\011max_inline : %d\012"
54/usr/src/packages/BUILD/sgi-enhancednfs-1.4/default/net/sunrpc/svc_rdma.c:340 [svcxprt_rdma]svc_rdma_init - "\011sq_depth : %d\012"
55/usr/src/packages/BUILD/sgi-enhancednfs-1.4/default/net/sunrpc/svc_rdma.c:338 [svcxprt_rdma]svc_rdma_init - "\011max_requests : %d\012"
56...
57
58
59You can also apply standard Unix text manipulation filters to this
60data, e.g.
61
62nullarbor:~ # grep -i rdma <debugfs>/dynamic_debug/control | wc -l
6362
64
65nullarbor:~ # grep -i tcp <debugfs>/dynamic_debug/control | wc -l
6642
67
68Note in particular that the third column shows the enabled behaviour
69flags for each debug statement callsite (see below for definitions of the
70flags). The default value, no extra behaviour enabled, is "-". So
71you can view all the debug statement callsites with any non-default flags:
72
73nullarbor:~ # awk '$3 != "-"' <debugfs>/dynamic_debug/control
74# filename:lineno [module]function flags format
75/usr/src/packages/BUILD/sgi-enhancednfs-1.4/default/net/sunrpc/svcsock.c:1603 [sunrpc]svc_send p "svc_process: st_sendto returned %d\012"
76
77
78Command Language Reference
79==========================
80
81At the lexical level, a command comprises a sequence of words separated
82by whitespace characters. Note that newlines are treated as word
83separators and do *not* end a command or allow multiple commands to
84be done together. So these are all equivalent:
85
86nullarbor:~ # echo -c 'file svcsock.c line 1603 +p' >
87 <debugfs>/dynamic_debug/control
88nullarbor:~ # echo -c ' file svcsock.c line 1603 +p ' >
89 <debugfs>/dynamic_debug/control
90nullarbor:~ # echo -c 'file svcsock.c\nline 1603 +p' >
91 <debugfs>/dynamic_debug/control
92nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' >
93 <debugfs>/dynamic_debug/control
94
95Commands are bounded by a write() system call. If you want to do
96multiple commands you need to do a separate "echo" for each, like:
97
98nullarbor:~ # echo 'file svcsock.c line 1603 +p' > /proc/dprintk ;\
99> echo 'file svcsock.c line 1563 +p' > /proc/dprintk
100
101or even like:
102
103nullarbor:~ # (
104> echo 'file svcsock.c line 1603 +p' ;\
105> echo 'file svcsock.c line 1563 +p' ;\
106> ) > /proc/dprintk
107
108At the syntactical level, a command comprises a sequence of match
109specifications, followed by a flags change specification.
110
111command ::= match-spec* flags-spec
112
113The match-spec's are used to choose a subset of the known dprintk()
114callsites to which to apply the flags-spec. Think of them as a query
115with implicit ANDs between each pair. Note that an empty list of
116match-specs is possible, but is not very useful because it will not
117match any debug statement callsites.
118
119A match specification comprises a keyword, which controls the attribute
120of the callsite to be compared, and a value to compare against. Possible
121keywords are:
122
123match-spec ::= 'func' string |
124 'file' string |
125 'module' string |
126 'format' string |
127 'line' line-range
128
129line-range ::= lineno |
130 '-'lineno |
131 lineno'-' |
132 lineno'-'lineno
133// Note: line-range cannot contain space, e.g.
134// "1-30" is valid range but "1 - 30" is not.
135
136lineno ::= unsigned-int
137
138The meanings of each keyword are:
139
140func
141 The given string is compared against the function name
142 of each callsite. Example:
143
144 func svc_tcp_accept
145
146file
147 The given string is compared against either the full
148 pathname or the basename of the source file of each
149 callsite. Examples:
150
151 file svcsock.c
152 file /usr/src/packages/BUILD/sgi-enhancednfs-1.4/default/net/sunrpc/svcsock.c
153
154module
155 The given string is compared against the module name
156 of each callsite. The module name is the string as
157 seen in "lsmod", i.e. without the directory or the .ko
158 suffix and with '-' changed to '_'. Examples:
159
160 module sunrpc
161 module nfsd
162
163format
164 The given string is searched for in the dynamic debug format
165 string. Note that the string does not need to match the
166 entire format, only some part. Whitespace and other
167 special characters can be escaped using C octal character
168 escape \ooo notation, e.g. the space character is \040.
169 Alternatively, the string can be enclosed in double quote
170 characters (") or single quote characters (').
171 Examples:
172
173 format svcrdma: // many of the NFS/RDMA server dprintks
174 format readahead // some dprintks in the readahead cache
175 format nfsd:\040SETATTR // one way to match a format with whitespace
176 format "nfsd: SETATTR" // a neater way to match a format with whitespace
177 format 'nfsd: SETATTR' // yet another way to match a format with whitespace
178
179line
180 The given line number or range of line numbers is compared
181 against the line number of each dprintk() callsite. A single
182 line number matches the callsite line number exactly. A
183 range of line numbers matches any callsite between the first
184 and last line number inclusive. An empty first number means
185 the first line in the file, an empty line number means the
186 last number in the file. Examples:
187
188 line 1603 // exactly line 1603
189 line 1600-1605 // the six lines from line 1600 to line 1605
190 line -1605 // the 1605 lines from line 1 to line 1605
191 line 1600- // all lines from line 1600 to the end of the file
192
193The flags specification comprises a change operation followed
194by one or more flag characters. The change operation is one
195of the characters:
196
197-
198 remove the given flags
199
200+
201 add the given flags
202
203=
204 set the flags to the given flags
205
206The flags are:
207
208p
209 Causes a printk() message to be emitted to dmesg
210
211Note the regexp ^[-+=][scp]+$ matches a flags specification.
212Note also that there is no convenient syntax to remove all
213the flags at once, you need to use "-psc".
214
215Examples
216========
217
218// enable the message at line 1603 of file svcsock.c
219nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' >
220 <debugfs>/dynamic_debug/control
221
222// enable all the messages in file svcsock.c
223nullarbor:~ # echo -n 'file svcsock.c +p' >
224 <debugfs>/dynamic_debug/control
225
226// enable all the messages in the NFS server module
227nullarbor:~ # echo -n 'module nfsd +p' >
228 <debugfs>/dynamic_debug/control
229
230// enable all 12 messages in the function svc_process()
231nullarbor:~ # echo -n 'func svc_process +p' >
232 <debugfs>/dynamic_debug/control
233
234// disable all 12 messages in the function svc_process()
235nullarbor:~ # echo -n 'func svc_process -p' >
236 <debugfs>/dynamic_debug/control
237
238// enable messages for NFS calls READ, READLINK, READDIR and READDIR+.
239nullarbor:~ # echo -n 'format "nfsd: READ" +p' >
240 <debugfs>/dynamic_debug/control
diff --git a/Documentation/fb/00-INDEX b/Documentation/fb/00-INDEX
index caabbd395e61..a618fd99c9f0 100644
--- a/Documentation/fb/00-INDEX
+++ b/Documentation/fb/00-INDEX
@@ -11,8 +11,6 @@ aty128fb.txt
11 - info on the ATI Rage128 frame buffer driver. 11 - info on the ATI Rage128 frame buffer driver.
12cirrusfb.txt 12cirrusfb.txt
13 - info on the driver for Cirrus Logic chipsets. 13 - info on the driver for Cirrus Logic chipsets.
14cyblafb/
15 - directory with documentation files related to the cyblafb driver.
16deferred_io.txt 14deferred_io.txt
17 - an introduction to deferred IO. 15 - an introduction to deferred IO.
18fbcon.txt 16fbcon.txt
diff --git a/Documentation/fb/cyblafb/bugs b/Documentation/fb/cyblafb/bugs
deleted file mode 100644
index 9443a6d72cdd..000000000000
--- a/Documentation/fb/cyblafb/bugs
+++ /dev/null
@@ -1,13 +0,0 @@
1Bugs
2====
3
4I currently don't know of any bug. Please do send reports to:
5 - linux-fbdev-devel@lists.sourceforge.net
6 - Knut_Petersen@t-online.de.
7
8
9Untested features
10=================
11
12All LCD stuff is untested. If it worked in tridentfb, it should work in
13cyblafb. Please test and report the results to Knut_Petersen@t-online.de.
diff --git a/Documentation/fb/cyblafb/credits b/Documentation/fb/cyblafb/credits
deleted file mode 100644
index 0eb3b443dc2b..000000000000
--- a/Documentation/fb/cyblafb/credits
+++ /dev/null
@@ -1,7 +0,0 @@
1Thanks to
2=========
3 * Alan Hourihane, for writing the X trident driver
4 * Jani Monoses, for writing the tridentfb driver
5 * Antonino A. Daplas, for review of the first published
6 version of cyblafb and some code
7 * Jochen Hein, for testing and a helpfull bug report
diff --git a/Documentation/fb/cyblafb/documentation b/Documentation/fb/cyblafb/documentation
deleted file mode 100644
index bb1aac048425..000000000000
--- a/Documentation/fb/cyblafb/documentation
+++ /dev/null
@@ -1,17 +0,0 @@
1Available Documentation
2=======================
3
4Apollo PLE 133 Chipset VT8601A North Bridge Datasheet, Rev. 1.82, October 22,
52001, available from VIA:
6
7 http://www.viavpsd.com/product/6/15/DS8601A182.pdf
8
9The datasheet is incomplete, some registers that need to be programmed are not
10explained at all and important bits are listed as "reserved". But you really
11need the datasheet to understand the code. "p. xxx" comments refer to page
12numbers of this document.
13
14XFree/XOrg drivers are available and of good quality, looking at the code
15there is a good idea if the datasheet does not provide enough information
16or if the datasheet seems to be wrong.
17
diff --git a/Documentation/fb/cyblafb/fb.modes b/Documentation/fb/cyblafb/fb.modes
deleted file mode 100644
index fe0e5223ba86..000000000000
--- a/Documentation/fb/cyblafb/fb.modes
+++ /dev/null
@@ -1,154 +0,0 @@
1#
2# Sample fb.modes file
3#
4# Provides an incomplete list of working modes for
5# the cyberblade/i1 graphics core.
6#
7# The value 4294967256 is used instead of -40. Of course, -40 is not
8# a really reasonable value, but chip design does not always follow
9# logic. Believe me, it's ok, and it's the way the BIOS does it.
10#
11# fbset requires 4294967256 in fb.modes and -40 as an argument to
12# the -t parameter. That's also not too reasonable, and it might change
13# in the future or might even be differt for your current version.
14#
15
16mode "640x480-50"
17 geometry 640 480 2048 4096 8
18 timings 47619 4294967256 24 17 0 216 3
19endmode
20
21mode "640x480-60"
22 geometry 640 480 2048 4096 8
23 timings 39682 4294967256 24 17 0 216 3
24endmode
25
26mode "640x480-70"
27 geometry 640 480 2048 4096 8
28 timings 34013 4294967256 24 17 0 216 3
29endmode
30
31mode "640x480-72"
32 geometry 640 480 2048 4096 8
33 timings 33068 4294967256 24 17 0 216 3
34endmode
35
36mode "640x480-75"
37 geometry 640 480 2048 4096 8
38 timings 31746 4294967256 24 17 0 216 3
39endmode
40
41mode "640x480-80"
42 geometry 640 480 2048 4096 8
43 timings 29761 4294967256 24 17 0 216 3
44endmode
45
46mode "640x480-85"
47 geometry 640 480 2048 4096 8
48 timings 28011 4294967256 24 17 0 216 3
49endmode
50
51mode "800x600-50"
52 geometry 800 600 2048 4096 8
53 timings 30303 96 24 14 0 136 11
54endmode
55
56mode "800x600-60"
57 geometry 800 600 2048 4096 8
58 timings 25252 96 24 14 0 136 11
59endmode
60
61mode "800x600-70"
62 geometry 800 600 2048 4096 8
63 timings 21645 96 24 14 0 136 11
64endmode
65
66mode "800x600-72"
67 geometry 800 600 2048 4096 8
68 timings 21043 96 24 14 0 136 11
69endmode
70
71mode "800x600-75"
72 geometry 800 600 2048 4096 8
73 timings 20202 96 24 14 0 136 11
74endmode
75
76mode "800x600-80"
77 geometry 800 600 2048 4096 8
78 timings 18939 96 24 14 0 136 11
79endmode
80
81mode "800x600-85"
82 geometry 800 600 2048 4096 8
83 timings 17825 96 24 14 0 136 11
84endmode
85
86mode "1024x768-50"
87 geometry 1024 768 2048 4096 8
88 timings 19054 144 24 29 0 120 3
89endmode
90
91mode "1024x768-60"
92 geometry 1024 768 2048 4096 8
93 timings 15880 144 24 29 0 120 3
94endmode
95
96mode "1024x768-70"
97 geometry 1024 768 2048 4096 8
98 timings 13610 144 24 29 0 120 3
99endmode
100
101mode "1024x768-72"
102 geometry 1024 768 2048 4096 8
103 timings 13232 144 24 29 0 120 3
104endmode
105
106mode "1024x768-75"
107 geometry 1024 768 2048 4096 8
108 timings 12703 144 24 29 0 120 3
109endmode
110
111mode "1024x768-80"
112 geometry 1024 768 2048 4096 8
113 timings 11910 144 24 29 0 120 3
114endmode
115
116mode "1024x768-85"
117 geometry 1024 768 2048 4096 8
118 timings 11209 144 24 29 0 120 3
119endmode
120
121mode "1280x1024-50"
122 geometry 1280 1024 2048 4096 8
123 timings 11114 232 16 39 0 160 3
124endmode
125
126mode "1280x1024-60"
127 geometry 1280 1024 2048 4096 8
128 timings 9262 232 16 39 0 160 3
129endmode
130
131mode "1280x1024-70"
132 geometry 1280 1024 2048 4096 8
133 timings 7939 232 16 39 0 160 3
134endmode
135
136mode "1280x1024-72"
137 geometry 1280 1024 2048 4096 8
138 timings 7719 232 16 39 0 160 3
139endmode
140
141mode "1280x1024-75"
142 geometry 1280 1024 2048 4096 8
143 timings 7410 232 16 39 0 160 3
144endmode
145
146mode "1280x1024-80"
147 geometry 1280 1024 2048 4096 8
148 timings 6946 232 16 39 0 160 3
149endmode
150
151mode "1280x1024-85"
152 geometry 1280 1024 2048 4096 8
153 timings 6538 232 16 39 0 160 3
154endmode
diff --git a/Documentation/fb/cyblafb/performance b/Documentation/fb/cyblafb/performance
deleted file mode 100644
index 8d15d5dfc6b3..000000000000
--- a/Documentation/fb/cyblafb/performance
+++ /dev/null
@@ -1,79 +0,0 @@
1Speed
2=====
3
4CyBlaFB is much faster than tridentfb and vesafb. Compare the performance data
5for mode 1280x1024-[8,16,32]@61 Hz.
6
7Test 1: Cat a file with 2000 lines of 0 characters.
8Test 2: Cat a file with 2000 lines of 80 characters.
9Test 3: Cat a file with 2000 lines of 160 characters.
10
11All values show system time use in seconds, kernel 2.6.12 was used for
12the measurements. 2.6.13 is a bit slower, 2.6.14 hopefully will include a
13patch that speeds up kernel bitblitting a lot ( > 20%).
14
15+-----------+-----------------------------------------------------+
16| | not accelerated |
17| TRIDENTFB +-----------------+-----------------+-----------------+
18| of 2.6.12 | 8 bpp | 16 bpp | 32 bpp |
19| | noypan | ypan | noypan | ypan | noypan | ypan |
20+-----------+--------+--------+--------+--------+--------+--------+
21| Test 1 | 4.31 | 4.33 | 6.05 | 12.81 | ---- | ---- |
22| Test 2 | 67.94 | 5.44 | 123.16 | 14.79 | ---- | ---- |
23| Test 3 | 131.36 | 6.55 | 240.12 | 16.76 | ---- | ---- |
24+-----------+--------+--------+--------+--------+--------+--------+
25| Comments | | | completely bro- |
26| | | | ken, monitor |
27| | | | switches off |
28+-----------+-----------------+-----------------+-----------------+
29
30
31+-----------+-----------------------------------------------------+
32| | accelerated |
33| TRIDENTFB +-----------------+-----------------+-----------------+
34| of 2.6.12 | 8 bpp | 16 bpp | 32 bpp |
35| | noypan | ypan | noypan | ypan | noypan | ypan |
36+-----------+--------+--------+--------+--------+--------+--------+
37| Test 1 | ---- | ---- | 20.62 | 1.22 | ---- | ---- |
38| Test 2 | ---- | ---- | 22.61 | 3.19 | ---- | ---- |
39| Test 3 | ---- | ---- | 24.59 | 5.16 | ---- | ---- |
40+-----------+--------+--------+--------+--------+--------+--------+
41| Comments | broken, writing | broken, ok only | completely bro- |
42| | to wrong places | if bgcolor is | ken, monitor |
43| | on screen + bug | black, bug in | switches off |
44| | in fillrect() | fillrect() | |
45+-----------+-----------------+-----------------+-----------------+
46
47
48+-----------+-----------------------------------------------------+
49| | not accelerated |
50| VESAFB +-----------------+-----------------+-----------------+
51| of 2.6.12 | 8 bpp | 16 bpp | 32 bpp |
52| | noypan | ypan | noypan | ypan | noypan | ypan |
53+-----------+--------+--------+--------+--------+--------+--------+
54| Test 1 | 4.26 | 3.76 | 5.99 | 7.23 | ---- | ---- |
55| Test 2 | 65.65 | 4.89 | 120.88 | 9.08 | ---- | ---- |
56| Test 3 | 126.91 | 5.94 | 235.77 | 11.03 | ---- | ---- |
57+-----------+--------+--------+--------+--------+--------+--------+
58| Comments | vga=0x307 | vga=0x31a | vga=0x31b not |
59| | fh=80kHz | fh=80kHz | supported by |
60| | fv=75kHz | fv=75kHz | video BIOS and |
61| | | | hardware |
62+-----------+-----------------+-----------------+-----------------+
63
64
65+-----------+-----------------------------------------------------+
66| | accelerated |
67| CYBLAFB +-----------------+-----------------+-----------------+
68| | 8 bpp | 16 bpp | 32 bpp |
69| | noypan | ypan | noypan | ypan | noypan | ypan |
70+-----------+--------+--------+--------+--------+--------+--------+
71| Test 1 | 8.02 | 0.23 | 19.04 | 0.61 | 57.12 | 2.74 |
72| Test 2 | 8.38 | 0.55 | 19.39 | 0.92 | 57.54 | 3.13 |
73| Test 3 | 8.73 | 0.86 | 19.74 | 1.24 | 57.95 | 3.51 |
74+-----------+--------+--------+--------+--------+--------+--------+
75| Comments | | | |
76| | | | |
77| | | | |
78| | | | |
79+-----------+-----------------+-----------------+-----------------+
diff --git a/Documentation/fb/cyblafb/todo b/Documentation/fb/cyblafb/todo
deleted file mode 100644
index c5f6d0eae545..000000000000
--- a/Documentation/fb/cyblafb/todo
+++ /dev/null
@@ -1,31 +0,0 @@
1TODO / Missing features
2=======================
3
4Verify LCD stuff "stretch" and "center" options are
5 completely untested ... this code needs to be
6 verified. As I don't have access to such
7 hardware, please contact me if you are
8 willing run some tests.
9
10Interlaced video modes The reason that interleaved
11 modes are disabled is that I do not know
12 the meaning of the vertical interlace
13 parameter. Also the datasheet mentions a
14 bit d8 of a horizontal interlace parameter,
15 but nowhere the lower 8 bits. Please help
16 if you can.
17
18low-res double scan modes Who needs it?
19
20accelerated color blitting Who needs it? The console driver does use color
21 blitting for nothing but drawing the penguine,
22 everything else is done using color expanding
23 blitting of 1bpp character bitmaps.
24
25ioctls Who needs it?
26
27TV-out Will be done later. Use "vga= " at boot time
28 to set a suitable video mode.
29
30??? Feel free to contact me if you have any
31 feature requests
diff --git a/Documentation/fb/cyblafb/usage b/Documentation/fb/cyblafb/usage
deleted file mode 100644
index a39bb3d402a2..000000000000
--- a/Documentation/fb/cyblafb/usage
+++ /dev/null
@@ -1,217 +0,0 @@
1CyBlaFB is a framebuffer driver for the Cyberblade/i1 graphics core integrated
2into the VIA Apollo PLE133 (aka vt8601) south bridge. It is developed and
3tested using a VIA EPIA 5000 board.
4
5Cyblafb - compiled into the kernel or as a module?
6==================================================
7
8You might compile cyblafb either as a module or compile it permanently into the
9kernel.
10
11Unless you have a real reason to do so you should not compile both vesafb and
12cyblafb permanently into the kernel. It's possible and it helps during the
13developement cycle, but it's useless and will at least block some otherwise
14usefull memory for ordinary users.
15
16Selecting Modes
17===============
18
19 Startup Mode
20 ============
21
22 First of all, you might use the "vga=???" boot parameter as it is
23 documented in vesafb.txt and svga.txt. Cyblafb will detect the video
24 mode selected and will use the geometry and timings found by
25 inspecting the hardware registers.
26
27 video=cyblafb vga=0x317
28
29 Alternatively you might use a combination of the mode, ref and bpp
30 parameters. If you compiled the driver into the kernel, add something
31 like this to the kernel command line:
32
33 video=cyblafb:1280x1024,bpp=16,ref=50 ...
34
35 If you compiled the driver as a module, the same mode would be
36 selected by the following command:
37
38 modprobe cyblafb mode=1280x1024 bpp=16 ref=50 ...
39
40 None of the modes possible to select as startup modes are affected by
41 the problems described at the end of the next subsection.
42
43 For all startup modes cyblafb chooses a virtual x resolution of 2048,
44 the only exception is mode 1280x1024 in combination with 32 bpp. This
45 allows ywrap scrolling for all those modes if rotation is 0 or 2, and
46 also fast scrolling if rotation is 1 or 3. The default virtual y reso-
47 lution is 4096 for bpp == 8, 2048 for bpp==16 and 1024 for bpp == 32,
48 again with the only exception of 1280x1024 at 32 bpp.
49
50 Please do set your video memory size to 8 Mb in the Bios setup. Other
51 values will work, but performace is decreased for a lot of modes.
52
53 Mode changes using fbset
54 ========================
55
56 You might use fbset to change the video mode, see "man fbset". Cyblafb
57 generally does assume that you know what you are doing. But it does
58 some checks, especially those that are needed to prevent you from
59 damaging your hardware.
60
61 - only 8, 16, 24 and 32 bpp video modes are accepted
62 - interlaced video modes are not accepted
63 - double scan video modes are not accepted
64 - if a flat panel is found, cyblafb does not allow you
65 to program a resolution higher than the physical
66 resolution of the flat panel monitor
67 - cyblafb does not allow vclk to exceed 230 MHz. As 32 bpp
68 and (currently) 24 bit modes use a doubled vclk internally,
69 the dotclock limit as seen by fbset is 115 MHz for those
70 modes and 230 MHz for 8 and 16 bpp modes.
71 - cyblafb will allow you to select very high resolutions as
72 long as the hardware can be programmed to these modes. The
73 documented limit 1600x1200 is not enforced, but don't expect
74 perfect signal quality.
75
76 Any request that violates the rules given above will be either changed
77 to something the hardware supports or an error value will be returned.
78
79 If you program a virtual y resolution higher than the hardware limit,
80 cyblafb will silently decrease that value to the highest possible
81 value. The same is true for a virtual x resolution that is not
82 supported by the hardware. Cyblafb tries to adapt vyres first because
83 vxres decides if ywrap scrolling is possible or not.
84
85 Attempts to disable acceleration are ignored, I believe that this is
86 safe.
87
88 Some video modes that should work do not work as expected. If you use
89 the standard fb.modes, fbset 640x480-60 will program that mode, but
90 you will see a vertical area, about two characters wide, with only
91 much darker characters than the other characters on the screen.
92 Cyblafb does allow that mode to be set, as it does not violate the
93 official specifications. It would need a lot of code to reliably sort
94 out all invalid modes, playing around with the margin values will
95 give a valid mode quickly. And if cyblafb would detect such an invalid
96 mode, should it silently alter the requested values or should it
97 report an error? Both options have some pros and cons. As stated
98 above, none of the startup modes are affected, and if you set
99 verbosity to 1 or higher, cyblafb will print the fbset command that
100 would be needed to program that mode using fbset.
101
102
103Other Parameters
104================
105
106
107crt don't autodetect, assume monitor connected to
108 standard VGA connector
109
110fp don't autodetect, assume flat panel display
111 connected to flat panel monitor interface
112
113nativex inform driver about native x resolution of
114 flat panel monitor connected to special
115 interface (should be autodetected)
116
117stretch stretch image to adapt low resolution modes to
118 higer resolutions of flat panel monitors
119 connected to special interface
120
121center center image to adapt low resolution modes to
122 higer resolutions of flat panel monitors
123 connected to special interface
124
125memsize use if autodetected memsize is wrong ...
126 should never be necessary
127
128nopcirr disable PCI read retry
129nopciwr disable PCI write retry
130nopcirb disable PCI read bursts
131nopciwb disable PCI write bursts
132
133bpp bpp for specified modes
134 valid values: 8 || 16 || 24 || 32
135
136ref refresh rate for specified mode
137 valid values: 50 <= ref <= 85
138
139mode 640x480 or 800x600 or 1024x768 or 1280x1024
140 if not specified, the startup mode will be detected
141 and used, so you might also use the vga=??? parameter
142 described in vesafb.txt. If you do not specify a mode,
143 bpp and ref parameters are ignored.
144
145verbosity 0 is the default, increase to at least 2 for every
146 bug report!
147
148Development hints
149=================
150
151It's much faster do compile a module and to load the new version after
152unloading the old module than to compile a new kernel and to reboot. So if you
153try to work on cyblafb, it might be a good idea to use cyblafb as a module.
154In real life, fast often means dangerous, and that's also the case here. If
155you introduce a serious bug when cyblafb is compiled into the kernel, the
156kernel will lock or oops with a high probability before the file system is
157mounted, and the danger for your data is low. If you load a broken own version
158of cyblafb on a running system, the danger for the integrity of the file
159system is much higher as you might need a hard reset afterwards. Decide
160yourself.
161
162Module unloading, the vfb method
163================================
164
165If you want to unload/reload cyblafb using the virtual framebuffer, you need
166to enable vfb support in the kernel first. After that, load the modules as
167shown below:
168
169 modprobe vfb vfb_enable=1
170 modprobe fbcon
171 modprobe cyblafb
172 fbset -fb /dev/fb1 1280x1024-60 -vyres 2662
173 con2fb /dev/fb1 /dev/tty1
174 ...
175
176If you now made some changes to cyblafb and want to reload it, you might do it
177as show below:
178
179 con2fb /dev/fb0 /dev/tty1
180 ...
181 rmmod cyblafb
182 modprobe cyblafb
183 con2fb /dev/fb1 /dev/tty1
184 ...
185
186Of course, you might choose another mode, and most certainly you also want to
187map some other /dev/tty* to the real framebuffer device. You might also choose
188to compile fbcon as a kernel module or place it permanently in the kernel.
189
190I do not know of any way to unload fbcon, and fbcon will prevent the
191framebuffer device loaded first from unloading. [If there is a way, then
192please add a description here!]
193
194Module unloading, the vesafb method
195===================================
196
197Configure the kernel:
198
199 <*> Support for frame buffer devices
200 [*] VESA VGA graphics support
201 <M> Cyberblade/i1 support
202
203Add e.g. "video=vesafb:ypan vga=0x307" to the kernel parameters. The ypan
204parameter is important, choose any vga parameter you like as long as it is
205a graphics mode.
206
207After booting, load cyblafb without any mode and bpp parameter and assign
208cyblafb to individual ttys using con2fb, e.g.:
209
210 modprobe cyblafb
211 con2fb /dev/fb1 /dev/tty1
212
213Unloading cyblafb works without problems after you assign vesafb to all
214ttys again, e.g.:
215
216 con2fb /dev/fb0 /dev/tty1
217 rmmod cyblafb
diff --git a/Documentation/fb/cyblafb/whatsnew b/Documentation/fb/cyblafb/whatsnew
deleted file mode 100644
index 76c07a26e044..000000000000
--- a/Documentation/fb/cyblafb/whatsnew
+++ /dev/null
@@ -1,29 +0,0 @@
10.62
2====
3
4 - the vesafb parameter has been removed as I decided to allow the
5 feature without any special parameter.
6
7 - Cyblafb does not use the vga style of panning any longer, now the
8 "right view" register in the graphics engine IO space is used. Without
9 that change it was impossible to use all available memory, and without
10 access to all available memory it is impossible to ywrap.
11
12 - The imageblit function now uses hardware acceleration for all font
13 widths. Hardware blitting across pixel column 2048 is broken in the
14 cyberblade/i1 graphics core, but we work around that hardware bug.
15
16 - modes with vxres != xres are supported now.
17
18 - ywrap scrolling is supported now and the default. This is a big
19 performance gain.
20
21 - default video modes use vyres > yres and vxres > xres to allow
22 almost optimal scrolling speed for normal and rotated screens
23
24 - some features mainly usefull for debugging the upper layers of the
25 framebuffer system have been added, have a look at the code
26
27 - fixed: Oops after unloading cyblafb when reading /proc/io*
28
29 - we work around some bugs of the higher framebuffer layers.
diff --git a/Documentation/fb/cyblafb/whycyblafb b/Documentation/fb/cyblafb/whycyblafb
deleted file mode 100644
index a123bc11e698..000000000000
--- a/Documentation/fb/cyblafb/whycyblafb
+++ /dev/null
@@ -1,85 +0,0 @@
1I tried the following framebuffer drivers:
2
3 - TRIDENTFB is full of bugs. Acceleration is broken for Blade3D
4 graphics cores like the cyberblade/i1. It claims to support a great
5 number of devices, but documentation for most of these devices is
6 unfortunately not available. There is _no_ reason to use tridentfb
7 for cyberblade/i1 + CRT users. VESAFB is faster, and the one
8 advantage, mode switching, is broken in tridentfb.
9
10 - VESAFB is used by many distributions as a standard. Vesafb does
11 not support mode switching. VESAFB is a bit faster than the working
12 configurations of TRIDENTFB, but it is still too slow, even if you
13 use ypan.
14
15 - EPIAFB (you'll find it on sourceforge) supports the Cyberblade/i1
16 graphics core, but it still has serious bugs and developement seems
17 to have stopped. This is the one driver with TV-out support. If you
18 do need this feature, try epiafb.
19
20None of these drivers was a real option for me.
21
22I believe that is unreasonable to change code that announces to support 20
23devices if I only have more or less sufficient documentation for exactly one
24of these. The risk of breaking device foo while fixing device bar is too high.
25
26So I decided to start CyBlaFB as a stripped down tridentfb.
27
28All code specific to other Trident chips has been removed. After that there
29were a lot of cosmetic changes to increase the readability of the code. All
30register names were changed to those mnemonics used in the datasheet. Function
31and macro names were changed if they hindered easy understanding of the code.
32
33After that I debugged the code and implemented some new features. I'll try to
34give a little summary of the main changes:
35
36 - calculation of vertical and horizontal timings was fixed
37
38 - video signal quality has been improved dramatically
39
40 - acceleration:
41
42 - fillrect and copyarea were fixed and reenabled
43
44 - color expanding imageblit was newly implemented, color
45 imageblit (only used to draw the penguine) still uses the
46 generic code.
47
48 - init of the acceleration engine was improved and moved to a
49 place where it really works ...
50
51 - sync function has a timeout now and tries to reset and
52 reinit the accel engine if necessary
53
54 - fewer slow copyarea calls when doing ypan scrolling by using
55 undocumented bit d21 of screen start address stored in
56 CR2B[5]. BIOS does use it also, so this should be safe.
57
58 - cyblafb rejects any attempt to set modes that would cause vclk
59 values above reasonable 230 MHz. 32bit modes use a clock
60 multiplicator of 2, so fbset does show the correct values for
61 pixclock but not for vclk in this case. The fbset limit is 115 MHz
62 for 32 bpp modes.
63
64 - cyblafb rejects modes known to be broken or unimplemented (all
65 interlaced modes, all doublescan modes for now)
66
67 - cyblafb now works independant of the video mode in effect at startup
68 time (tridentfb does not init all needed registers to reasonable
69 values)
70
71 - switching between video modes does work reliably now
72
73 - the first video mode now is the one selected on startup using the
74 vga=???? mechanism or any of
75 - 640x480, 800x600, 1024x768, 1280x1024
76 - 8, 16, 24 or 32 bpp
77 - refresh between 50 Hz and 85 Hz, 1 Hz steps (1280x1024-32
78 is limited to 63Hz)
79
80 - pci retry and pci burst mode are settable (try to disable if you
81 experience latency problems)
82
83 - built as a module cyblafb might be unloaded and reloaded using
84 the vfb module and con2vt or might be used together with vesafb
85
diff --git a/Documentation/fb/uvesafb.txt b/Documentation/fb/uvesafb.txt
index 7ac3c4078ff9..eefdd91d298a 100644
--- a/Documentation/fb/uvesafb.txt
+++ b/Documentation/fb/uvesafb.txt
@@ -59,7 +59,8 @@ Accepted options:
59ypan Enable display panning using the VESA protected mode 59ypan Enable display panning using the VESA protected mode
60 interface. The visible screen is just a window of the 60 interface. The visible screen is just a window of the
61 video memory, console scrolling is done by changing the 61 video memory, console scrolling is done by changing the
62 start of the window. Available on x86 only. 62 start of the window. This option is available on x86
63 only and is the default option on that architecture.
63 64
64ywrap Same as ypan, but assumes your gfx board can wrap-around 65ywrap Same as ypan, but assumes your gfx board can wrap-around
65 the video memory (i.e. starts reading from top if it 66 the video memory (i.e. starts reading from top if it
@@ -67,7 +68,7 @@ ywrap Same as ypan, but assumes your gfx board can wrap-around
67 Available on x86 only. 68 Available on x86 only.
68 69
69redraw Scroll by redrawing the affected part of the screen, this 70redraw Scroll by redrawing the affected part of the screen, this
70 is the safe (and slow) default. 71 is the default on non-x86.
71 72
72(If you're using uvesafb as a module, the above three options are 73(If you're using uvesafb as a module, the above three options are
73 used a parameter of the scroll option, e.g. scroll=ypan.) 74 used a parameter of the scroll option, e.g. scroll=ypan.)
@@ -182,7 +183,7 @@ from the Video BIOS if you set pixclock to 0 in fb_var_screeninfo.
182 183
183-- 184--
184 Michal Januszewski <spock@gentoo.org> 185 Michal Januszewski <spock@gentoo.org>
185 Last updated: 2007-06-16 186 Last updated: 2009-03-30
186 187
187 Documentation of the uvesafb options is loosely based on vesafb.txt. 188 Documentation of the uvesafb options is loosely based on vesafb.txt.
188 189
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 5ddbe350487a..de491a3e2313 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,20 +6,47 @@ be removed from this file.
6 6
7--------------------------- 7---------------------------
8 8
9What: old static regulatory information and ieee80211_regdom module parameter 9What: The ieee80211_regdom module parameter
10When: 2.6.29 10When: March 2010 / desktop catchup
11
12Why: This was inherited by the CONFIG_WIRELESS_OLD_REGULATORY code,
13 and currently serves as an option for users to define an
14 ISO / IEC 3166 alpha2 code for the country they are currently
15 present in. Although there are userspace API replacements for this
16 through nl80211 distributions haven't yet caught up with implementing
17 decent alternatives through standard GUIs. Although available as an
18 option through iw or wpa_supplicant its just a matter of time before
19 distributions pick up good GUI options for this. The ideal solution
20 would actually consist of intelligent designs which would do this for
21 the user automatically even when travelling through different countries.
22 Until then we leave this module parameter as a compromise.
23
24 When userspace improves with reasonable widely-available alternatives for
25 this we will no longer need this module parameter. This entry hopes that
26 by the super-futuristically looking date of "March 2010" we will have
27 such replacements widely available.
28
29Who: Luis R. Rodriguez <lrodriguez@atheros.com>
30
31---------------------------
32
33What: CONFIG_WIRELESS_OLD_REGULATORY - old static regulatory information
34When: March 2010 / desktop catchup
35
11Why: The old regulatory infrastructure has been replaced with a new one 36Why: The old regulatory infrastructure has been replaced with a new one
12 which does not require statically defined regulatory domains. We do 37 which does not require statically defined regulatory domains. We do
13 not want to keep static regulatory domains in the kernel due to the 38 not want to keep static regulatory domains in the kernel due to the
14 the dynamic nature of regulatory law and localization. We kept around 39 the dynamic nature of regulatory law and localization. We kept around
15 the old static definitions for the regulatory domains of: 40 the old static definitions for the regulatory domains of:
41
16 * US 42 * US
17 * JP 43 * JP
18 * EU 44 * EU
45
19 and used by default the US when CONFIG_WIRELESS_OLD_REGULATORY was 46 and used by default the US when CONFIG_WIRELESS_OLD_REGULATORY was
20 set. We also kept around the ieee80211_regdom module parameter in case 47 set. We will remove this option once the standard Linux desktop catches
21 some applications were relying on it. Changing regulatory domains 48 up with the new userspace APIs we have implemented.
22 can now be done instead by using nl80211, as is done with iw. 49
23Who: Luis R. Rodriguez <lrodriguez@atheros.com> 50Who: Luis R. Rodriguez <lrodriguez@atheros.com>
24 51
25--------------------------- 52---------------------------
@@ -37,10 +64,10 @@ Who: Pavel Machek <pavel@suse.cz>
37 64
38--------------------------- 65---------------------------
39 66
40What: Video4Linux API 1 ioctls and video_decoder.h from Video devices. 67What: Video4Linux API 1 ioctls and from Video devices.
41When: December 2008 68When: July 2009
42Files: include/linux/video_decoder.h include/linux/videodev.h 69Files: include/linux/videodev.h
43Check: include/linux/video_decoder.h include/linux/videodev.h 70Check: include/linux/videodev.h
44Why: V4L1 AP1 was replaced by V4L2 API during migration from 2.4 to 2.6 71Why: V4L1 AP1 was replaced by V4L2 API during migration from 2.4 to 2.6
45 series. The old API have lots of drawbacks and don't provide enough 72 series. The old API have lots of drawbacks and don't provide enough
46 means to work with all video and audio standards. The newer API is 73 means to work with all video and audio standards. The newer API is
@@ -228,8 +255,20 @@ Who: Jan Engelhardt <jengelh@computergmbh.de>
228 255
229--------------------------- 256---------------------------
230 257
258What: GPIO autorequest on gpio_direction_{input,output}() in gpiolib
259When: February 2010
260Why: All callers should use explicit gpio_request()/gpio_free().
261 The autorequest mechanism in gpiolib was provided mostly as a
262 migration aid for legacy GPIO interfaces (for SOC based GPIOs).
263 Those users have now largely migrated. Platforms implementing
264 the GPIO interfaces without using gpiolib will see no changes.
265Who: David Brownell <dbrownell@users.sourceforge.net>
266---------------------------
267
231What: b43 support for firmware revision < 410 268What: b43 support for firmware revision < 410
232When: July 2008 269When: The schedule was July 2008, but it was decided that we are going to keep the
270 code as long as there are no major maintanance headaches.
271 So it _could_ be removed _any_ time now, if it conflicts with something new.
233Why: The support code for the old firmware hurts code readability/maintainability 272Why: The support code for the old firmware hurts code readability/maintainability
234 and slightly hurts runtime performance. Bugfixes for the old firmware 273 and slightly hurts runtime performance. Bugfixes for the old firmware
235 are not provided by Broadcom anymore. 274 are not provided by Broadcom anymore.
@@ -244,13 +283,6 @@ Who: Glauber Costa <gcosta@redhat.com>
244 283
245--------------------------- 284---------------------------
246 285
247What: remove HID compat support
248When: 2.6.29
249Why: needed only as a temporary solution until distros fix themselves up
250Who: Jiri Slaby <jirislaby@gmail.com>
251
252---------------------------
253
254What: print_fn_descriptor_symbol() 286What: print_fn_descriptor_symbol()
255When: October 2009 287When: October 2009
256Why: The %pF vsprintf format provides the same functionality in a 288Why: The %pF vsprintf format provides the same functionality in a
@@ -282,6 +314,18 @@ Who: Vlad Yasevich <vladislav.yasevich@hp.com>
282 314
283--------------------------- 315---------------------------
284 316
317What: Ability for non root users to shm_get hugetlb pages based on mlock
318 resource limits
319When: 2.6.31
320Why: Non root users need to be part of /proc/sys/vm/hugetlb_shm_group or
321 have CAP_IPC_LOCK to be able to allocate shm segments backed by
322 huge pages. The mlock based rlimit check to allow shm hugetlb is
323 inconsistent with mmap based allocations. Hence it is being
324 deprecated.
325Who: Ravikiran Thirumalai <kiran@scalex86.org>
326
327---------------------------
328
285What: CONFIG_THERMAL_HWMON 329What: CONFIG_THERMAL_HWMON
286When: January 2009 330When: January 2009
287Why: This option was introduced just to allow older lm-sensors userspace 331Why: This option was introduced just to allow older lm-sensors userspace
@@ -310,8 +354,10 @@ Who: Krzysztof Piotr Oledzki <ole@ans.pl>
310 354
311--------------------------- 355---------------------------
312 356
313What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client() 357What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client(),
314When: 2.6.29 (ideally) or 2.6.30 (more likely) 358 i2c_adapter->client_register(), i2c_adapter->client_unregister
359When: 2.6.30
360Check: i2c_attach_client i2c_detach_client
315Why: Deprecated by the new (standard) device driver binding model. Use 361Why: Deprecated by the new (standard) device driver binding model. Use
316 i2c_driver->probe() and ->remove() instead. 362 i2c_driver->probe() and ->remove() instead.
317Who: Jean Delvare <khali@linux-fr.org> 363Who: Jean Delvare <khali@linux-fr.org>
@@ -326,12 +372,68 @@ Who: Hans de Goede <hdegoede@redhat.com>
326 372
327--------------------------- 373---------------------------
328 374
329What: SELinux "compat_net" functionality 375What: sysfs ui for changing p4-clockmod parameters
330When: 2.6.30 at the earliest 376When: September 2009
331Why: In 2.6.18 the Secmark concept was introduced to replace the "compat_net" 377Why: See commits 129f8ae9b1b5be94517da76009ea956e89104ce8 and
332 network access control functionality of SELinux. Secmark offers both 378 e088e4c9cdb618675874becb91b2fd581ee707e6.
333 better performance and greater flexibility than the "compat_net" 379 Removal is subject to fixing any remaining bugs in ACPI which may
334 mechanism. Now that the major Linux distributions have moved to 380 cause the thermal throttling not to happen at the right time.
335 Secmark, it is time to deprecate the older mechanism and start the 381Who: Dave Jones <davej@redhat.com>, Matthew Garrett <mjg@redhat.com>
336 process of removing the old code. 382
337Who: Paul Moore <paul.moore@hp.com> 383-----------------------------
384
385What: __do_IRQ all in one fits nothing interrupt handler
386When: 2.6.32
387Why: __do_IRQ was kept for easy migration to the type flow handlers.
388 More than two years of migration time is enough.
389Who: Thomas Gleixner <tglx@linutronix.de>
390
391-----------------------------
392
393What: obsolete generic irq defines and typedefs
394When: 2.6.30
395Why: The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t)
396 have been kept around for migration reasons. After more than two years
397 it's time to remove them finally
398Who: Thomas Gleixner <tglx@linutronix.de>
399
400---------------------------
401
402What: fakephp and associated sysfs files in /sys/bus/pci/slots/
403When: 2011
404Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to
405 represent a machine's physical PCI slots. The change in semantics
406 had userspace implications, as the hotplug core no longer allowed
407 drivers to create multiple sysfs files per physical slot (required
408 for multi-function devices, e.g.). fakephp was seen as a developer's
409 tool only, and its interface changed. Too late, we learned that
410 there were some users of the fakephp interface.
411
412 In 2.6.30, the original fakephp interface was restored. At the same
413 time, the PCI core gained the ability that fakephp provided, namely
414 function-level hot-remove and hot-add.
415
416 Since the PCI core now provides the same functionality, exposed in:
417
418 /sys/bus/pci/rescan
419 /sys/bus/pci/devices/.../remove
420 /sys/bus/pci/devices/.../rescan
421
422 there is no functional reason to maintain fakephp as well.
423
424 We will keep the existing module so that 'modprobe fakephp' will
425 present the old /sys/bus/pci/slots/... interface for compatibility,
426 but users are urged to migrate their applications to the API above.
427
428 After a reasonable transition period, we will remove the legacy
429 fakephp interface.
430Who: Alex Chiang <achiang@hp.com>
431
432---------------------------
433
434What: i2c-voodoo3 driver
435When: October 2009
436Why: Superseded by tdfxfb. I2C/DDC support used to live in a separate
437 driver but this caused driver conflicts.
438Who: Jean Delvare <khali@linux-fr.org>
439 Krzysztof Helt <krzysztof.h1@wp.pl>
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 52cd611277a3..8dd6db76171d 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -68,6 +68,8 @@ ncpfs.txt
68 - info on Novell Netware(tm) filesystem using NCP protocol. 68 - info on Novell Netware(tm) filesystem using NCP protocol.
69nfsroot.txt 69nfsroot.txt
70 - short guide on setting up a diskless box with NFS root filesystem. 70 - short guide on setting up a diskless box with NFS root filesystem.
71nilfs2.txt
72 - info and mount options for the NILFS2 filesystem.
71ntfs.txt 73ntfs.txt
72 - info and mount options for the NTFS filesystem (Windows NT). 74 - info and mount options for the NTFS filesystem (Windows NT).
73ocfs2.txt 75ocfs2.txt
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index ec6a9392a173..3120f8dd2c31 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -437,8 +437,11 @@ grab BKL for cases when we close a file that had been opened r/w, but that
437can and should be done using the internal locking with smaller critical areas). 437can and should be done using the internal locking with smaller critical areas).
438Current worst offender is ext2_get_block()... 438Current worst offender is ext2_get_block()...
439 439
440->fasync() is a mess. This area needs a big cleanup and that will probably 440->fasync() is called without BKL protection, and is responsible for
441affect locking. 441maintaining the FASYNC bit in filp->f_flags. Most instances call
442fasync_helper(), which does that maintenance, so it's not normally
443something one needs to worry about. Return values > 0 will be mapped to
444zero in the VFS layer.
442 445
443->readdir() and ->ioctl() on directories must be changed. Ideally we would 446->readdir() and ->ioctl() on directories must be changed. Ideally we would
444move ->readdir() to inode_operations and use a separate method for directory 447move ->readdir() to inode_operations and use a separate method for directory
@@ -502,23 +505,31 @@ prototypes:
502 void (*open)(struct vm_area_struct*); 505 void (*open)(struct vm_area_struct*);
503 void (*close)(struct vm_area_struct*); 506 void (*close)(struct vm_area_struct*);
504 int (*fault)(struct vm_area_struct*, struct vm_fault *); 507 int (*fault)(struct vm_area_struct*, struct vm_fault *);
505 int (*page_mkwrite)(struct vm_area_struct *, struct page *); 508 int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *);
506 int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); 509 int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
507 510
508locking rules: 511locking rules:
509 BKL mmap_sem PageLocked(page) 512 BKL mmap_sem PageLocked(page)
510open: no yes 513open: no yes
511close: no yes 514close: no yes
512fault: no yes 515fault: no yes can return with page locked
513page_mkwrite: no yes no 516page_mkwrite: no yes can return with page locked
514access: no yes 517access: no yes
515 518
516 ->page_mkwrite() is called when a previously read-only page is 519 ->fault() is called when a previously not present pte is about
517about to become writeable. The file system is responsible for 520to be faulted in. The filesystem must find and return the page associated
518protecting against truncate races. Once appropriate action has been 521with the passed in "pgoff" in the vm_fault structure. If it is possible that
519taking to lock out truncate, the page range should be verified to be 522the page may be truncated and/or invalidated, then the filesystem must lock
520within i_size. The page mapping should also be checked that it is not 523the page, then ensure it is not already truncated (the page lock will block
521NULL. 524subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
525locked. The VM will unlock the page.
526
527 ->page_mkwrite() is called when a previously read-only pte is
528about to become writeable. The filesystem again must ensure that there are
529no truncate/invalidate races, and then return with the page locked. If
530the page has been truncated, the filesystem should not look up a new page
531like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
532will cause the VM to retry the fault.
522 533
523 ->access() is called when get_user_pages() fails in 534 ->access() is called when get_user_pages() fails in
524acces_process_vm(), typically used to debug a process through 535acces_process_vm(), typically used to debug a process through
diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.txt
new file mode 100644
index 000000000000..382d52cdaf2d
--- /dev/null
+++ b/Documentation/filesystems/caching/backend-api.txt
@@ -0,0 +1,658 @@
1 ==========================
2 FS-CACHE CACHE BACKEND API
3 ==========================
4
5The FS-Cache system provides an API by which actual caches can be supplied to
6FS-Cache for it to then serve out to network filesystems and other interested
7parties.
8
9This API is declared in <linux/fscache-cache.h>.
10
11
12====================================
13INITIALISING AND REGISTERING A CACHE
14====================================
15
16To start off, a cache definition must be initialised and registered for each
17cache the backend wants to make available. For instance, CacheFS does this in
18the fill_super() operation on mounting.
19
20The cache definition (struct fscache_cache) should be initialised by calling:
21
22 void fscache_init_cache(struct fscache_cache *cache,
23 struct fscache_cache_ops *ops,
24 const char *idfmt,
25 ...);
26
27Where:
28
29 (*) "cache" is a pointer to the cache definition;
30
31 (*) "ops" is a pointer to the table of operations that the backend supports on
32 this cache; and
33
34 (*) "idfmt" is a format and printf-style arguments for constructing a label
35 for the cache.
36
37
38The cache should then be registered with FS-Cache by passing a pointer to the
39previously initialised cache definition to:
40
41 int fscache_add_cache(struct fscache_cache *cache,
42 struct fscache_object *fsdef,
43 const char *tagname);
44
45Two extra arguments should also be supplied:
46
47 (*) "fsdef" which should point to the object representation for the FS-Cache
48 master index in this cache. Netfs primary index entries will be created
49 here. FS-Cache keeps the caller's reference to the index object if
50 successful and will release it upon withdrawal of the cache.
51
52 (*) "tagname" which, if given, should be a text string naming this cache. If
53 this is NULL, the identifier will be used instead. For CacheFS, the
54 identifier is set to name the underlying block device and the tag can be
55 supplied by mount.
56
57This function may return -ENOMEM if it ran out of memory or -EEXIST if the tag
58is already in use. 0 will be returned on success.
59
60
61=====================
62UNREGISTERING A CACHE
63=====================
64
65A cache can be withdrawn from the system by calling this function with a
66pointer to the cache definition:
67
68 void fscache_withdraw_cache(struct fscache_cache *cache);
69
70In CacheFS's case, this is called by put_super().
71
72
73========
74SECURITY
75========
76
77The cache methods are executed one of two contexts:
78
79 (1) that of the userspace process that issued the netfs operation that caused
80 the cache method to be invoked, or
81
82 (2) that of one of the processes in the FS-Cache thread pool.
83
84In either case, this may not be an appropriate context in which to access the
85cache.
86
87The calling process's fsuid, fsgid and SELinux security identities may need to
88be masqueraded for the duration of the cache driver's access to the cache.
89This is left to the cache to handle; FS-Cache makes no effort in this regard.
90
91
92===================================
93CONTROL AND STATISTICS PRESENTATION
94===================================
95
96The cache may present data to the outside world through FS-Cache's interfaces
97in sysfs and procfs - the former for control and the latter for statistics.
98
99A sysfs directory called /sys/fs/fscache/<cachetag>/ is created if CONFIG_SYSFS
100is enabled. This is accessible through the kobject struct fscache_cache::kobj
101and is for use by the cache as it sees fit.
102
103
104========================
105RELEVANT DATA STRUCTURES
106========================
107
108 (*) Index/Data file FS-Cache representation cookie:
109
110 struct fscache_cookie {
111 struct fscache_object_def *def;
112 struct fscache_netfs *netfs;
113 void *netfs_data;
114 ...
115 };
116
117 The fields that might be of use to the backend describe the object
118 definition, the netfs definition and the netfs's data for this cookie.
119 The object definition contain functions supplied by the netfs for loading
120 and matching index entries; these are required to provide some of the
121 cache operations.
122
123
124 (*) In-cache object representation:
125
126 struct fscache_object {
127 int debug_id;
128 enum {
129 FSCACHE_OBJECT_RECYCLING,
130 ...
131 } state;
132 spinlock_t lock
133 struct fscache_cache *cache;
134 struct fscache_cookie *cookie;
135 ...
136 };
137
138 Structures of this type should be allocated by the cache backend and
139 passed to FS-Cache when requested by the appropriate cache operation. In
140 the case of CacheFS, they're embedded in CacheFS's internal object
141 structures.
142
143 The debug_id is a simple integer that can be used in debugging messages
144 that refer to a particular object. In such a case it should be printed
145 using "OBJ%x" to be consistent with FS-Cache.
146
147 Each object contains a pointer to the cookie that represents the object it
148 is backing. An object should retired when put_object() is called if it is
149 in state FSCACHE_OBJECT_RECYCLING. The fscache_object struct should be
150 initialised by calling fscache_object_init(object).
151
152
153 (*) FS-Cache operation record:
154
155 struct fscache_operation {
156 atomic_t usage;
157 struct fscache_object *object;
158 unsigned long flags;
159 #define FSCACHE_OP_EXCLUSIVE
160 void (*processor)(struct fscache_operation *op);
161 void (*release)(struct fscache_operation *op);
162 ...
163 };
164
165 FS-Cache has a pool of threads that it uses to give CPU time to the
166 various asynchronous operations that need to be done as part of driving
167 the cache. These are represented by the above structure. The processor
168 method is called to give the op CPU time, and the release method to get
169 rid of it when its usage count reaches 0.
170
171 An operation can be made exclusive upon an object by setting the
172 appropriate flag before enqueuing it with fscache_enqueue_operation(). If
173 an operation needs more processing time, it should be enqueued again.
174
175
176 (*) FS-Cache retrieval operation record:
177
178 struct fscache_retrieval {
179 struct fscache_operation op;
180 struct address_space *mapping;
181 struct list_head *to_do;
182 ...
183 };
184
185 A structure of this type is allocated by FS-Cache to record retrieval and
186 allocation requests made by the netfs. This struct is then passed to the
187 backend to do the operation. The backend may get extra refs to it by
188 calling fscache_get_retrieval() and refs may be discarded by calling
189 fscache_put_retrieval().
190
191 A retrieval operation can be used by the backend to do retrieval work. To
192 do this, the retrieval->op.processor method pointer should be set
193 appropriately by the backend and fscache_enqueue_retrieval() called to
194 submit it to the thread pool. CacheFiles, for example, uses this to queue
195 page examination when it detects PG_lock being cleared.
196
197 The to_do field is an empty list available for the cache backend to use as
198 it sees fit.
199
200
201 (*) FS-Cache storage operation record:
202
203 struct fscache_storage {
204 struct fscache_operation op;
205 pgoff_t store_limit;
206 ...
207 };
208
209 A structure of this type is allocated by FS-Cache to record outstanding
210 writes to be made. FS-Cache itself enqueues this operation and invokes
211 the write_page() method on the object at appropriate times to effect
212 storage.
213
214
215================
216CACHE OPERATIONS
217================
218
219The cache backend provides FS-Cache with a table of operations that can be
220performed on the denizens of the cache. These are held in a structure of type:
221
222 struct fscache_cache_ops
223
224 (*) Name of cache provider [mandatory]:
225
226 const char *name
227
228 This isn't strictly an operation, but should be pointed at a string naming
229 the backend.
230
231
232 (*) Allocate a new object [mandatory]:
233
234 struct fscache_object *(*alloc_object)(struct fscache_cache *cache,
235 struct fscache_cookie *cookie)
236
237 This method is used to allocate a cache object representation to back a
238 cookie in a particular cache. fscache_object_init() should be called on
239 the object to initialise it prior to returning.
240
241 This function may also be used to parse the index key to be used for
242 multiple lookup calls to turn it into a more convenient form. FS-Cache
243 will call the lookup_complete() method to allow the cache to release the
244 form once lookup is complete or aborted.
245
246
247 (*) Look up and create object [mandatory]:
248
249 void (*lookup_object)(struct fscache_object *object)
250
251 This method is used to look up an object, given that the object is already
252 allocated and attached to the cookie. This should instantiate that object
253 in the cache if it can.
254
255 The method should call fscache_object_lookup_negative() as soon as
256 possible if it determines the object doesn't exist in the cache. If the
257 object is found to exist and the netfs indicates that it is valid then
258 fscache_obtained_object() should be called once the object is in a
259 position to have data stored in it. Similarly, fscache_obtained_object()
260 should also be called once a non-present object has been created.
261
262 If a lookup error occurs, fscache_object_lookup_error() should be called
263 to abort the lookup of that object.
264
265
266 (*) Release lookup data [mandatory]:
267
268 void (*lookup_complete)(struct fscache_object *object)
269
270 This method is called to ask the cache to release any resources it was
271 using to perform a lookup.
272
273
274 (*) Increment object refcount [mandatory]:
275
276 struct fscache_object *(*grab_object)(struct fscache_object *object)
277
278 This method is called to increment the reference count on an object. It
279 may fail (for instance if the cache is being withdrawn) by returning NULL.
280 It should return the object pointer if successful.
281
282
283 (*) Lock/Unlock object [mandatory]:
284
285 void (*lock_object)(struct fscache_object *object)
286 void (*unlock_object)(struct fscache_object *object)
287
288 These methods are used to exclusively lock an object. It must be possible
289 to schedule with the lock held, so a spinlock isn't sufficient.
290
291
292 (*) Pin/Unpin object [optional]:
293
294 int (*pin_object)(struct fscache_object *object)
295 void (*unpin_object)(struct fscache_object *object)
296
297 These methods are used to pin an object into the cache. Once pinned an
298 object cannot be reclaimed to make space. Return -ENOSPC if there's not
299 enough space in the cache to permit this.
300
301
302 (*) Update object [mandatory]:
303
304 int (*update_object)(struct fscache_object *object)
305
306 This is called to update the index entry for the specified object. The
307 new information should be in object->cookie->netfs_data. This can be
308 obtained by calling object->cookie->def->get_aux()/get_attr().
309
310
311 (*) Discard object [mandatory]:
312
313 void (*drop_object)(struct fscache_object *object)
314
315 This method is called to indicate that an object has been unbound from its
316 cookie, and that the cache should release the object's resources and
317 retire it if it's in state FSCACHE_OBJECT_RECYCLING.
318
319 This method should not attempt to release any references held by the
320 caller. The caller will invoke the put_object() method as appropriate.
321
322
323 (*) Release object reference [mandatory]:
324
325 void (*put_object)(struct fscache_object *object)
326
327 This method is used to discard a reference to an object. The object may
328 be freed when all the references to it are released.
329
330
331 (*) Synchronise a cache [mandatory]:
332
333 void (*sync)(struct fscache_cache *cache)
334
335 This is called to ask the backend to synchronise a cache with its backing
336 device.
337
338
339 (*) Dissociate a cache [mandatory]:
340
341 void (*dissociate_pages)(struct fscache_cache *cache)
342
343 This is called to ask a cache to perform any page dissociations as part of
344 cache withdrawal.
345
346
347 (*) Notification that the attributes on a netfs file changed [mandatory]:
348
349 int (*attr_changed)(struct fscache_object *object);
350
351 This is called to indicate to the cache that certain attributes on a netfs
352 file have changed (for example the maximum size a file may reach). The
353 cache can read these from the netfs by calling the cookie's get_attr()
354 method.
355
356 The cache may use the file size information to reserve space on the cache.
357 It should also call fscache_set_store_limit() to indicate to FS-Cache the
358 highest byte it's willing to store for an object.
359
360 This method may return -ve if an error occurred or the cache object cannot
361 be expanded. In such a case, the object will be withdrawn from service.
362
363 This operation is run asynchronously from FS-Cache's thread pool, and
364 storage and retrieval operations from the netfs are excluded during the
365 execution of this operation.
366
367
368 (*) Reserve cache space for an object's data [optional]:
369
370 int (*reserve_space)(struct fscache_object *object, loff_t size);
371
372 This is called to request that cache space be reserved to hold the data
373 for an object and the metadata used to track it. Zero size should be
374 taken as request to cancel a reservation.
375
376 This should return 0 if successful, -ENOSPC if there isn't enough space
377 available, or -ENOMEM or -EIO on other errors.
378
379 The reservation may exceed the current size of the object, thus permitting
380 future expansion. If the amount of space consumed by an object would
381 exceed the reservation, it's permitted to refuse requests to allocate
382 pages, but not required. An object may be pruned down to its reservation
383 size if larger than that already.
384
385
386 (*) Request page be read from cache [mandatory]:
387
388 int (*read_or_alloc_page)(struct fscache_retrieval *op,
389 struct page *page,
390 gfp_t gfp)
391
392 This is called to attempt to read a netfs page from the cache, or to
393 reserve a backing block if not. FS-Cache will have done as much checking
394 as it can before calling, but most of the work belongs to the backend.
395
396 If there's no page in the cache, then -ENODATA should be returned if the
397 backend managed to reserve a backing block; -ENOBUFS or -ENOMEM if it
398 didn't.
399
400 If there is suitable data in the cache, then a read operation should be
401 queued and 0 returned. When the read finishes, fscache_end_io() should be
402 called.
403
404 The fscache_mark_pages_cached() should be called for the page if any cache
405 metadata is retained. This will indicate to the netfs that the page needs
406 explicit uncaching. This operation takes a pagevec, thus allowing several
407 pages to be marked at once.
408
409 The retrieval record pointed to by op should be retained for each page
410 queued and released when I/O on the page has been formally ended.
411 fscache_get/put_retrieval() are available for this purpose.
412
413 The retrieval record may be used to get CPU time via the FS-Cache thread
414 pool. If this is desired, the op->op.processor should be set to point to
415 the appropriate processing routine, and fscache_enqueue_retrieval() should
416 be called at an appropriate point to request CPU time. For instance, the
417 retrieval routine could be enqueued upon the completion of a disk read.
418 The to_do field in the retrieval record is provided to aid in this.
419
420 If an I/O error occurs, fscache_io_error() should be called and -ENOBUFS
421 returned if possible or fscache_end_io() called with a suitable error
422 code..
423
424
425 (*) Request pages be read from cache [mandatory]:
426
427 int (*read_or_alloc_pages)(struct fscache_retrieval *op,
428 struct list_head *pages,
429 unsigned *nr_pages,
430 gfp_t gfp)
431
432 This is like the read_or_alloc_page() method, except it is handed a list
433 of pages instead of one page. Any pages on which a read operation is
434 started must be added to the page cache for the specified mapping and also
435 to the LRU. Such pages must also be removed from the pages list and
436 *nr_pages decremented per page.
437
438 If there was an error such as -ENOMEM, then that should be returned; else
439 if one or more pages couldn't be read or allocated, then -ENOBUFS should
440 be returned; else if one or more pages couldn't be read, then -ENODATA
441 should be returned. If all the pages are dispatched then 0 should be
442 returned.
443
444
445 (*) Request page be allocated in the cache [mandatory]:
446
447 int (*allocate_page)(struct fscache_retrieval *op,
448 struct page *page,
449 gfp_t gfp)
450
451 This is like the read_or_alloc_page() method, except that it shouldn't
452 read from the cache, even if there's data there that could be retrieved.
453 It should, however, set up any internal metadata required such that
454 the write_page() method can write to the cache.
455
456 If there's no backing block available, then -ENOBUFS should be returned
457 (or -ENOMEM if there were other problems). If a block is successfully
458 allocated, then the netfs page should be marked and 0 returned.
459
460
461 (*) Request pages be allocated in the cache [mandatory]:
462
463 int (*allocate_pages)(struct fscache_retrieval *op,
464 struct list_head *pages,
465 unsigned *nr_pages,
466 gfp_t gfp)
467
468 This is an multiple page version of the allocate_page() method. pages and
469 nr_pages should be treated as for the read_or_alloc_pages() method.
470
471
472 (*) Request page be written to cache [mandatory]:
473
474 int (*write_page)(struct fscache_storage *op,
475 struct page *page);
476
477 This is called to write from a page on which there was a previously
478 successful read_or_alloc_page() call or similar. FS-Cache filters out
479 pages that don't have mappings.
480
481 This method is called asynchronously from the FS-Cache thread pool. It is
482 not required to actually store anything, provided -ENODATA is then
483 returned to the next read of this page.
484
485 If an error occurred, then a negative error code should be returned,
486 otherwise zero should be returned. FS-Cache will take appropriate action
487 in response to an error, such as withdrawing this object.
488
489 If this method returns success then FS-Cache will inform the netfs
490 appropriately.
491
492
493 (*) Discard retained per-page metadata [mandatory]:
494
495 void (*uncache_page)(struct fscache_object *object, struct page *page)
496
497 This is called when a netfs page is being evicted from the pagecache. The
498 cache backend should tear down any internal representation or tracking it
499 maintains for this page.
500
501
502==================
503FS-CACHE UTILITIES
504==================
505
506FS-Cache provides some utilities that a cache backend may make use of:
507
508 (*) Note occurrence of an I/O error in a cache:
509
510 void fscache_io_error(struct fscache_cache *cache)
511
512 This tells FS-Cache that an I/O error occurred in the cache. After this
513 has been called, only resource dissociation operations (object and page
514 release) will be passed from the netfs to the cache backend for the
515 specified cache.
516
517 This does not actually withdraw the cache. That must be done separately.
518
519
520 (*) Invoke the retrieval I/O completion function:
521
522 void fscache_end_io(struct fscache_retrieval *op, struct page *page,
523 int error);
524
525 This is called to note the end of an attempt to retrieve a page. The
526 error value should be 0 if successful and an error otherwise.
527
528
529 (*) Set highest store limit:
530
531 void fscache_set_store_limit(struct fscache_object *object,
532 loff_t i_size);
533
534 This sets the limit FS-Cache imposes on the highest byte it's willing to
535 try and store for a netfs. Any page over this limit is automatically
536 rejected by fscache_read_alloc_page() and co with -ENOBUFS.
537
538
539 (*) Mark pages as being cached:
540
541 void fscache_mark_pages_cached(struct fscache_retrieval *op,
542 struct pagevec *pagevec);
543
544 This marks a set of pages as being cached. After this has been called,
545 the netfs must call fscache_uncache_page() to unmark the pages.
546
547
548 (*) Perform coherency check on an object:
549
550 enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
551 const void *data,
552 uint16_t datalen);
553
554 This asks the netfs to perform a coherency check on an object that has
555 just been looked up. The cookie attached to the object will determine the
556 netfs to use. data and datalen should specify where the auxiliary data
557 retrieved from the cache can be found.
558
559 One of three values will be returned:
560
561 (*) FSCACHE_CHECKAUX_OKAY
562
563 The coherency data indicates the object is valid as is.
564
565 (*) FSCACHE_CHECKAUX_NEEDS_UPDATE
566
567 The coherency data needs updating, but otherwise the object is
568 valid.
569
570 (*) FSCACHE_CHECKAUX_OBSOLETE
571
572 The coherency data indicates that the object is obsolete and should
573 be discarded.
574
575
576 (*) Initialise a freshly allocated object:
577
578 void fscache_object_init(struct fscache_object *object);
579
580 This initialises all the fields in an object representation.
581
582
583 (*) Indicate the destruction of an object:
584
585 void fscache_object_destroyed(struct fscache_cache *cache);
586
587 This must be called to inform FS-Cache that an object that belonged to a
588 cache has been destroyed and deallocated. This will allow continuation
589 of the cache withdrawal process when it is stopped pending destruction of
590 all the objects.
591
592
593 (*) Indicate negative lookup on an object:
594
595 void fscache_object_lookup_negative(struct fscache_object *object);
596
597 This is called to indicate to FS-Cache that a lookup process for an object
598 found a negative result.
599
600 This changes the state of an object to permit reads pending on lookup
601 completion to go off and start fetching data from the netfs server as it's
602 known at this point that there can't be any data in the cache.
603
604 This may be called multiple times on an object. Only the first call is
605 significant - all subsequent calls are ignored.
606
607
608 (*) Indicate an object has been obtained:
609
610 void fscache_obtained_object(struct fscache_object *object);
611
612 This is called to indicate to FS-Cache that a lookup process for an object
613 produced a positive result, or that an object was created. This should
614 only be called once for any particular object.
615
616 This changes the state of an object to indicate:
617
618 (1) if no call to fscache_object_lookup_negative() has been made on
619 this object, that there may be data available, and that reads can
620 now go and look for it; and
621
622 (2) that writes may now proceed against this object.
623
624
625 (*) Indicate that object lookup failed:
626
627 void fscache_object_lookup_error(struct fscache_object *object);
628
629 This marks an object as having encountered a fatal error (usually EIO)
630 and causes it to move into a state whereby it will be withdrawn as soon
631 as possible.
632
633
634 (*) Get and release references on a retrieval record:
635
636 void fscache_get_retrieval(struct fscache_retrieval *op);
637 void fscache_put_retrieval(struct fscache_retrieval *op);
638
639 These two functions are used to retain a retrieval record whilst doing
640 asynchronous data retrieval and block allocation.
641
642
643 (*) Enqueue a retrieval record for processing.
644
645 void fscache_enqueue_retrieval(struct fscache_retrieval *op);
646
647 This enqueues a retrieval record for processing by the FS-Cache thread
648 pool. One of the threads in the pool will invoke the retrieval record's
649 op->op.processor callback function. This function may be called from
650 within the callback function.
651
652
653 (*) List of object state names:
654
655 const char *fscache_object_states[];
656
657 For debugging purposes, this may be used to turn the state that an object
658 is in into a text string for display purposes.
diff --git a/Documentation/filesystems/caching/cachefiles.txt b/Documentation/filesystems/caching/cachefiles.txt
new file mode 100644
index 000000000000..748a1ae49e12
--- /dev/null
+++ b/Documentation/filesystems/caching/cachefiles.txt
@@ -0,0 +1,501 @@
1 ===============================================
2 CacheFiles: CACHE ON ALREADY MOUNTED FILESYSTEM
3 ===============================================
4
5Contents:
6
7 (*) Overview.
8
9 (*) Requirements.
10
11 (*) Configuration.
12
13 (*) Starting the cache.
14
15 (*) Things to avoid.
16
17 (*) Cache culling.
18
19 (*) Cache structure.
20
21 (*) Security model and SELinux.
22
23 (*) A note on security.
24
25 (*) Statistical information.
26
27 (*) Debugging.
28
29
30========
31OVERVIEW
32========
33
34CacheFiles is a caching backend that's meant to use as a cache a directory on
35an already mounted filesystem of a local type (such as Ext3).
36
37CacheFiles uses a userspace daemon to do some of the cache management - such as
38reaping stale nodes and culling. This is called cachefilesd and lives in
39/sbin.
40
41The filesystem and data integrity of the cache are only as good as those of the
42filesystem providing the backing services. Note that CacheFiles does not
43attempt to journal anything since the journalling interfaces of the various
44filesystems are very specific in nature.
45
46CacheFiles creates a misc character device - "/dev/cachefiles" - that is used
47to communication with the daemon. Only one thing may have this open at once,
48and whilst it is open, a cache is at least partially in existence. The daemon
49opens this and sends commands down it to control the cache.
50
51CacheFiles is currently limited to a single cache.
52
53CacheFiles attempts to maintain at least a certain percentage of free space on
54the filesystem, shrinking the cache by culling the objects it contains to make
55space if necessary - see the "Cache Culling" section. This means it can be
56placed on the same medium as a live set of data, and will expand to make use of
57spare space and automatically contract when the set of data requires more
58space.
59
60
61============
62REQUIREMENTS
63============
64
65The use of CacheFiles and its daemon requires the following features to be
66available in the system and in the cache filesystem:
67
68 - dnotify.
69
70 - extended attributes (xattrs).
71
72 - openat() and friends.
73
74 - bmap() support on files in the filesystem (FIBMAP ioctl).
75
76 - The use of bmap() to detect a partial page at the end of the file.
77
78It is strongly recommended that the "dir_index" option is enabled on Ext3
79filesystems being used as a cache.
80
81
82=============
83CONFIGURATION
84=============
85
86The cache is configured by a script in /etc/cachefilesd.conf. These commands
87set up cache ready for use. The following script commands are available:
88
89 (*) brun <N>%
90 (*) bcull <N>%
91 (*) bstop <N>%
92 (*) frun <N>%
93 (*) fcull <N>%
94 (*) fstop <N>%
95
96 Configure the culling limits. Optional. See the section on culling
97 The defaults are 7% (run), 5% (cull) and 1% (stop) respectively.
98
99 The commands beginning with a 'b' are file space (block) limits, those
100 beginning with an 'f' are file count limits.
101
102 (*) dir <path>
103
104 Specify the directory containing the root of the cache. Mandatory.
105
106 (*) tag <name>
107
108 Specify a tag to FS-Cache to use in distinguishing multiple caches.
109 Optional. The default is "CacheFiles".
110
111 (*) debug <mask>
112
113 Specify a numeric bitmask to control debugging in the kernel module.
114 Optional. The default is zero (all off). The following values can be
115 OR'd into the mask to collect various information:
116
117 1 Turn on trace of function entry (_enter() macros)
118 2 Turn on trace of function exit (_leave() macros)
119 4 Turn on trace of internal debug points (_debug())
120
121 This mask can also be set through sysfs, eg:
122
123 echo 5 >/sys/modules/cachefiles/parameters/debug
124
125
126==================
127STARTING THE CACHE
128==================
129
130The cache is started by running the daemon. The daemon opens the cache device,
131configures the cache and tells it to begin caching. At that point the cache
132binds to fscache and the cache becomes live.
133
134The daemon is run as follows:
135
136 /sbin/cachefilesd [-d]* [-s] [-n] [-f <configfile>]
137
138The flags are:
139
140 (*) -d
141
142 Increase the debugging level. This can be specified multiple times and
143 is cumulative with itself.
144
145 (*) -s
146
147 Send messages to stderr instead of syslog.
148
149 (*) -n
150
151 Don't daemonise and go into background.
152
153 (*) -f <configfile>
154
155 Use an alternative configuration file rather than the default one.
156
157
158===============
159THINGS TO AVOID
160===============
161
162Do not mount other things within the cache as this will cause problems. The
163kernel module contains its own very cut-down path walking facility that ignores
164mountpoints, but the daemon can't avoid them.
165
166Do not create, rename or unlink files and directories in the cache whilst the
167cache is active, as this may cause the state to become uncertain.
168
169Renaming files in the cache might make objects appear to be other objects (the
170filename is part of the lookup key).
171
172Do not change or remove the extended attributes attached to cache files by the
173cache as this will cause the cache state management to get confused.
174
175Do not create files or directories in the cache, lest the cache get confused or
176serve incorrect data.
177
178Do not chmod files in the cache. The module creates things with minimal
179permissions to prevent random users being able to access them directly.
180
181
182=============
183CACHE CULLING
184=============
185
186The cache may need culling occasionally to make space. This involves
187discarding objects from the cache that have been used less recently than
188anything else. Culling is based on the access time of data objects. Empty
189directories are culled if not in use.
190
191Cache culling is done on the basis of the percentage of blocks and the
192percentage of files available in the underlying filesystem. There are six
193"limits":
194
195 (*) brun
196 (*) frun
197
198 If the amount of free space and the number of available files in the cache
199 rises above both these limits, then culling is turned off.
200
201 (*) bcull
202 (*) fcull
203
204 If the amount of available space or the number of available files in the
205 cache falls below either of these limits, then culling is started.
206
207 (*) bstop
208 (*) fstop
209
210 If the amount of available space or the number of available files in the
211 cache falls below either of these limits, then no further allocation of
212 disk space or files is permitted until culling has raised things above
213 these limits again.
214
215These must be configured thusly:
216
217 0 <= bstop < bcull < brun < 100
218 0 <= fstop < fcull < frun < 100
219
220Note that these are percentages of available space and available files, and do
221_not_ appear as 100 minus the percentage displayed by the "df" program.
222
223The userspace daemon scans the cache to build up a table of cullable objects.
224These are then culled in least recently used order. A new scan of the cache is
225started as soon as space is made in the table. Objects will be skipped if
226their atimes have changed or if the kernel module says it is still using them.
227
228
229===============
230CACHE STRUCTURE
231===============
232
233The CacheFiles module will create two directories in the directory it was
234given:
235
236 (*) cache/
237
238 (*) graveyard/
239
240The active cache objects all reside in the first directory. The CacheFiles
241kernel module moves any retired or culled objects that it can't simply unlink
242to the graveyard from which the daemon will actually delete them.
243
244The daemon uses dnotify to monitor the graveyard directory, and will delete
245anything that appears therein.
246
247
248The module represents index objects as directories with the filename "I..." or
249"J...". Note that the "cache/" directory is itself a special index.
250
251Data objects are represented as files if they have no children, or directories
252if they do. Their filenames all begin "D..." or "E...". If represented as a
253directory, data objects will have a file in the directory called "data" that
254actually holds the data.
255
256Special objects are similar to data objects, except their filenames begin
257"S..." or "T...".
258
259
260If an object has children, then it will be represented as a directory.
261Immediately in the representative directory are a collection of directories
262named for hash values of the child object keys with an '@' prepended. Into
263this directory, if possible, will be placed the representations of the child
264objects:
265
266 INDEX INDEX INDEX DATA FILES
267 ========= ========== ================================= ================
268 cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400
269 cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...DB1ry
270 cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...N22ry
271 cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...FP1ry
272
273
274If the key is so long that it exceeds NAME_MAX with the decorations added on to
275it, then it will be cut into pieces, the first few of which will be used to
276make a nest of directories, and the last one of which will be the objects
277inside the last directory. The names of the intermediate directories will have
278'+' prepended:
279
280 J1223/@23/+xy...z/+kl...m/Epqr
281
282
283Note that keys are raw data, and not only may they exceed NAME_MAX in size,
284they may also contain things like '/' and NUL characters, and so they may not
285be suitable for turning directly into a filename.
286
287To handle this, CacheFiles will use a suitably printable filename directly and
288"base-64" encode ones that aren't directly suitable. The two versions of
289object filenames indicate the encoding:
290
291 OBJECT TYPE PRINTABLE ENCODED
292 =============== =============== ===============
293 Index "I..." "J..."
294 Data "D..." "E..."
295 Special "S..." "T..."
296
297Intermediate directories are always "@" or "+" as appropriate.
298
299
300Each object in the cache has an extended attribute label that holds the object
301type ID (required to distinguish special objects) and the auxiliary data from
302the netfs. The latter is used to detect stale objects in the cache and update
303or retire them.
304
305
306Note that CacheFiles will erase from the cache any file it doesn't recognise or
307any file of an incorrect type (such as a FIFO file or a device file).
308
309
310==========================
311SECURITY MODEL AND SELINUX
312==========================
313
314CacheFiles is implemented to deal properly with the LSM security features of
315the Linux kernel and the SELinux facility.
316
317One of the problems that CacheFiles faces is that it is generally acting on
318behalf of a process, and running in that process's context, and that includes a
319security context that is not appropriate for accessing the cache - either
320because the files in the cache are inaccessible to that process, or because if
321the process creates a file in the cache, that file may be inaccessible to other
322processes.
323
324The way CacheFiles works is to temporarily change the security context (fsuid,
325fsgid and actor security label) that the process acts as - without changing the
326security context of the process when it the target of an operation performed by
327some other process (so signalling and suchlike still work correctly).
328
329
330When the CacheFiles module is asked to bind to its cache, it:
331
332 (1) Finds the security label attached to the root cache directory and uses
333 that as the security label with which it will create files. By default,
334 this is:
335
336 cachefiles_var_t
337
338 (2) Finds the security label of the process which issued the bind request
339 (presumed to be the cachefilesd daemon), which by default will be:
340
341 cachefilesd_t
342
343 and asks LSM to supply a security ID as which it should act given the
344 daemon's label. By default, this will be:
345
346 cachefiles_kernel_t
347
348 SELinux transitions the daemon's security ID to the module's security ID
349 based on a rule of this form in the policy.
350
351 type_transition <daemon's-ID> kernel_t : process <module's-ID>;
352
353 For instance:
354
355 type_transition cachefilesd_t kernel_t : process cachefiles_kernel_t;
356
357
358The module's security ID gives it permission to create, move and remove files
359and directories in the cache, to find and access directories and files in the
360cache, to set and access extended attributes on cache objects, and to read and
361write files in the cache.
362
363The daemon's security ID gives it only a very restricted set of permissions: it
364may scan directories, stat files and erase files and directories. It may
365not read or write files in the cache, and so it is precluded from accessing the
366data cached therein; nor is it permitted to create new files in the cache.
367
368
369There are policy source files available in:
370
371 http://people.redhat.com/~dhowells/fscache/cachefilesd-0.8.tar.bz2
372
373and later versions. In that tarball, see the files:
374
375 cachefilesd.te
376 cachefilesd.fc
377 cachefilesd.if
378
379They are built and installed directly by the RPM.
380
381If a non-RPM based system is being used, then copy the above files to their own
382directory and run:
383
384 make -f /usr/share/selinux/devel/Makefile
385 semodule -i cachefilesd.pp
386
387You will need checkpolicy and selinux-policy-devel installed prior to the
388build.
389
390
391By default, the cache is located in /var/fscache, but if it is desirable that
392it should be elsewhere, than either the above policy files must be altered, or
393an auxiliary policy must be installed to label the alternate location of the
394cache.
395
396For instructions on how to add an auxiliary policy to enable the cache to be
397located elsewhere when SELinux is in enforcing mode, please see:
398
399 /usr/share/doc/cachefilesd-*/move-cache.txt
400
401When the cachefilesd rpm is installed; alternatively, the document can be found
402in the sources.
403
404
405==================
406A NOTE ON SECURITY
407==================
408
409CacheFiles makes use of the split security in the task_struct. It allocates
410its own task_security structure, and redirects current->cred to point to it
411when it acts on behalf of another process, in that process's context.
412
413The reason it does this is that it calls vfs_mkdir() and suchlike rather than
414bypassing security and calling inode ops directly. Therefore the VFS and LSM
415may deny the CacheFiles access to the cache data because under some
416circumstances the caching code is running in the security context of whatever
417process issued the original syscall on the netfs.
418
419Furthermore, should CacheFiles create a file or directory, the security
420parameters with that object is created (UID, GID, security label) would be
421derived from that process that issued the system call, thus potentially
422preventing other processes from accessing the cache - including CacheFiles's
423cache management daemon (cachefilesd).
424
425What is required is to temporarily override the security of the process that
426issued the system call. We can't, however, just do an in-place change of the
427security data as that affects the process as an object, not just as a subject.
428This means it may lose signals or ptrace events for example, and affects what
429the process looks like in /proc.
430
431So CacheFiles makes use of a logical split in the security between the
432objective security (task->real_cred) and the subjective security (task->cred).
433The objective security holds the intrinsic security properties of a process and
434is never overridden. This is what appears in /proc, and is what is used when a
435process is the target of an operation by some other process (SIGKILL for
436example).
437
438The subjective security holds the active security properties of a process, and
439may be overridden. This is not seen externally, and is used whan a process
440acts upon another object, for example SIGKILLing another process or opening a
441file.
442
443LSM hooks exist that allow SELinux (or Smack or whatever) to reject a request
444for CacheFiles to run in a context of a specific security label, or to create
445files and directories with another security label.
446
447
448=======================
449STATISTICAL INFORMATION
450=======================
451
452If FS-Cache is compiled with the following option enabled:
453
454 CONFIG_CACHEFILES_HISTOGRAM=y
455
456then it will gather certain statistics and display them through a proc file.
457
458 (*) /proc/fs/cachefiles/histogram
459
460 cat /proc/fs/cachefiles/histogram
461 JIFS SECS LOOKUPS MKDIRS CREATES
462 ===== ===== ========= ========= =========
463
464 This shows the breakdown of the number of times each amount of time
465 between 0 jiffies and HZ-1 jiffies a variety of tasks took to run. The
466 columns are as follows:
467
468 COLUMN TIME MEASUREMENT
469 ======= =======================================================
470 LOOKUPS Length of time to perform a lookup on the backing fs
471 MKDIRS Length of time to perform a mkdir on the backing fs
472 CREATES Length of time to perform a create on the backing fs
473
474 Each row shows the number of events that took a particular range of times.
475 Each step is 1 jiffy in size. The JIFS column indicates the particular
476 jiffy range covered, and the SECS field the equivalent number of seconds.
477
478
479=========
480DEBUGGING
481=========
482
483If CONFIG_CACHEFILES_DEBUG is enabled, the CacheFiles facility can have runtime
484debugging enabled by adjusting the value in:
485
486 /sys/module/cachefiles/parameters/debug
487
488This is a bitmask of debugging streams to enable:
489
490 BIT VALUE STREAM POINT
491 ======= ======= =============================== =======================
492 0 1 General Function entry trace
493 1 2 Function exit trace
494 2 4 General
495
496The appropriate set of values should be OR'd together and the result written to
497the control file. For example:
498
499 echo $((1|4|8)) >/sys/module/cachefiles/parameters/debug
500
501will turn on all function entry debugging.
diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt
new file mode 100644
index 000000000000..9e94b9491d89
--- /dev/null
+++ b/Documentation/filesystems/caching/fscache.txt
@@ -0,0 +1,333 @@
1 ==========================
2 General Filesystem Caching
3 ==========================
4
5========
6OVERVIEW
7========
8
9This facility is a general purpose cache for network filesystems, though it
10could be used for caching other things such as ISO9660 filesystems too.
11
12FS-Cache mediates between cache backends (such as CacheFS) and network
13filesystems:
14
15 +---------+
16 | | +--------------+
17 | NFS |--+ | |
18 | | | +-->| CacheFS |
19 +---------+ | +----------+ | | /dev/hda5 |
20 | | | | +--------------+
21 +---------+ +-->| | |
22 | | | |--+
23 | AFS |----->| FS-Cache |
24 | | | |--+
25 +---------+ +-->| | |
26 | | | | +--------------+
27 +---------+ | +----------+ | | |
28 | | | +-->| CacheFiles |
29 | ISOFS |--+ | /var/cache |
30 | | +--------------+
31 +---------+
32
33Or to look at it another way, FS-Cache is a module that provides a caching
34facility to a network filesystem such that the cache is transparent to the
35user:
36
37 +---------+
38 | |
39 | Server |
40 | |
41 +---------+
42 | NETWORK
43 ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
44 |
45 | +----------+
46 V | |
47 +---------+ | |
48 | | | |
49 | NFS |----->| FS-Cache |
50 | | | |--+
51 +---------+ | | | +--------------+ +--------------+
52 | | | | | | | |
53 V +----------+ +-->| CacheFiles |-->| Ext3 |
54 +---------+ | /var/cache | | /dev/sda6 |
55 | | +--------------+ +--------------+
56 | VFS | ^ ^
57 | | | |
58 +---------+ +--------------+ |
59 | KERNEL SPACE | |
60 ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|~~~~~~|~~~~
61 | USER SPACE | |
62 V | |
63 +---------+ +--------------+
64 | | | |
65 | Process | | cachefilesd |
66 | | | |
67 +---------+ +--------------+
68
69
70FS-Cache does not follow the idea of completely loading every netfs file
71opened in its entirety into a cache before permitting it to be accessed and
72then serving the pages out of that cache rather than the netfs inode because:
73
74 (1) It must be practical to operate without a cache.
75
76 (2) The size of any accessible file must not be limited to the size of the
77 cache.
78
79 (3) The combined size of all opened files (this includes mapped libraries)
80 must not be limited to the size of the cache.
81
82 (4) The user should not be forced to download an entire file just to do a
83 one-off access of a small portion of it (such as might be done with the
84 "file" program).
85
86It instead serves the cache out in PAGE_SIZE chunks as and when requested by
87the netfs('s) using it.
88
89
90FS-Cache provides the following facilities:
91
92 (1) More than one cache can be used at once. Caches can be selected
93 explicitly by use of tags.
94
95 (2) Caches can be added / removed at any time.
96
97 (3) The netfs is provided with an interface that allows either party to
98 withdraw caching facilities from a file (required for (2)).
99
100 (4) The interface to the netfs returns as few errors as possible, preferring
101 rather to let the netfs remain oblivious.
102
103 (5) Cookies are used to represent indices, files and other objects to the
104 netfs. The simplest cookie is just a NULL pointer - indicating nothing
105 cached there.
106
107 (6) The netfs is allowed to propose - dynamically - any index hierarchy it
108 desires, though it must be aware that the index search function is
109 recursive, stack space is limited, and indices can only be children of
110 indices.
111
112 (7) Data I/O is done direct to and from the netfs's pages. The netfs
113 indicates that page A is at index B of the data-file represented by cookie
114 C, and that it should be read or written. The cache backend may or may
115 not start I/O on that page, but if it does, a netfs callback will be
116 invoked to indicate completion. The I/O may be either synchronous or
117 asynchronous.
118
119 (8) Cookies can be "retired" upon release. At this point FS-Cache will mark
120 them as obsolete and the index hierarchy rooted at that point will get
121 recycled.
122
123 (9) The netfs provides a "match" function for index searches. In addition to
124 saying whether a match was made or not, this can also specify that an
125 entry should be updated or deleted.
126
127(10) As much as possible is done asynchronously.
128
129
130FS-Cache maintains a virtual indexing tree in which all indices, files, objects
131and pages are kept. Bits of this tree may actually reside in one or more
132caches.
133
134 FSDEF
135 |
136 +------------------------------------+
137 | |
138 NFS AFS
139 | |
140 +--------------------------+ +-----------+
141 | | | |
142 homedir mirror afs.org redhat.com
143 | | |
144 +------------+ +---------------+ +----------+
145 | | | | | |
146 00001 00002 00007 00125 vol00001 vol00002
147 | | | | |
148 +---+---+ +-----+ +---+ +------+------+ +-----+----+
149 | | | | | | | | | | | | |
150PG0 PG1 PG2 PG0 XATTR PG0 PG1 DIRENT DIRENT DIRENT R/W R/O Bak
151 | |
152 PG0 +-------+
153 | |
154 00001 00003
155 |
156 +---+---+
157 | | |
158 PG0 PG1 PG2
159
160In the example above, you can see two netfs's being backed: NFS and AFS. These
161have different index hierarchies:
162
163 (*) The NFS primary index contains per-server indices. Each server index is
164 indexed by NFS file handles to get data file objects. Each data file
165 objects can have an array of pages, but may also have further child
166 objects, such as extended attributes and directory entries. Extended
167 attribute objects themselves have page-array contents.
168
169 (*) The AFS primary index contains per-cell indices. Each cell index contains
170 per-logical-volume indices. Each of volume index contains up to three
171 indices for the read-write, read-only and backup mirrors of those volumes.
172 Each of these contains vnode data file objects, each of which contains an
173 array of pages.
174
175The very top index is the FS-Cache master index in which individual netfs's
176have entries.
177
178Any index object may reside in more than one cache, provided it only has index
179children. Any index with non-index object children will be assumed to only
180reside in one cache.
181
182
183The netfs API to FS-Cache can be found in:
184
185 Documentation/filesystems/caching/netfs-api.txt
186
187The cache backend API to FS-Cache can be found in:
188
189 Documentation/filesystems/caching/backend-api.txt
190
191A description of the internal representations and object state machine can be
192found in:
193
194 Documentation/filesystems/caching/object.txt
195
196
197=======================
198STATISTICAL INFORMATION
199=======================
200
201If FS-Cache is compiled with the following options enabled:
202
203 CONFIG_FSCACHE_STATS=y
204 CONFIG_FSCACHE_HISTOGRAM=y
205
206then it will gather certain statistics and display them through a number of
207proc files.
208
209 (*) /proc/fs/fscache/stats
210
211 This shows counts of a number of events that can happen in FS-Cache:
212
213 CLASS EVENT MEANING
214 ======= ======= =======================================================
215 Cookies idx=N Number of index cookies allocated
216 dat=N Number of data storage cookies allocated
217 spc=N Number of special cookies allocated
218 Objects alc=N Number of objects allocated
219 nal=N Number of object allocation failures
220 avl=N Number of objects that reached the available state
221 ded=N Number of objects that reached the dead state
222 ChkAux non=N Number of objects that didn't have a coherency check
223 ok=N Number of objects that passed a coherency check
224 upd=N Number of objects that needed a coherency data update
225 obs=N Number of objects that were declared obsolete
226 Pages mrk=N Number of pages marked as being cached
227 unc=N Number of uncache page requests seen
228 Acquire n=N Number of acquire cookie requests seen
229 nul=N Number of acq reqs given a NULL parent
230 noc=N Number of acq reqs rejected due to no cache available
231 ok=N Number of acq reqs succeeded
232 nbf=N Number of acq reqs rejected due to error
233 oom=N Number of acq reqs failed on ENOMEM
234 Lookups n=N Number of lookup calls made on cache backends
235 neg=N Number of negative lookups made
236 pos=N Number of positive lookups made
237 crt=N Number of objects created by lookup
238 Updates n=N Number of update cookie requests seen
239 nul=N Number of upd reqs given a NULL parent
240 run=N Number of upd reqs granted CPU time
241 Relinqs n=N Number of relinquish cookie requests seen
242 nul=N Number of rlq reqs given a NULL parent
243 wcr=N Number of rlq reqs waited on completion of creation
244 AttrChg n=N Number of attribute changed requests seen
245 ok=N Number of attr changed requests queued
246 nbf=N Number of attr changed rejected -ENOBUFS
247 oom=N Number of attr changed failed -ENOMEM
248 run=N Number of attr changed ops given CPU time
249 Allocs n=N Number of allocation requests seen
250 ok=N Number of successful alloc reqs
251 wt=N Number of alloc reqs that waited on lookup completion
252 nbf=N Number of alloc reqs rejected -ENOBUFS
253 ops=N Number of alloc reqs submitted
254 owt=N Number of alloc reqs waited for CPU time
255 Retrvls n=N Number of retrieval (read) requests seen
256 ok=N Number of successful retr reqs
257 wt=N Number of retr reqs that waited on lookup completion
258 nod=N Number of retr reqs returned -ENODATA
259 nbf=N Number of retr reqs rejected -ENOBUFS
260 int=N Number of retr reqs aborted -ERESTARTSYS
261 oom=N Number of retr reqs failed -ENOMEM
262 ops=N Number of retr reqs submitted
263 owt=N Number of retr reqs waited for CPU time
264 Stores n=N Number of storage (write) requests seen
265 ok=N Number of successful store reqs
266 agn=N Number of store reqs on a page already pending storage
267 nbf=N Number of store reqs rejected -ENOBUFS
268 oom=N Number of store reqs failed -ENOMEM
269 ops=N Number of store reqs submitted
270 run=N Number of store reqs granted CPU time
271 Ops pend=N Number of times async ops added to pending queues
272 run=N Number of times async ops given CPU time
273 enq=N Number of times async ops queued for processing
274 dfr=N Number of async ops queued for deferred release
275 rel=N Number of async ops released
276 gc=N Number of deferred-release async ops garbage collected
277
278
279 (*) /proc/fs/fscache/histogram
280
281 cat /proc/fs/fscache/histogram
282 JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS
283 ===== ===== ========= ========= ========= ========= =========
284
285 This shows the breakdown of the number of times each amount of time
286 between 0 jiffies and HZ-1 jiffies a variety of tasks took to run. The
287 columns are as follows:
288
289 COLUMN TIME MEASUREMENT
290 ======= =======================================================
291 OBJ INST Length of time to instantiate an object
292 OP RUNS Length of time a call to process an operation took
293 OBJ RUNS Length of time a call to process an object event took
294 RETRV DLY Time between an requesting a read and lookup completing
295 RETRIEVLS Time between beginning and end of a retrieval
296
297 Each row shows the number of events that took a particular range of times.
298 Each step is 1 jiffy in size. The JIFS column indicates the particular
299 jiffy range covered, and the SECS field the equivalent number of seconds.
300
301
302=========
303DEBUGGING
304=========
305
306If CONFIG_FSCACHE_DEBUG is enabled, the FS-Cache facility can have runtime
307debugging enabled by adjusting the value in:
308
309 /sys/module/fscache/parameters/debug
310
311This is a bitmask of debugging streams to enable:
312
313 BIT VALUE STREAM POINT
314 ======= ======= =============================== =======================
315 0 1 Cache management Function entry trace
316 1 2 Function exit trace
317 2 4 General
318 3 8 Cookie management Function entry trace
319 4 16 Function exit trace
320 5 32 General
321 6 64 Page handling Function entry trace
322 7 128 Function exit trace
323 8 256 General
324 9 512 Operation management Function entry trace
325 10 1024 Function exit trace
326 11 2048 General
327
328The appropriate set of values should be OR'd together and the result written to
329the control file. For example:
330
331 echo $((1|8|64)) >/sys/module/fscache/parameters/debug
332
333will turn on all function entry debugging.
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
new file mode 100644
index 000000000000..4db125b3a5c6
--- /dev/null
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -0,0 +1,778 @@
1 ===============================
2 FS-CACHE NETWORK FILESYSTEM API
3 ===============================
4
5There's an API by which a network filesystem can make use of the FS-Cache
6facilities. This is based around a number of principles:
7
8 (1) Caches can store a number of different object types. There are two main
9 object types: indices and files. The first is a special type used by
10 FS-Cache to make finding objects faster and to make retiring of groups of
11 objects easier.
12
13 (2) Every index, file or other object is represented by a cookie. This cookie
14 may or may not have anything associated with it, but the netfs doesn't
15 need to care.
16
17 (3) Barring the top-level index (one entry per cached netfs), the index
18 hierarchy for each netfs is structured according the whim of the netfs.
19
20This API is declared in <linux/fscache.h>.
21
22This document contains the following sections:
23
24 (1) Network filesystem definition
25 (2) Index definition
26 (3) Object definition
27 (4) Network filesystem (un)registration
28 (5) Cache tag lookup
29 (6) Index registration
30 (7) Data file registration
31 (8) Miscellaneous object registration
32 (9) Setting the data file size
33 (10) Page alloc/read/write
34 (11) Page uncaching
35 (12) Index and data file update
36 (13) Miscellaneous cookie operations
37 (14) Cookie unregistration
38 (15) Index and data file invalidation
39 (16) FS-Cache specific page flags.
40
41
42=============================
43NETWORK FILESYSTEM DEFINITION
44=============================
45
46FS-Cache needs a description of the network filesystem. This is specified
47using a record of the following structure:
48
49 struct fscache_netfs {
50 uint32_t version;
51 const char *name;
52 struct fscache_cookie *primary_index;
53 ...
54 };
55
56This first two fields should be filled in before registration, and the third
57will be filled in by the registration function; any other fields should just be
58ignored and are for internal use only.
59
60The fields are:
61
62 (1) The name of the netfs (used as the key in the toplevel index).
63
64 (2) The version of the netfs (if the name matches but the version doesn't, the
65 entire in-cache hierarchy for this netfs will be scrapped and begun
66 afresh).
67
68 (3) The cookie representing the primary index will be allocated according to
69 another parameter passed into the registration function.
70
71For example, kAFS (linux/fs/afs/) uses the following definitions to describe
72itself:
73
74 struct fscache_netfs afs_cache_netfs = {
75 .version = 0,
76 .name = "afs",
77 };
78
79
80================
81INDEX DEFINITION
82================
83
84Indices are used for two purposes:
85
86 (1) To aid the finding of a file based on a series of keys (such as AFS's
87 "cell", "volume ID", "vnode ID").
88
89 (2) To make it easier to discard a subset of all the files cached based around
90 a particular key - for instance to mirror the removal of an AFS volume.
91
92However, since it's unlikely that any two netfs's are going to want to define
93their index hierarchies in quite the same way, FS-Cache tries to impose as few
94restraints as possible on how an index is structured and where it is placed in
95the tree. The netfs can even mix indices and data files at the same level, but
96it's not recommended.
97
98Each index entry consists of a key of indeterminate length plus some auxilliary
99data, also of indeterminate length.
100
101There are some limits on indices:
102
103 (1) Any index containing non-index objects should be restricted to a single
104 cache. Any such objects created within an index will be created in the
105 first cache only. The cache in which an index is created can be
106 controlled by cache tags (see below).
107
108 (2) The entry data must be atomically journallable, so it is limited to about
109 400 bytes at present. At least 400 bytes will be available.
110
111 (3) The depth of the index tree should be judged with care as the search
112 function is recursive. Too many layers will run the kernel out of stack.
113
114
115=================
116OBJECT DEFINITION
117=================
118
119To define an object, a structure of the following type should be filled out:
120
121 struct fscache_cookie_def
122 {
123 uint8_t name[16];
124 uint8_t type;
125
126 struct fscache_cache_tag *(*select_cache)(
127 const void *parent_netfs_data,
128 const void *cookie_netfs_data);
129
130 uint16_t (*get_key)(const void *cookie_netfs_data,
131 void *buffer,
132 uint16_t bufmax);
133
134 void (*get_attr)(const void *cookie_netfs_data,
135 uint64_t *size);
136
137 uint16_t (*get_aux)(const void *cookie_netfs_data,
138 void *buffer,
139 uint16_t bufmax);
140
141 enum fscache_checkaux (*check_aux)(void *cookie_netfs_data,
142 const void *data,
143 uint16_t datalen);
144
145 void (*get_context)(void *cookie_netfs_data, void *context);
146
147 void (*put_context)(void *cookie_netfs_data, void *context);
148
149 void (*mark_pages_cached)(void *cookie_netfs_data,
150 struct address_space *mapping,
151 struct pagevec *cached_pvec);
152
153 void (*now_uncached)(void *cookie_netfs_data);
154 };
155
156This has the following fields:
157
158 (1) The type of the object [mandatory].
159
160 This is one of the following values:
161
162 (*) FSCACHE_COOKIE_TYPE_INDEX
163
164 This defines an index, which is a special FS-Cache type.
165
166 (*) FSCACHE_COOKIE_TYPE_DATAFILE
167
168 This defines an ordinary data file.
169
170 (*) Any other value between 2 and 255
171
172 This defines an extraordinary object such as an XATTR.
173
174 (2) The name of the object type (NUL terminated unless all 16 chars are used)
175 [optional].
176
177 (3) A function to select the cache in which to store an index [optional].
178
179 This function is invoked when an index needs to be instantiated in a cache
180 during the instantiation of a non-index object. Only the immediate index
181 parent for the non-index object will be queried. Any indices above that
182 in the hierarchy may be stored in multiple caches. This function does not
183 need to be supplied for any non-index object or any index that will only
184 have index children.
185
186 If this function is not supplied or if it returns NULL then the first
187 cache in the parent's list will be chosed, or failing that, the first
188 cache in the master list.
189
190 (4) A function to retrieve an object's key from the netfs [mandatory].
191
192 This function will be called with the netfs data that was passed to the
193 cookie acquisition function and the maximum length of key data that it may
194 provide. It should write the required key data into the given buffer and
195 return the quantity it wrote.
196
197 (5) A function to retrieve attribute data from the netfs [optional].
198
199 This function will be called with the netfs data that was passed to the
200 cookie acquisition function. It should return the size of the file if
201 this is a data file. The size may be used to govern how much cache must
202 be reserved for this file in the cache.
203
204 If the function is absent, a file size of 0 is assumed.
205
206 (6) A function to retrieve auxilliary data from the netfs [optional].
207
208 This function will be called with the netfs data that was passed to the
209 cookie acquisition function and the maximum length of auxilliary data that
210 it may provide. It should write the auxilliary data into the given buffer
211 and return the quantity it wrote.
212
213 If this function is absent, the auxilliary data length will be set to 0.
214
215 The length of the auxilliary data buffer may be dependent on the key
216 length. A netfs mustn't rely on being able to provide more than 400 bytes
217 for both.
218
219 (7) A function to check the auxilliary data [optional].
220
221 This function will be called to check that a match found in the cache for
222 this object is valid. For instance with AFS it could check the auxilliary
223 data against the data version number returned by the server to determine
224 whether the index entry in a cache is still valid.
225
226 If this function is absent, it will be assumed that matching objects in a
227 cache are always valid.
228
229 If present, the function should return one of the following values:
230
231 (*) FSCACHE_CHECKAUX_OKAY - the entry is okay as is
232 (*) FSCACHE_CHECKAUX_NEEDS_UPDATE - the entry requires update
233 (*) FSCACHE_CHECKAUX_OBSOLETE - the entry should be deleted
234
235 This function can also be used to extract data from the auxilliary data in
236 the cache and copy it into the netfs's structures.
237
238 (8) A pair of functions to manage contexts for the completion callback
239 [optional].
240
241 The cache read/write functions are passed a context which is then passed
242 to the I/O completion callback function. To ensure this context remains
243 valid until after the I/O completion is called, two functions may be
244 provided: one to get an extra reference on the context, and one to drop a
245 reference to it.
246
247 If the context is not used or is a type of object that won't go out of
248 scope, then these functions are not required. These functions are not
249 required for indices as indices may not contain data. These functions may
250 be called in interrupt context and so may not sleep.
251
252 (9) A function to mark a page as retaining cache metadata [optional].
253
254 This is called by the cache to indicate that it is retaining in-memory
255 information for this page and that the netfs should uncache the page when
256 it has finished. This does not indicate whether there's data on the disk
257 or not. Note that several pages at once may be presented for marking.
258
259 The PG_fscache bit is set on the pages before this function would be
260 called, so the function need not be provided if this is sufficient.
261
262 This function is not required for indices as they're not permitted data.
263
264(10) A function to unmark all the pages retaining cache metadata [mandatory].
265
266 This is called by FS-Cache to indicate that a backing store is being
267 unbound from a cookie and that all the marks on the pages should be
268 cleared to prevent confusion. Note that the cache will have torn down all
269 its tracking information so that the pages don't need to be explicitly
270 uncached.
271
272 This function is not required for indices as they're not permitted data.
273
274
275===================================
276NETWORK FILESYSTEM (UN)REGISTRATION
277===================================
278
279The first step is to declare the network filesystem to the cache. This also
280involves specifying the layout of the primary index (for AFS, this would be the
281"cell" level).
282
283The registration function is:
284
285 int fscache_register_netfs(struct fscache_netfs *netfs);
286
287It just takes a pointer to the netfs definition. It returns 0 or an error as
288appropriate.
289
290For kAFS, registration is done as follows:
291
292 ret = fscache_register_netfs(&afs_cache_netfs);
293
294The last step is, of course, unregistration:
295
296 void fscache_unregister_netfs(struct fscache_netfs *netfs);
297
298
299================
300CACHE TAG LOOKUP
301================
302
303FS-Cache permits the use of more than one cache. To permit particular index
304subtrees to be bound to particular caches, the second step is to look up cache
305representation tags. This step is optional; it can be left entirely up to
306FS-Cache as to which cache should be used. The problem with doing that is that
307FS-Cache will always pick the first cache that was registered.
308
309To get the representation for a named tag:
310
311 struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name);
312
313This takes a text string as the name and returns a representation of a tag. It
314will never return an error. It may return a dummy tag, however, if it runs out
315of memory; this will inhibit caching with this tag.
316
317Any representation so obtained must be released by passing it to this function:
318
319 void fscache_release_cache_tag(struct fscache_cache_tag *tag);
320
321The tag will be retrieved by FS-Cache when it calls the object definition
322operation select_cache().
323
324
325==================
326INDEX REGISTRATION
327==================
328
329The third step is to inform FS-Cache about part of an index hierarchy that can
330be used to locate files. This is done by requesting a cookie for each index in
331the path to the file:
332
333 struct fscache_cookie *
334 fscache_acquire_cookie(struct fscache_cookie *parent,
335 const struct fscache_object_def *def,
336 void *netfs_data);
337
338This function creates an index entry in the index represented by parent,
339filling in the index entry by calling the operations pointed to by def.
340
341Note that this function never returns an error - all errors are handled
342internally. It may, however, return NULL to indicate no cookie. It is quite
343acceptable to pass this token back to this function as the parent to another
344acquisition (or even to the relinquish cookie, read page and write page
345functions - see below).
346
347Note also that no indices are actually created in a cache until a non-index
348object needs to be created somewhere down the hierarchy. Furthermore, an index
349may be created in several different caches independently at different times.
350This is all handled transparently, and the netfs doesn't see any of it.
351
352For example, with AFS, a cell would be added to the primary index. This index
353entry would have a dependent inode containing a volume location index for the
354volume mappings within this cell:
355
356 cell->cache =
357 fscache_acquire_cookie(afs_cache_netfs.primary_index,
358 &afs_cell_cache_index_def,
359 cell);
360
361Then when a volume location was accessed, it would be entered into the cell's
362index and an inode would be allocated that acts as a volume type and hash chain
363combination:
364
365 vlocation->cache =
366 fscache_acquire_cookie(cell->cache,
367 &afs_vlocation_cache_index_def,
368 vlocation);
369
370And then a particular flavour of volume (R/O for example) could be added to
371that index, creating another index for vnodes (AFS inode equivalents):
372
373 volume->cache =
374 fscache_acquire_cookie(vlocation->cache,
375 &afs_volume_cache_index_def,
376 volume);
377
378
379======================
380DATA FILE REGISTRATION
381======================
382
383The fourth step is to request a data file be created in the cache. This is
384identical to index cookie acquisition. The only difference is that the type in
385the object definition should be something other than index type.
386
387 vnode->cache =
388 fscache_acquire_cookie(volume->cache,
389 &afs_vnode_cache_object_def,
390 vnode);
391
392
393=================================
394MISCELLANEOUS OBJECT REGISTRATION
395=================================
396
397An optional step is to request an object of miscellaneous type be created in
398the cache. This is almost identical to index cookie acquisition. The only
399difference is that the type in the object definition should be something other
400than index type. Whilst the parent object could be an index, it's more likely
401it would be some other type of object such as a data file.
402
403 xattr->cache =
404 fscache_acquire_cookie(vnode->cache,
405 &afs_xattr_cache_object_def,
406 xattr);
407
408Miscellaneous objects might be used to store extended attributes or directory
409entries for example.
410
411
412==========================
413SETTING THE DATA FILE SIZE
414==========================
415
416The fifth step is to set the physical attributes of the file, such as its size.
417This doesn't automatically reserve any space in the cache, but permits the
418cache to adjust its metadata for data tracking appropriately:
419
420 int fscache_attr_changed(struct fscache_cookie *cookie);
421
422The cache will return -ENOBUFS if there is no backing cache or if there is no
423space to allocate any extra metadata required in the cache. The attributes
424will be accessed with the get_attr() cookie definition operation.
425
426Note that attempts to read or write data pages in the cache over this size may
427be rebuffed with -ENOBUFS.
428
429This operation schedules an attribute adjustment to happen asynchronously at
430some point in the future, and as such, it may happen after the function returns
431to the caller. The attribute adjustment excludes read and write operations.
432
433
434=====================
435PAGE READ/ALLOC/WRITE
436=====================
437
438And the sixth step is to store and retrieve pages in the cache. There are
439three functions that are used to do this.
440
441Note:
442
443 (1) A page should not be re-read or re-allocated without uncaching it first.
444
445 (2) A read or allocated page must be uncached when the netfs page is released
446 from the pagecache.
447
448 (3) A page should only be written to the cache if previous read or allocated.
449
450This permits the cache to maintain its page tracking in proper order.
451
452
453PAGE READ
454---------
455
456Firstly, the netfs should ask FS-Cache to examine the caches and read the
457contents cached for a particular page of a particular file if present, or else
458allocate space to store the contents if not:
459
460 typedef
461 void (*fscache_rw_complete_t)(struct page *page,
462 void *context,
463 int error);
464
465 int fscache_read_or_alloc_page(struct fscache_cookie *cookie,
466 struct page *page,
467 fscache_rw_complete_t end_io_func,
468 void *context,
469 gfp_t gfp);
470
471The cookie argument must specify a cookie for an object that isn't an index,
472the page specified will have the data loaded into it (and is also used to
473specify the page number), and the gfp argument is used to control how any
474memory allocations made are satisfied.
475
476If the cookie indicates the inode is not cached:
477
478 (1) The function will return -ENOBUFS.
479
480Else if there's a copy of the page resident in the cache:
481
482 (1) The mark_pages_cached() cookie operation will be called on that page.
483
484 (2) The function will submit a request to read the data from the cache's
485 backing device directly into the page specified.
486
487 (3) The function will return 0.
488
489 (4) When the read is complete, end_io_func() will be invoked with:
490
491 (*) The netfs data supplied when the cookie was created.
492
493 (*) The page descriptor.
494
495 (*) The context argument passed to the above function. This will be
496 maintained with the get_context/put_context functions mentioned above.
497
498 (*) An argument that's 0 on success or negative for an error code.
499
500 If an error occurs, it should be assumed that the page contains no usable
501 data.
502
503 end_io_func() will be called in process context if the read is results in
504 an error, but it might be called in interrupt context if the read is
505 successful.
506
507Otherwise, if there's not a copy available in cache, but the cache may be able
508to store the page:
509
510 (1) The mark_pages_cached() cookie operation will be called on that page.
511
512 (2) A block may be reserved in the cache and attached to the object at the
513 appropriate place.
514
515 (3) The function will return -ENODATA.
516
517This function may also return -ENOMEM or -EINTR, in which case it won't have
518read any data from the cache.
519
520
521PAGE ALLOCATE
522-------------
523
524Alternatively, if there's not expected to be any data in the cache for a page
525because the file has been extended, a block can simply be allocated instead:
526
527 int fscache_alloc_page(struct fscache_cookie *cookie,
528 struct page *page,
529 gfp_t gfp);
530
531This is similar to the fscache_read_or_alloc_page() function, except that it
532never reads from the cache. It will return 0 if a block has been allocated,
533rather than -ENODATA as the other would. One or the other must be performed
534before writing to the cache.
535
536The mark_pages_cached() cookie operation will be called on the page if
537successful.
538
539
540PAGE WRITE
541----------
542
543Secondly, if the netfs changes the contents of the page (either due to an
544initial download or if a user performs a write), then the page should be
545written back to the cache:
546
547 int fscache_write_page(struct fscache_cookie *cookie,
548 struct page *page,
549 gfp_t gfp);
550
551The cookie argument must specify a data file cookie, the page specified should
552contain the data to be written (and is also used to specify the page number),
553and the gfp argument is used to control how any memory allocations made are
554satisfied.
555
556The page must have first been read or allocated successfully and must not have
557been uncached before writing is performed.
558
559If the cookie indicates the inode is not cached then:
560
561 (1) The function will return -ENOBUFS.
562
563Else if space can be allocated in the cache to hold this page:
564
565 (1) PG_fscache_write will be set on the page.
566
567 (2) The function will submit a request to write the data to cache's backing
568 device directly from the page specified.
569
570 (3) The function will return 0.
571
572 (4) When the write is complete PG_fscache_write is cleared on the page and
573 anyone waiting for that bit will be woken up.
574
575Else if there's no space available in the cache, -ENOBUFS will be returned. It
576is also possible for the PG_fscache_write bit to be cleared when no write took
577place if unforeseen circumstances arose (such as a disk error).
578
579Writing takes place asynchronously.
580
581
582MULTIPLE PAGE READ
583------------------
584
585A facility is provided to read several pages at once, as requested by the
586readpages() address space operation:
587
588 int fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
589 struct address_space *mapping,
590 struct list_head *pages,
591 int *nr_pages,
592 fscache_rw_complete_t end_io_func,
593 void *context,
594 gfp_t gfp);
595
596This works in a similar way to fscache_read_or_alloc_page(), except:
597
598 (1) Any page it can retrieve data for is removed from pages and nr_pages and
599 dispatched for reading to the disk. Reads of adjacent pages on disk may
600 be merged for greater efficiency.
601
602 (2) The mark_pages_cached() cookie operation will be called on several pages
603 at once if they're being read or allocated.
604
605 (3) If there was an general error, then that error will be returned.
606
607 Else if some pages couldn't be allocated or read, then -ENOBUFS will be
608 returned.
609
610 Else if some pages couldn't be read but were allocated, then -ENODATA will
611 be returned.
612
613 Otherwise, if all pages had reads dispatched, then 0 will be returned, the
614 list will be empty and *nr_pages will be 0.
615
616 (4) end_io_func will be called once for each page being read as the reads
617 complete. It will be called in process context if error != 0, but it may
618 be called in interrupt context if there is no error.
619
620Note that a return of -ENODATA, -ENOBUFS or any other error does not preclude
621some of the pages being read and some being allocated. Those pages will have
622been marked appropriately and will need uncaching.
623
624
625==============
626PAGE UNCACHING
627==============
628
629To uncache a page, this function should be called:
630
631 void fscache_uncache_page(struct fscache_cookie *cookie,
632 struct page *page);
633
634This function permits the cache to release any in-memory representation it
635might be holding for this netfs page. This function must be called once for
636each page on which the read or write page functions above have been called to
637make sure the cache's in-memory tracking information gets torn down.
638
639Note that pages can't be explicitly deleted from the a data file. The whole
640data file must be retired (see the relinquish cookie function below).
641
642Furthermore, note that this does not cancel the asynchronous read or write
643operation started by the read/alloc and write functions, so the page
644invalidation and release functions must use:
645
646 bool fscache_check_page_write(struct fscache_cookie *cookie,
647 struct page *page);
648
649to see if a page is being written to the cache, and:
650
651 void fscache_wait_on_page_write(struct fscache_cookie *cookie,
652 struct page *page);
653
654to wait for it to finish if it is.
655
656
657==========================
658INDEX AND DATA FILE UPDATE
659==========================
660
661To request an update of the index data for an index or other object, the
662following function should be called:
663
664 void fscache_update_cookie(struct fscache_cookie *cookie);
665
666This function will refer back to the netfs_data pointer stored in the cookie by
667the acquisition function to obtain the data to write into each revised index
668entry. The update method in the parent index definition will be called to
669transfer the data.
670
671Note that partial updates may happen automatically at other times, such as when
672data blocks are added to a data file object.
673
674
675===============================
676MISCELLANEOUS COOKIE OPERATIONS
677===============================
678
679There are a number of operations that can be used to control cookies:
680
681 (*) Cookie pinning:
682
683 int fscache_pin_cookie(struct fscache_cookie *cookie);
684 void fscache_unpin_cookie(struct fscache_cookie *cookie);
685
686 These operations permit data cookies to be pinned into the cache and to
687 have the pinning removed. They are not permitted on index cookies.
688
689 The pinning function will return 0 if successful, -ENOBUFS in the cookie
690 isn't backed by a cache, -EOPNOTSUPP if the cache doesn't support pinning,
691 -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or
692 -EIO if there's any other problem.
693
694 (*) Data space reservation:
695
696 int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size);
697
698 This permits a netfs to request cache space be reserved to store up to the
699 given amount of a file. It is permitted to ask for more than the current
700 size of the file to allow for future file expansion.
701
702 If size is given as zero then the reservation will be cancelled.
703
704 The function will return 0 if successful, -ENOBUFS in the cookie isn't
705 backed by a cache, -EOPNOTSUPP if the cache doesn't support reservations,
706 -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or
707 -EIO if there's any other problem.
708
709 Note that this doesn't pin an object in a cache; it can still be culled to
710 make space if it's not in use.
711
712
713=====================
714COOKIE UNREGISTRATION
715=====================
716
717To get rid of a cookie, this function should be called.
718
719 void fscache_relinquish_cookie(struct fscache_cookie *cookie,
720 int retire);
721
722If retire is non-zero, then the object will be marked for recycling, and all
723copies of it will be removed from all active caches in which it is present.
724Not only that but all child objects will also be retired.
725
726If retire is zero, then the object may be available again when next the
727acquisition function is called. Retirement here will overrule the pinning on a
728cookie.
729
730One very important note - relinquish must NOT be called for a cookie unless all
731the cookies for "child" indices, objects and pages have been relinquished
732first.
733
734
735================================
736INDEX AND DATA FILE INVALIDATION
737================================
738
739There is no direct way to invalidate an index subtree or a data file. To do
740this, the caller should relinquish and retire the cookie they have, and then
741acquire a new one.
742
743
744===========================
745FS-CACHE SPECIFIC PAGE FLAG
746===========================
747
748FS-Cache makes use of a page flag, PG_private_2, for its own purpose. This is
749given the alternative name PG_fscache.
750
751PG_fscache is used to indicate that the page is known by the cache, and that
752the cache must be informed if the page is going to go away. It's an indication
753to the netfs that the cache has an interest in this page, where an interest may
754be a pointer to it, resources allocated or reserved for it, or I/O in progress
755upon it.
756
757The netfs can use this information in methods such as releasepage() to
758determine whether it needs to uncache a page or update it.
759
760Furthermore, if this bit is set, releasepage() and invalidatepage() operations
761will be called on a page to get rid of it, even if PG_private is not set. This
762allows caching to attempted on a page before read_cache_pages() to be called
763after fscache_read_or_alloc_pages() as the former will try and release pages it
764was given under certain circumstances.
765
766This bit does not overlap with such as PG_private. This means that FS-Cache
767can be used with a filesystem that uses the block buffering code.
768
769There are a number of operations defined on this flag:
770
771 int PageFsCache(struct page *page);
772 void SetPageFsCache(struct page *page)
773 void ClearPageFsCache(struct page *page)
774 int TestSetPageFsCache(struct page *page)
775 int TestClearPageFsCache(struct page *page)
776
777These functions are bit test, bit set, bit clear, bit test and set and bit
778test and clear operations on PG_fscache.
diff --git a/Documentation/filesystems/caching/object.txt b/Documentation/filesystems/caching/object.txt
new file mode 100644
index 000000000000..e8b0a35d8fe5
--- /dev/null
+++ b/Documentation/filesystems/caching/object.txt
@@ -0,0 +1,313 @@
1 ====================================================
2 IN-KERNEL CACHE OBJECT REPRESENTATION AND MANAGEMENT
3 ====================================================
4
5By: David Howells <dhowells@redhat.com>
6
7Contents:
8
9 (*) Representation
10
11 (*) Object management state machine.
12
13 - Provision of cpu time.
14 - Locking simplification.
15
16 (*) The set of states.
17
18 (*) The set of events.
19
20
21==============
22REPRESENTATION
23==============
24
25FS-Cache maintains an in-kernel representation of each object that a netfs is
26currently interested in. Such objects are represented by the fscache_cookie
27struct and are referred to as cookies.
28
29FS-Cache also maintains a separate in-kernel representation of the objects that
30a cache backend is currently actively caching. Such objects are represented by
31the fscache_object struct. The cache backends allocate these upon request, and
32are expected to embed them in their own representations. These are referred to
33as objects.
34
35There is a 1:N relationship between cookies and objects. A cookie may be
36represented by multiple objects - an index may exist in more than one cache -
37or even by no objects (it may not be cached).
38
39Furthermore, both cookies and objects are hierarchical. The two hierarchies
40correspond, but the cookies tree is a superset of the union of the object trees
41of multiple caches:
42
43 NETFS INDEX TREE : CACHE 1 : CACHE 2
44 : :
45 : +-----------+ :
46 +----------->| IObject | :
47 +-----------+ | : +-----------+ :
48 | ICookie |-------+ : | :
49 +-----------+ | : | : +-----------+
50 | +------------------------------>| IObject |
51 | : | : +-----------+
52 | : V : |
53 | : +-----------+ : |
54 V +----------->| IObject | : |
55 +-----------+ | : +-----------+ : |
56 | ICookie |-------+ : | : V
57 +-----------+ | : | : +-----------+
58 | +------------------------------>| IObject |
59 +-----+-----+ : | : +-----------+
60 | | : | : |
61 V | : V : |
62 +-----------+ | : +-----------+ : |
63 | ICookie |------------------------->| IObject | : |
64 +-----------+ | : +-----------+ : |
65 | V : | : V
66 | +-----------+ : | : +-----------+
67 | | ICookie |-------------------------------->| IObject |
68 | +-----------+ : | : +-----------+
69 V | : V : |
70 +-----------+ | : +-----------+ : |
71 | DCookie |------------------------->| DObject | : |
72 +-----------+ | : +-----------+ : |
73 | : : |
74 +-------+-------+ : : |
75 | | : : |
76 V V : : V
77 +-----------+ +-----------+ : : +-----------+
78 | DCookie | | DCookie |------------------------>| DObject |
79 +-----------+ +-----------+ : : +-----------+
80 : :
81
82In the above illustration, ICookie and IObject represent indices and DCookie
83and DObject represent data storage objects. Indices may have representation in
84multiple caches, but currently, non-index objects may not. Objects of any type
85may also be entirely unrepresented.
86
87As far as the netfs API goes, the netfs is only actually permitted to see
88pointers to the cookies. The cookies themselves and any objects attached to
89those cookies are hidden from it.
90
91
92===============================
93OBJECT MANAGEMENT STATE MACHINE
94===============================
95
96Within FS-Cache, each active object is managed by its own individual state
97machine. The state for an object is kept in the fscache_object struct, in
98object->state. A cookie may point to a set of objects that are in different
99states.
100
101Each state has an action associated with it that is invoked when the machine
102wakes up in that state. There are four logical sets of states:
103
104 (1) Preparation: states that wait for the parent objects to become ready. The
105 representations are hierarchical, and it is expected that an object must
106 be created or accessed with respect to its parent object.
107
108 (2) Initialisation: states that perform lookups in the cache and validate
109 what's found and that create on disk any missing metadata.
110
111 (3) Normal running: states that allow netfs operations on objects to proceed
112 and that update the state of objects.
113
114 (4) Termination: states that detach objects from their netfs cookies, that
115 delete objects from disk, that handle disk and system errors and that free
116 up in-memory resources.
117
118
119In most cases, transitioning between states is in response to signalled events.
120When a state has finished processing, it will usually set the mask of events in
121which it is interested (object->event_mask) and relinquish the worker thread.
122Then when an event is raised (by calling fscache_raise_event()), if the event
123is not masked, the object will be queued for processing (by calling
124fscache_enqueue_object()).
125
126
127PROVISION OF CPU TIME
128---------------------
129
130The work to be done by the various states is given CPU time by the threads of
131the slow work facility (see Documentation/slow-work.txt). This is used in
132preference to the workqueue facility because:
133
134 (1) Threads may be completely occupied for very long periods of time by a
135 particular work item. These state actions may be doing sequences of
136 synchronous, journalled disk accesses (lookup, mkdir, create, setxattr,
137 getxattr, truncate, unlink, rmdir, rename).
138
139 (2) Threads may do little actual work, but may rather spend a lot of time
140 sleeping on I/O. This means that single-threaded and 1-per-CPU-threaded
141 workqueues don't necessarily have the right numbers of threads.
142
143
144LOCKING SIMPLIFICATION
145----------------------
146
147Because only one worker thread may be operating on any particular object's
148state machine at once, this simplifies the locking, particularly with respect
149to disconnecting the netfs's representation of a cache object (fscache_cookie)
150from the cache backend's representation (fscache_object) - which may be
151requested from either end.
152
153
154=================
155THE SET OF STATES
156=================
157
158The object state machine has a set of states that it can be in. There are
159preparation states in which the object sets itself up and waits for its parent
160object to transit to a state that allows access to its children:
161
162 (1) State FSCACHE_OBJECT_INIT.
163
164 Initialise the object and wait for the parent object to become active. In
165 the cache, it is expected that it will not be possible to look an object
166 up from the parent object, until that parent object itself has been looked
167 up.
168
169There are initialisation states in which the object sets itself up and accesses
170disk for the object metadata:
171
172 (2) State FSCACHE_OBJECT_LOOKING_UP.
173
174 Look up the object on disk, using the parent as a starting point.
175 FS-Cache expects the cache backend to probe the cache to see whether this
176 object is represented there, and if it is, to see if it's valid (coherency
177 management).
178
179 The cache should call fscache_object_lookup_negative() to indicate lookup
180 failure for whatever reason, and should call fscache_obtained_object() to
181 indicate success.
182
183 At the completion of lookup, FS-Cache will let the netfs go ahead with
184 read operations, no matter whether the file is yet cached. If not yet
185 cached, read operations will be immediately rejected with ENODATA until
186 the first known page is uncached - as to that point there can be no data
187 to be read out of the cache for that file that isn't currently also held
188 in the pagecache.
189
190 (3) State FSCACHE_OBJECT_CREATING.
191
192 Create an object on disk, using the parent as a starting point. This
193 happens if the lookup failed to find the object, or if the object's
194 coherency data indicated what's on disk is out of date. In this state,
195 FS-Cache expects the cache to create
196
197 The cache should call fscache_obtained_object() if creation completes
198 successfully, fscache_object_lookup_negative() otherwise.
199
200 At the completion of creation, FS-Cache will start processing write
201 operations the netfs has queued for an object. If creation failed, the
202 write ops will be transparently discarded, and nothing recorded in the
203 cache.
204
205There are some normal running states in which the object spends its time
206servicing netfs requests:
207
208 (4) State FSCACHE_OBJECT_AVAILABLE.
209
210 A transient state in which pending operations are started, child objects
211 are permitted to advance from FSCACHE_OBJECT_INIT state, and temporary
212 lookup data is freed.
213
214 (5) State FSCACHE_OBJECT_ACTIVE.
215
216 The normal running state. In this state, requests the netfs makes will be
217 passed on to the cache.
218
219 (6) State FSCACHE_OBJECT_UPDATING.
220
221 The state machine comes here to update the object in the cache from the
222 netfs's records. This involves updating the auxiliary data that is used
223 to maintain coherency.
224
225And there are terminal states in which an object cleans itself up, deallocates
226memory and potentially deletes stuff from disk:
227
228 (7) State FSCACHE_OBJECT_LC_DYING.
229
230 The object comes here if it is dying because of a lookup or creation
231 error. This would be due to a disk error or system error of some sort.
232 Temporary data is cleaned up, and the parent is released.
233
234 (8) State FSCACHE_OBJECT_DYING.
235
236 The object comes here if it is dying due to an error, because its parent
237 cookie has been relinquished by the netfs or because the cache is being
238 withdrawn.
239
240 Any child objects waiting on this one are given CPU time so that they too
241 can destroy themselves. This object waits for all its children to go away
242 before advancing to the next state.
243
244 (9) State FSCACHE_OBJECT_ABORT_INIT.
245
246 The object comes to this state if it was waiting on its parent in
247 FSCACHE_OBJECT_INIT, but its parent died. The object will destroy itself
248 so that the parent may proceed from the FSCACHE_OBJECT_DYING state.
249
250(10) State FSCACHE_OBJECT_RELEASING.
251(11) State FSCACHE_OBJECT_RECYCLING.
252
253 The object comes to one of these two states when dying once it is rid of
254 all its children, if it is dying because the netfs relinquished its
255 cookie. In the first state, the cached data is expected to persist, and
256 in the second it will be deleted.
257
258(12) State FSCACHE_OBJECT_WITHDRAWING.
259
260 The object transits to this state if the cache decides it wants to
261 withdraw the object from service, perhaps to make space, but also due to
262 error or just because the whole cache is being withdrawn.
263
264(13) State FSCACHE_OBJECT_DEAD.
265
266 The object transits to this state when the in-memory object record is
267 ready to be deleted. The object processor shouldn't ever see an object in
268 this state.
269
270
271THE SET OF EVENTS
272-----------------
273
274There are a number of events that can be raised to an object state machine:
275
276 (*) FSCACHE_OBJECT_EV_UPDATE
277
278 The netfs requested that an object be updated. The state machine will ask
279 the cache backend to update the object, and the cache backend will ask the
280 netfs for details of the change through its cookie definition ops.
281
282 (*) FSCACHE_OBJECT_EV_CLEARED
283
284 This is signalled in two circumstances:
285
286 (a) when an object's last child object is dropped and
287
288 (b) when the last operation outstanding on an object is completed.
289
290 This is used to proceed from the dying state.
291
292 (*) FSCACHE_OBJECT_EV_ERROR
293
294 This is signalled when an I/O error occurs during the processing of some
295 object.
296
297 (*) FSCACHE_OBJECT_EV_RELEASE
298 (*) FSCACHE_OBJECT_EV_RETIRE
299
300 These are signalled when the netfs relinquishes a cookie it was using.
301 The event selected depends on whether the netfs asks for the backing
302 object to be retired (deleted) or retained.
303
304 (*) FSCACHE_OBJECT_EV_WITHDRAW
305
306 This is signalled when the cache backend wants to withdraw an object.
307 This means that the object will have to be detached from the netfs's
308 cookie.
309
310Because the withdrawing releasing/retiring events are all handled by the object
311state machine, it doesn't matter if there's a collision with both ends trying
312to sever the connection at the same time. The state machine can just pick
313which one it wants to honour, and that effects the other.
diff --git a/Documentation/filesystems/caching/operations.txt b/Documentation/filesystems/caching/operations.txt
new file mode 100644
index 000000000000..b6b070c57cbf
--- /dev/null
+++ b/Documentation/filesystems/caching/operations.txt
@@ -0,0 +1,213 @@
1 ================================
2 ASYNCHRONOUS OPERATIONS HANDLING
3 ================================
4
5By: David Howells <dhowells@redhat.com>
6
7Contents:
8
9 (*) Overview.
10
11 (*) Operation record initialisation.
12
13 (*) Parameters.
14
15 (*) Procedure.
16
17 (*) Asynchronous callback.
18
19
20========
21OVERVIEW
22========
23
24FS-Cache has an asynchronous operations handling facility that it uses for its
25data storage and retrieval routines. Its operations are represented by
26fscache_operation structs, though these are usually embedded into some other
27structure.
28
29This facility is available to and expected to be be used by the cache backends,
30and FS-Cache will create operations and pass them off to the appropriate cache
31backend for completion.
32
33To make use of this facility, <linux/fscache-cache.h> should be #included.
34
35
36===============================
37OPERATION RECORD INITIALISATION
38===============================
39
40An operation is recorded in an fscache_operation struct:
41
42 struct fscache_operation {
43 union {
44 struct work_struct fast_work;
45 struct slow_work slow_work;
46 };
47 unsigned long flags;
48 fscache_operation_processor_t processor;
49 ...
50 };
51
52Someone wanting to issue an operation should allocate something with this
53struct embedded in it. They should initialise it by calling:
54
55 void fscache_operation_init(struct fscache_operation *op,
56 fscache_operation_release_t release);
57
58with the operation to be initialised and the release function to use.
59
60The op->flags parameter should be set to indicate the CPU time provision and
61the exclusivity (see the Parameters section).
62
63The op->fast_work, op->slow_work and op->processor flags should be set as
64appropriate for the CPU time provision (see the Parameters section).
65
66FSCACHE_OP_WAITING may be set in op->flags prior to each submission of the
67operation and waited for afterwards.
68
69
70==========
71PARAMETERS
72==========
73
74There are a number of parameters that can be set in the operation record's flag
75parameter. There are three options for the provision of CPU time in these
76operations:
77
78 (1) The operation may be done synchronously (FSCACHE_OP_MYTHREAD). A thread
79 may decide it wants to handle an operation itself without deferring it to
80 another thread.
81
82 This is, for example, used in read operations for calling readpages() on
83 the backing filesystem in CacheFiles. Although readpages() does an
84 asynchronous data fetch, the determination of whether pages exist is done
85 synchronously - and the netfs does not proceed until this has been
86 determined.
87
88 If this option is to be used, FSCACHE_OP_WAITING must be set in op->flags
89 before submitting the operation, and the operating thread must wait for it
90 to be cleared before proceeding:
91
92 wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
93 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
94
95
96 (2) The operation may be fast asynchronous (FSCACHE_OP_FAST), in which case it
97 will be given to keventd to process. Such an operation is not permitted
98 to sleep on I/O.
99
100 This is, for example, used by CacheFiles to copy data from a backing fs
101 page to a netfs page after the backing fs has read the page in.
102
103 If this option is used, op->fast_work and op->processor must be
104 initialised before submitting the operation:
105
106 INIT_WORK(&op->fast_work, do_some_work);
107
108
109 (3) The operation may be slow asynchronous (FSCACHE_OP_SLOW), in which case it
110 will be given to the slow work facility to process. Such an operation is
111 permitted to sleep on I/O.
112
113 This is, for example, used by FS-Cache to handle background writes of
114 pages that have just been fetched from a remote server.
115
116 If this option is used, op->slow_work and op->processor must be
117 initialised before submitting the operation:
118
119 fscache_operation_init_slow(op, processor)
120
121
122Furthermore, operations may be one of two types:
123
124 (1) Exclusive (FSCACHE_OP_EXCLUSIVE). Operations of this type may not run in
125 conjunction with any other operation on the object being operated upon.
126
127 An example of this is the attribute change operation, in which the file
128 being written to may need truncation.
129
130 (2) Shareable. Operations of this type may be running simultaneously. It's
131 up to the operation implementation to prevent interference between other
132 operations running at the same time.
133
134
135=========
136PROCEDURE
137=========
138
139Operations are used through the following procedure:
140
141 (1) The submitting thread must allocate the operation and initialise it
142 itself. Normally this would be part of a more specific structure with the
143 generic op embedded within.
144
145 (2) The submitting thread must then submit the operation for processing using
146 one of the following two functions:
147
148 int fscache_submit_op(struct fscache_object *object,
149 struct fscache_operation *op);
150
151 int fscache_submit_exclusive_op(struct fscache_object *object,
152 struct fscache_operation *op);
153
154 The first function should be used to submit non-exclusive ops and the
155 second to submit exclusive ones. The caller must still set the
156 FSCACHE_OP_EXCLUSIVE flag.
157
158 If successful, both functions will assign the operation to the specified
159 object and return 0. -ENOBUFS will be returned if the object specified is
160 permanently unavailable.
161
162 The operation manager will defer operations on an object that is still
163 undergoing lookup or creation. The operation will also be deferred if an
164 operation of conflicting exclusivity is in progress on the object.
165
166 If the operation is asynchronous, the manager will retain a reference to
167 it, so the caller should put their reference to it by passing it to:
168
169 void fscache_put_operation(struct fscache_operation *op);
170
171 (3) If the submitting thread wants to do the work itself, and has marked the
172 operation with FSCACHE_OP_MYTHREAD, then it should monitor
173 FSCACHE_OP_WAITING as described above and check the state of the object if
174 necessary (the object might have died whilst the thread was waiting).
175
176 When it has finished doing its processing, it should call
177 fscache_put_operation() on it.
178
179 (4) The operation holds an effective lock upon the object, preventing other
180 exclusive ops conflicting until it is released. The operation can be
181 enqueued for further immediate asynchronous processing by adjusting the
182 CPU time provisioning option if necessary, eg:
183
184 op->flags &= ~FSCACHE_OP_TYPE;
185 op->flags |= ~FSCACHE_OP_FAST;
186
187 and calling:
188
189 void fscache_enqueue_operation(struct fscache_operation *op)
190
191 This can be used to allow other things to have use of the worker thread
192 pools.
193
194
195=====================
196ASYNCHRONOUS CALLBACK
197=====================
198
199When used in asynchronous mode, the worker thread pool will invoke the
200processor method with a pointer to the operation. This should then get at the
201container struct by using container_of():
202
203 static void fscache_write_op(struct fscache_operation *_op)
204 {
205 struct fscache_storage *op =
206 container_of(_op, struct fscache_storage, op);
207 ...
208 }
209
210The caller holds a reference on the operation, and will invoke
211fscache_put_operation() when the processor function returns. The processor
212function is at liberty to call fscache_enqueue_operation() or to take extra
213references.
diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt
new file mode 100644
index 000000000000..0ced74c2f73c
--- /dev/null
+++ b/Documentation/filesystems/exofs.txt
@@ -0,0 +1,176 @@
1===============================================================================
2WHAT IS EXOFS?
3===============================================================================
4
5exofs is a file system that uses an OSD and exports the API of a normal Linux
6file system. Users access exofs like any other local file system, and exofs
7will in turn issue commands to the local OSD initiator.
8
9OSD is a new T10 command set that views storage devices not as a large/flat
10array of sectors but as a container of objects, each having a length, quota,
11time attributes and more. Each object is addressed by a 64bit ID, and is
12contained in a 64bit ID partition. Each object has associated attributes
13attached to it, which are integral part of the object and provide metadata about
14the object. The standard defines some common obligatory attributes, but user
15attributes can be added as needed.
16
17===============================================================================
18ENVIRONMENT
19===============================================================================
20
21To use this file system, you need to have an object store to run it on. You
22may download a target from:
23http://open-osd.org
24
25See Documentation/scsi/osd.txt for how to setup a working osd environment.
26
27===============================================================================
28USAGE
29===============================================================================
30
311. Download and compile exofs and open-osd initiator:
32 You need an external Kernel source tree or kernel headers from your
33 distribution. (anything based on 2.6.26 or later).
34
35 a. download open-osd including exofs source using:
36 [parent-directory]$ git clone git://git.open-osd.org/open-osd.git
37
38 b. Build the library module like this:
39 [parent-directory]$ make -C KSRC=$(KER_DIR) open-osd
40
41 This will build both the open-osd initiator as well as the exofs kernel
42 module. Use whatever parameters you compiled your Kernel with and
43 $(KER_DIR) above pointing to the Kernel you compile against. See the file
44 open-osd/top-level-Makefile for an example.
45
462. Get the OSD initiator and target set up properly, and login to the target.
47 See Documentation/scsi/osd.txt for farther instructions. Also see ./do-osd
48 for example script that does all these steps.
49
503. Insmod the exofs.ko module:
51 [exofs]$ insmod exofs.ko
52
534. Make sure the directory where you want to mount exists. If not, create it.
54 (For example, mkdir /mnt/exofs)
55
565. At first run you will need to invoke the mkfs.exofs application
57
58 As an example, this will create the file system on:
59 /dev/osd0 partition ID 65536
60
61 mkfs.exofs --pid=65536 --format /dev/osd0
62
63 The --format is optional if not specified no OSD_FORMAT will be
64 preformed and a clean file system will be created in the specified pid,
65 in the available space of the target. (Use --format=size_in_meg to limit
66 the total LUN space available)
67
68 If pid already exist it will be deleted and a new one will be created in it's
69 place. Be careful.
70
71 An exofs lives inside a single OSD partition. You can create multiple exofs
72 filesystems on the same device using multiple pids.
73
74 (run mkfs.exofs without any parameters for usage help message)
75
766. Mount the file system.
77
78 For example, to mount /dev/osd0, partition ID 0x10000 on /mnt/exofs:
79
80 mount -t exofs -o pid=65536 /dev/osd0 /mnt/exofs/
81
827. For reference (See do-exofs example script):
83 do-exofs start - an example of how to perform the above steps.
84 do-exofs stop - an example of how to unmount the file system.
85 do-exofs format - an example of how to format and mkfs a new exofs.
86
878. Extra compilation flags (uncomment in fs/exofs/Kbuild):
88 CONFIG_EXOFS_DEBUG - for debug messages and extra checks.
89
90===============================================================================
91exofs mount options
92===============================================================================
93Similar to any mount command:
94 mount -t exofs -o exofs_options /dev/osdX mount_exofs_directory
95
96Where:
97 -t exofs: specifies the exofs file system
98
99 /dev/osdX: X is a decimal number. /dev/osdX was created after a successful
100 login into an OSD target.
101
102 mount_exofs_directory: The directory to mount the file system on
103
104 exofs specific options: Options are separated by commas (,)
105 pid=<integer> - The partition number to mount/create as
106 container of the filesystem.
107 This option is mandatory
108 to=<integer> - Timeout in ticks for a single command
109 default is (60 * HZ) [for debugging only]
110
111===============================================================================
112DESIGN
113===============================================================================
114
115* The file system control block (AKA on-disk superblock) resides in an object
116 with a special ID (defined in common.h).
117 Information included in the file system control block is used to fill the
118 in-memory superblock structure at mount time. This object is created before
119 the file system is used by mkexofs.c It contains information such as:
120 - The file system's magic number
121 - The next inode number to be allocated
122
123* Each file resides in its own object and contains the data (and it will be
124 possible to extend the file over multiple objects, though this has not been
125 implemented yet).
126
127* A directory is treated as a file, and essentially contains a list of <file
128 name, inode #> pairs for files that are found in that directory. The object
129 IDs correspond to the files' inode numbers and will be allocated according to
130 a bitmap (stored in a separate object). Now they are allocated using a
131 counter.
132
133* Each file's control block (AKA on-disk inode) is stored in its object's
134 attributes. This applies to both regular files and other types (directories,
135 device files, symlinks, etc.).
136
137* Credentials are generated per object (inode and superblock) when they is
138 created in memory (read off disk or created). The credential works for all
139 operations and is used as long as the object remains in memory.
140
141* Async OSD operations are used whenever possible, but the target may execute
142 them out of order. The operations that concern us are create, delete,
143 readpage, writepage, update_inode, and truncate. The following pairs of
144 operations should execute in the order written, and we need to prevent them
145 from executing in reverse order:
146 - The following are handled with the OBJ_CREATED and OBJ_2BCREATED
147 flags. OBJ_CREATED is set when we know the object exists on the OSD -
148 in create's callback function, and when we successfully do a read_inode.
149 OBJ_2BCREATED is set in the beginning of the create function, so we
150 know that we should wait.
151 - create/delete: delete should wait until the object is created
152 on the OSD.
153 - create/readpage: readpage should be able to return a page
154 full of zeroes in this case. If there was a write already
155 en-route (i.e. create, writepage, readpage) then the page
156 would be locked, and so it would really be the same as
157 create/writepage.
158 - create/writepage: if writepage is called for a sync write, it
159 should wait until the object is created on the OSD.
160 Otherwise, it should just return.
161 - create/truncate: truncate should wait until the object is
162 created on the OSD.
163 - create/update_inode: update_inode should wait until the
164 object is created on the OSD.
165 - Handled by VFS locks:
166 - readpage/delete: shouldn't happen because of page lock.
167 - writepage/delete: shouldn't happen because of page lock.
168 - readpage/writepage: shouldn't happen because of page lock.
169
170===============================================================================
171LICENSE/COPYRIGHT
172===============================================================================
173The exofs file system is based on ext2 v0.5b (distributed with the Linux kernel
174version 2.6.10). All files include the original copyrights, and the license
175is GPL version 2 (only version 2, as is true for the Linux kernel). The
176Linux kernel can be downloaded from www.kernel.org.
diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt
index 4333e836c495..e055acb6b2d4 100644
--- a/Documentation/filesystems/ext2.txt
+++ b/Documentation/filesystems/ext2.txt
@@ -373,10 +373,11 @@ Filesystem Resizing http://ext2resize.sourceforge.net/
373Compression (*) http://e2compr.sourceforge.net/ 373Compression (*) http://e2compr.sourceforge.net/
374 374
375Implementations for: 375Implementations for:
376Windows 95/98/NT/2000 http://uranus.it.swin.edu.au/~jn/linux/Explore2fs.htm 376Windows 95/98/NT/2000 http://www.chrysocome.net/explore2fs
377Windows 95 (*) http://www.yipton.demon.co.uk/content.html#FSDEXT2 377Windows 95 (*) http://www.yipton.net/content.html#FSDEXT2
378DOS client (*) ftp://metalab.unc.edu/pub/Linux/system/filesystems/ext2/ 378DOS client (*) ftp://metalab.unc.edu/pub/Linux/system/filesystems/ext2/
379OS/2 http://perso.wanadoo.fr/matthieu.willm/ext2-os2/ 379OS/2 (+) ftp://metalab.unc.edu/pub/Linux/system/filesystems/ext2/
380RISC OS client ftp://ftp.barnet.ac.uk/pub/acorn/armlinux/iscafs/ 380RISC OS client http://www.esw-heim.tu-clausthal.de/~marco/smorbrod/IscaFS/
381 381
382(*) no longer actively developed/supported (as of Apr 2001) 382(*) no longer actively developed/supported (as of Apr 2001)
383(+) no longer actively developed/supported (as of Mar 2009)
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 9dd2a3bb2acc..570f9bd9be2b 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -14,6 +14,11 @@ Options
14When mounting an ext3 filesystem, the following option are accepted: 14When mounting an ext3 filesystem, the following option are accepted:
15(*) == default 15(*) == default
16 16
17ro Mount filesystem read only. Note that ext3 will replay
18 the journal (and thus write to the partition) even when
19 mounted "read only". Mount options "ro,noload" can be
20 used to prevent writes to the filesystem.
21
17journal=update Update the ext3 file system's journal to the current 22journal=update Update the ext3 file system's journal to the current
18 format. 23 format.
19 24
@@ -27,7 +32,9 @@ journal_dev=devnum When the external journal device's major/minor numbers
27 identified through its new major/minor numbers encoded 32 identified through its new major/minor numbers encoded
28 in devnum. 33 in devnum.
29 34
30noload Don't load the journal on mounting. 35noload Don't load the journal on mounting. Note that this forces
36 mount of inconsistent filesystem, which can lead to
37 various problems.
31 38
32data=journal All data are committed into the journal prior to being 39data=journal All data are committed into the journal prior to being
33 written into the main file system. 40 written into the main file system.
@@ -92,9 +99,12 @@ nocheck
92 99
93debug Extra debugging information is sent to syslog. 100debug Extra debugging information is sent to syslog.
94 101
95errors=remount-ro(*) Remount the filesystem read-only on an error. 102errors=remount-ro Remount the filesystem read-only on an error.
96errors=continue Keep going on a filesystem error. 103errors=continue Keep going on a filesystem error.
97errors=panic Panic and halt the machine if an error occurs. 104errors=panic Panic and halt the machine if an error occurs.
105 (These mount options override the errors behavior
106 specified in the superblock, which can be
107 configured using tune2fs.)
98 108
99data_err=ignore(*) Just print an error message if an error occurs 109data_err=ignore(*) Just print an error message if an error occurs
100 in a file data buffer in ordered mode. 110 in a file data buffer in ordered mode.
@@ -198,5 +208,5 @@ kernel source: <file:fs/ext3/>
198programs: http://e2fsprogs.sourceforge.net/ 208programs: http://e2fsprogs.sourceforge.net/
199 http://ext2resize.sourceforge.net 209 http://ext2resize.sourceforge.net
200 210
201useful links: http://www-106.ibm.com/developerworks/linux/library/l-fs7/ 211useful links: http://www.ibm.com/developerworks/library/l-fs7.html
202 http://www-106.ibm.com/developerworks/linux/library/l-fs8/ 212 http://www.ibm.com/developerworks/library/l-fs8.html
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index cec829bc7291..97882df04865 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be
85* extent format more robust in face of on-disk corruption due to magics, 85* extent format more robust in face of on-disk corruption due to magics,
86* internal redundancy in tree 86* internal redundancy in tree
87* improved file allocation (multi-block alloc) 87* improved file allocation (multi-block alloc)
88* fix 32000 subdirectory limit 88* lift 32000 subdirectory limit imposed by i_links_count[1]
89* nsec timestamps for mtime, atime, ctime, create time 89* nsec timestamps for mtime, atime, ctime, create time
90* inode version field on disk (NFSv4, Lustre) 90* inode version field on disk (NFSv4, Lustre)
91* reduced e2fsck time via uninit_bg feature 91* reduced e2fsck time via uninit_bg feature
@@ -100,6 +100,9 @@ Note: More extensive information for getting started with ext4 can be
100* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force 100* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
101 the ordering) 101 the ordering)
102 102
103[1] Filesystems with a block size of 1k may see a limit imposed by the
104directory hash tree having a maximum depth of two.
105
1032.2 Candidate features for future inclusion 1062.2 Candidate features for future inclusion
104 107
105* Online defrag (patches available but not well tested) 108* Online defrag (patches available but not well tested)
@@ -180,8 +183,8 @@ commit=nrsec (*) Ext4 can be told to sync all its data and metadata
180 performance. 183 performance.
181 184
182barrier=<0|1(*)> This enables/disables the use of write barriers in 185barrier=<0|1(*)> This enables/disables the use of write barriers in
183 the jbd code. barrier=0 disables, barrier=1 enables. 186barrier(*) the jbd code. barrier=0 disables, barrier=1 enables.
184 This also requires an IO stack which can support 187nobarrier This also requires an IO stack which can support
185 barriers, and if jbd gets an error on a barrier 188 barriers, and if jbd gets an error on a barrier
186 write, it will disable again with a warning. 189 write, it will disable again with a warning.
187 Write barriers enforce proper on-disk ordering 190 Write barriers enforce proper on-disk ordering
@@ -189,6 +192,9 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in
189 safe to use, at some performance penalty. If 192 safe to use, at some performance penalty. If
190 your disks are battery-backed in one way or another, 193 your disks are battery-backed in one way or another,
191 disabling barriers may safely improve performance. 194 disabling barriers may safely improve performance.
195 The mount options "barrier" and "nobarrier" can
196 also be used to enable or disable barriers, for
197 consistency with other ext4 mount options.
192 198
193inode_readahead=n This tuning parameter controls the maximum 199inode_readahead=n This tuning parameter controls the maximum
194 number of inode table blocks that ext4's inode 200 number of inode table blocks that ext4's inode
@@ -310,6 +316,24 @@ journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the
310 a slightly higher priority than the default I/O 316 a slightly higher priority than the default I/O
311 priority. 317 priority.
312 318
319auto_da_alloc(*) Many broken applications don't use fsync() when
320noauto_da_alloc replacing existing files via patterns such as
321 fd = open("foo.new")/write(fd,..)/close(fd)/
322 rename("foo.new", "foo"), or worse yet,
323 fd = open("foo", O_TRUNC)/write(fd,..)/close(fd).
324 If auto_da_alloc is enabled, ext4 will detect
325 the replace-via-rename and replace-via-truncate
326 patterns and force that any delayed allocation
327 blocks are allocated such that at the next
328 journal commit, in the default data=ordered
329 mode, the data blocks of the new file are forced
330 to disk before the rename() operation is
331 commited. This provides roughly the same level
332 of guarantees as ext3, and avoids the
333 "zero-length" problem that can happen when a
334 system crashes before the delayed allocation
335 blocks are forced to disk.
336
313Data Mode 337Data Mode
314========= 338=========
315There are 3 different data modes: 339There are 3 different data modes:
diff --git a/Documentation/filesystems/knfsd-stats.txt b/Documentation/filesystems/knfsd-stats.txt
new file mode 100644
index 000000000000..64ced5149d37
--- /dev/null
+++ b/Documentation/filesystems/knfsd-stats.txt
@@ -0,0 +1,159 @@
1
2Kernel NFS Server Statistics
3============================
4
5This document describes the format and semantics of the statistics
6which the kernel NFS server makes available to userspace. These
7statistics are available in several text form pseudo files, each of
8which is described separately below.
9
10In most cases you don't need to know these formats, as the nfsstat(8)
11program from the nfs-utils distribution provides a helpful command-line
12interface for extracting and printing them.
13
14All the files described here are formatted as a sequence of text lines,
15separated by newline '\n' characters. Lines beginning with a hash
16'#' character are comments intended for humans and should be ignored
17by parsing routines. All other lines contain a sequence of fields
18separated by whitespace.
19
20/proc/fs/nfsd/pool_stats
21------------------------
22
23This file is available in kernels from 2.6.30 onwards, if the
24/proc/fs/nfsd filesystem is mounted (it almost always should be).
25
26The first line is a comment which describes the fields present in
27all the other lines. The other lines present the following data as
28a sequence of unsigned decimal numeric fields. One line is shown
29for each NFS thread pool.
30
31All counters are 64 bits wide and wrap naturally. There is no way
32to zero these counters, instead applications should do their own
33rate conversion.
34
35pool
36 The id number of the NFS thread pool to which this line applies.
37 This number does not change.
38
39 Thread pool ids are a contiguous set of small integers starting
40 at zero. The maximum value depends on the thread pool mode, but
41 currently cannot be larger than the number of CPUs in the system.
42 Note that in the default case there will be a single thread pool
43 which contains all the nfsd threads and all the CPUs in the system,
44 and thus this file will have a single line with a pool id of "0".
45
46packets-arrived
47 Counts how many NFS packets have arrived. More precisely, this
48 is the number of times that the network stack has notified the
49 sunrpc server layer that new data may be available on a transport
50 (e.g. an NFS or UDP socket or an NFS/RDMA endpoint).
51
52 Depending on the NFS workload patterns and various network stack
53 effects (such as Large Receive Offload) which can combine packets
54 on the wire, this may be either more or less than the number
55 of NFS calls received (which statistic is available elsewhere).
56 However this is a more accurate and less workload-dependent measure
57 of how much CPU load is being placed on the sunrpc server layer
58 due to NFS network traffic.
59
60sockets-enqueued
61 Counts how many times an NFS transport is enqueued to wait for
62 an nfsd thread to service it, i.e. no nfsd thread was considered
63 available.
64
65 The circumstance this statistic tracks indicates that there was NFS
66 network-facing work to be done but it couldn't be done immediately,
67 thus introducing a small delay in servicing NFS calls. The ideal
68 rate of change for this counter is zero; significantly non-zero
69 values may indicate a performance limitation.
70
71 This can happen either because there are too few nfsd threads in the
72 thread pool for the NFS workload (the workload is thread-limited),
73 or because the NFS workload needs more CPU time than is available in
74 the thread pool (the workload is CPU-limited). In the former case,
75 configuring more nfsd threads will probably improve the performance
76 of the NFS workload. In the latter case, the sunrpc server layer is
77 already choosing not to wake idle nfsd threads because there are too
78 many nfsd threads which want to run but cannot, so configuring more
79 nfsd threads will make no difference whatsoever. The overloads-avoided
80 statistic (see below) can be used to distinguish these cases.
81
82threads-woken
83 Counts how many times an idle nfsd thread is woken to try to
84 receive some data from an NFS transport.
85
86 This statistic tracks the circumstance where incoming
87 network-facing NFS work is being handled quickly, which is a good
88 thing. The ideal rate of change for this counter will be close
89 to but less than the rate of change of the packets-arrived counter.
90
91overloads-avoided
92 Counts how many times the sunrpc server layer chose not to wake an
93 nfsd thread, despite the presence of idle nfsd threads, because
94 too many nfsd threads had been recently woken but could not get
95 enough CPU time to actually run.
96
97 This statistic counts a circumstance where the sunrpc layer
98 heuristically avoids overloading the CPU scheduler with too many
99 runnable nfsd threads. The ideal rate of change for this counter
100 is zero. Significant non-zero values indicate that the workload
101 is CPU limited. Usually this is associated with heavy CPU usage
102 on all the CPUs in the nfsd thread pool.
103
104 If a sustained large overloads-avoided rate is detected on a pool,
105 the top(1) utility should be used to check for the following
106 pattern of CPU usage on all the CPUs associated with the given
107 nfsd thread pool.
108
109 - %us ~= 0 (as you're *NOT* running applications on your NFS server)
110
111 - %wa ~= 0
112
113 - %id ~= 0
114
115 - %sy + %hi + %si ~= 100
116
117 If this pattern is seen, configuring more nfsd threads will *not*
118 improve the performance of the workload. If this patten is not
119 seen, then something more subtle is wrong.
120
121threads-timedout
122 Counts how many times an nfsd thread triggered an idle timeout,
123 i.e. was not woken to handle any incoming network packets for
124 some time.
125
126 This statistic counts a circumstance where there are more nfsd
127 threads configured than can be used by the NFS workload. This is
128 a clue that the number of nfsd threads can be reduced without
129 affecting performance. Unfortunately, it's only a clue and not
130 a strong indication, for a couple of reasons:
131
132 - Currently the rate at which the counter is incremented is quite
133 slow; the idle timeout is 60 minutes. Unless the NFS workload
134 remains constant for hours at a time, this counter is unlikely
135 to be providing information that is still useful.
136
137 - It is usually a wise policy to provide some slack,
138 i.e. configure a few more nfsds than are currently needed,
139 to allow for future spikes in load.
140
141
142Note that incoming packets on NFS transports will be dealt with in
143one of three ways. An nfsd thread can be woken (threads-woken counts
144this case), or the transport can be enqueued for later attention
145(sockets-enqueued counts this case), or the packet can be temporarily
146deferred because the transport is currently being used by an nfsd
147thread. This last case is not very interesting and is not explicitly
148counted, but can be inferred from the other counters thus:
149
150packets-deferred = packets-arrived - ( sockets-enqueued + threads-woken )
151
152
153More
154----
155Descriptions of the other statistics file should go here.
156
157
158Greg Banks <gnb@sgi.com>
15926 Mar 2009
diff --git a/Documentation/filesystems/nfs41-server.txt b/Documentation/filesystems/nfs41-server.txt
new file mode 100644
index 000000000000..05d81cbcb2e1
--- /dev/null
+++ b/Documentation/filesystems/nfs41-server.txt
@@ -0,0 +1,161 @@
1NFSv4.1 Server Implementation
2
3Server support for minorversion 1 can be controlled using the
4/proc/fs/nfsd/versions control file. The string output returned
5by reading this file will contain either "+4.1" or "-4.1"
6correspondingly.
7
8Currently, server support for minorversion 1 is disabled by default.
9It can be enabled at run time by writing the string "+4.1" to
10the /proc/fs/nfsd/versions control file. Note that to write this
11control file, the nfsd service must be taken down. Use your user-mode
12nfs-utils to set this up; see rpc.nfsd(8)
13
14The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based
15on the latest NFSv4.1 Internet Draft:
16http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29
17
18From the many new features in NFSv4.1 the current implementation
19focuses on the mandatory-to-implement NFSv4.1 Sessions, providing
20"exactly once" semantics and better control and throttling of the
21resources allocated for each client.
22
23Other NFSv4.1 features, Parallel NFS operations in particular,
24are still under development out of tree.
25See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design
26for more information.
27
28The table below, taken from the NFSv4.1 document, lists
29the operations that are mandatory to implement (REQ), optional
30(OPT), and NFSv4.0 operations that are required not to implement (MNI)
31in minor version 1. The first column indicates the operations that
32are not supported yet by the linux server implementation.
33
34The OPTIONAL features identified and their abbreviations are as follows:
35 pNFS Parallel NFS
36 FDELG File Delegations
37 DDELG Directory Delegations
38
39The following abbreviations indicate the linux server implementation status.
40 I Implemented NFSv4.1 operations.
41 NS Not Supported.
42 NS* unimplemented optional feature.
43 P pNFS features implemented out of tree.
44 PNS pNFS features that are not supported yet (out of tree).
45
46Operations
47
48 +----------------------+------------+--------------+----------------+
49 | Operation | REQ, REC, | Feature | Definition |
50 | | OPT, or | (REQ, REC, | |
51 | | MNI | or OPT) | |
52 +----------------------+------------+--------------+----------------+
53 | ACCESS | REQ | | Section 18.1 |
54NS | BACKCHANNEL_CTL | REQ | | Section 18.33 |
55NS | BIND_CONN_TO_SESSION | REQ | | Section 18.34 |
56 | CLOSE | REQ | | Section 18.2 |
57 | COMMIT | REQ | | Section 18.3 |
58 | CREATE | REQ | | Section 18.4 |
59I | CREATE_SESSION | REQ | | Section 18.36 |
60NS*| DELEGPURGE | OPT | FDELG (REQ) | Section 18.5 |
61 | DELEGRETURN | OPT | FDELG, | Section 18.6 |
62 | | | DDELG, pNFS | |
63 | | | (REQ) | |
64NS | DESTROY_CLIENTID | REQ | | Section 18.50 |
65I | DESTROY_SESSION | REQ | | Section 18.37 |
66I | EXCHANGE_ID | REQ | | Section 18.35 |
67NS | FREE_STATEID | REQ | | Section 18.38 |
68 | GETATTR | REQ | | Section 18.7 |
69P | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 |
70P | GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 |
71 | GETFH | REQ | | Section 18.8 |
72NS*| GET_DIR_DELEGATION | OPT | DDELG (REQ) | Section 18.39 |
73P | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 |
74P | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 |
75P | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 |
76 | LINK | OPT | | Section 18.9 |
77 | LOCK | REQ | | Section 18.10 |
78 | LOCKT | REQ | | Section 18.11 |
79 | LOCKU | REQ | | Section 18.12 |
80 | LOOKUP | REQ | | Section 18.13 |
81 | LOOKUPP | REQ | | Section 18.14 |
82 | NVERIFY | REQ | | Section 18.15 |
83 | OPEN | REQ | | Section 18.16 |
84NS*| OPENATTR | OPT | | Section 18.17 |
85 | OPEN_CONFIRM | MNI | | N/A |
86 | OPEN_DOWNGRADE | REQ | | Section 18.18 |
87 | PUTFH | REQ | | Section 18.19 |
88 | PUTPUBFH | REQ | | Section 18.20 |
89 | PUTROOTFH | REQ | | Section 18.21 |
90 | READ | REQ | | Section 18.22 |
91 | READDIR | REQ | | Section 18.23 |
92 | READLINK | OPT | | Section 18.24 |
93NS | RECLAIM_COMPLETE | REQ | | Section 18.51 |
94 | RELEASE_LOCKOWNER | MNI | | N/A |
95 | REMOVE | REQ | | Section 18.25 |
96 | RENAME | REQ | | Section 18.26 |
97 | RENEW | MNI | | N/A |
98 | RESTOREFH | REQ | | Section 18.27 |
99 | SAVEFH | REQ | | Section 18.28 |
100 | SECINFO | REQ | | Section 18.29 |
101NS | SECINFO_NO_NAME | REC | pNFS files | Section 18.45, |
102 | | | layout (REQ) | Section 13.12 |
103I | SEQUENCE | REQ | | Section 18.46 |
104 | SETATTR | REQ | | Section 18.30 |
105 | SETCLIENTID | MNI | | N/A |
106 | SETCLIENTID_CONFIRM | MNI | | N/A |
107NS | SET_SSV | REQ | | Section 18.47 |
108NS | TEST_STATEID | REQ | | Section 18.48 |
109 | VERIFY | REQ | | Section 18.31 |
110NS*| WANT_DELEGATION | OPT | FDELG (OPT) | Section 18.49 |
111 | WRITE | REQ | | Section 18.32 |
112
113Callback Operations
114
115 +-------------------------+-----------+-------------+---------------+
116 | Operation | REQ, REC, | Feature | Definition |
117 | | OPT, or | (REQ, REC, | |
118 | | MNI | or OPT) | |
119 +-------------------------+-----------+-------------+---------------+
120 | CB_GETATTR | OPT | FDELG (REQ) | Section 20.1 |
121P | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 |
122NS*| CB_NOTIFY | OPT | DDELG (REQ) | Section 20.4 |
123P | CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 |
124NS*| CB_NOTIFY_LOCK | OPT | | Section 20.11 |
125NS*| CB_PUSH_DELEG | OPT | FDELG (OPT) | Section 20.5 |
126 | CB_RECALL | OPT | FDELG, | Section 20.2 |
127 | | | DDELG, pNFS | |
128 | | | (REQ) | |
129NS*| CB_RECALL_ANY | OPT | FDELG, | Section 20.6 |
130 | | | DDELG, pNFS | |
131 | | | (REQ) | |
132NS | CB_RECALL_SLOT | REQ | | Section 20.8 |
133NS*| CB_RECALLABLE_OBJ_AVAIL | OPT | DDELG, pNFS | Section 20.7 |
134 | | | (REQ) | |
135I | CB_SEQUENCE | OPT | FDELG, | Section 20.9 |
136 | | | DDELG, pNFS | |
137 | | | (REQ) | |
138NS*| CB_WANTS_CANCELLED | OPT | FDELG, | Section 20.10 |
139 | | | DDELG, pNFS | |
140 | | | (REQ) | |
141 +-------------------------+-----------+-------------+---------------+
142
143Implementation notes:
144
145EXCHANGE_ID:
146* only SP4_NONE state protection supported
147* implementation ids are ignored
148
149CREATE_SESSION:
150* backchannel attributes are ignored
151* backchannel security parameters are ignored
152
153SEQUENCE:
154* no support for dynamic slot table renegotiation (optional)
155
156nfsv4.1 COMPOUND rules:
157The following cases aren't supported yet:
158* Enforcing of NFS4ERR_NOT_ONLY_OP for: BIND_CONN_TO_SESSION, CREATE_SESSION,
159 DESTROY_CLIENTID, DESTROY_SESSION, EXCHANGE_ID.
160* DESTROY_SESSION MUST be the final operation in the COMPOUND request.
161
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
new file mode 100644
index 000000000000..55c4300abfcb
--- /dev/null
+++ b/Documentation/filesystems/nilfs2.txt
@@ -0,0 +1,200 @@
1NILFS2
2------
3
4NILFS2 is a log-structured file system (LFS) supporting continuous
5snapshotting. In addition to versioning capability of the entire file
6system, users can even restore files mistakenly overwritten or
7destroyed just a few seconds ago. Since NILFS2 can keep consistency
8like conventional LFS, it achieves quick recovery after system
9crashes.
10
11NILFS2 creates a number of checkpoints every few seconds or per
12synchronous write basis (unless there is no change). Users can select
13significant versions among continuously created checkpoints, and can
14change them into snapshots which will be preserved until they are
15changed back to checkpoints.
16
17There is no limit on the number of snapshots until the volume gets
18full. Each snapshot is mountable as a read-only file system
19concurrently with its writable mount, and this feature is convenient
20for online backup.
21
22The userland tools are included in nilfs-utils package, which is
23available from the following download page. At least "mkfs.nilfs2",
24"mount.nilfs2", "umount.nilfs2", and "nilfs_cleanerd" (so called
25cleaner or garbage collector) are required. Details on the tools are
26described in the man pages included in the package.
27
28Project web page: http://www.nilfs.org/en/
29Download page: http://www.nilfs.org/en/download.html
30Git tree web page: http://www.nilfs.org/git/
31NILFS mailing lists: http://www.nilfs.org/mailman/listinfo/users
32
33Caveats
34=======
35
36Features which NILFS2 does not support yet:
37
38 - atime
39 - extended attributes
40 - POSIX ACLs
41 - quotas
42 - writable snapshots
43 - remote backup (CDP)
44 - data integrity
45 - defragmentation
46
47Mount options
48=============
49
50NILFS2 supports the following mount options:
51(*) == default
52
53barrier=on(*) This enables/disables barriers. barrier=off disables
54 it, barrier=on enables it.
55errors=continue(*) Keep going on a filesystem error.
56errors=remount-ro Remount the filesystem read-only on an error.
57errors=panic Panic and halt the machine if an error occurs.
58cp=n Specify the checkpoint-number of the snapshot to be
59 mounted. Checkpoints and snapshots are listed by lscp
60 user command. Only the checkpoints marked as snapshot
61 are mountable with this option. Snapshot is read-only,
62 so a read-only mount option must be specified together.
63order=relaxed(*) Apply relaxed order semantics that allows modified data
64 blocks to be written to disk without making a
65 checkpoint if no metadata update is going. This mode
66 is equivalent to the ordered data mode of the ext3
67 filesystem except for the updates on data blocks still
68 conserve atomicity. This will improve synchronous
69 write performance for overwriting.
70order=strict Apply strict in-order semantics that preserves sequence
71 of all file operations including overwriting of data
72 blocks. That means, it is guaranteed that no
73 overtaking of events occurs in the recovered file
74 system after a crash.
75
76NILFS2 usage
77============
78
79To use nilfs2 as a local file system, simply:
80
81 # mkfs -t nilfs2 /dev/block_device
82 # mount -t nilfs2 /dev/block_device /dir
83
84This will also invoke the cleaner through the mount helper program
85(mount.nilfs2).
86
87Checkpoints and snapshots are managed by the following commands.
88Their manpages are included in the nilfs-utils package above.
89
90 lscp list checkpoints or snapshots.
91 mkcp make a checkpoint or a snapshot.
92 chcp change an existing checkpoint to a snapshot or vice versa.
93 rmcp invalidate specified checkpoint(s).
94
95To mount a snapshot,
96
97 # mount -t nilfs2 -r -o cp=<cno> /dev/block_device /snap_dir
98
99where <cno> is the checkpoint number of the snapshot.
100
101To unmount the NILFS2 mount point or snapshot, simply:
102
103 # umount /dir
104
105Then, the cleaner daemon is automatically shut down by the umount
106helper program (umount.nilfs2).
107
108Disk format
109===========
110
111A nilfs2 volume is equally divided into a number of segments except
112for the super block (SB) and segment #0. A segment is the container
113of logs. Each log is composed of summary information blocks, payload
114blocks, and an optional super root block (SR):
115
116 ______________________________________________________
117 | |SB| | Segment | Segment | Segment | ... | Segment | |
118 |_|__|_|____0____|____1____|____2____|_____|____N____|_|
119 0 +1K +4K +8M +16M +24M +(8MB x N)
120 . . (Typical offsets for 4KB-block)
121 . .
122 .______________________.
123 | log | log |... | log |
124 |__1__|__2__|____|__m__|
125 . .
126 . .
127 . .
128 .______________________________.
129 | Summary | Payload blocks |SR|
130 |_blocks__|_________________|__|
131
132The payload blocks are organized per file, and each file consists of
133data blocks and B-tree node blocks:
134
135 |<--- File-A --->|<--- File-B --->|
136 _______________________________________________________________
137 | Data blocks | B-tree blocks | Data blocks | B-tree blocks | ...
138 _|_____________|_______________|_____________|_______________|_
139
140
141Since only the modified blocks are written in the log, it may have
142files without data blocks or B-tree node blocks.
143
144The organization of the blocks is recorded in the summary information
145blocks, which contains a header structure (nilfs_segment_summary), per
146file structures (nilfs_finfo), and per block structures (nilfs_binfo):
147
148 _________________________________________________________________________
149 | Summary | finfo | binfo | ... | binfo | finfo | binfo | ... | binfo |...
150 |_blocks__|___A___|_(A,1)_|_____|(A,Na)_|___B___|_(B,1)_|_____|(B,Nb)_|___
151
152
153The logs include regular files, directory files, symbolic link files
154and several meta data files. The mata data files are the files used
155to maintain file system meta data. The current version of NILFS2 uses
156the following meta data files:
157
158 1) Inode file (ifile) -- Stores on-disk inodes
159 2) Checkpoint file (cpfile) -- Stores checkpoints
160 3) Segment usage file (sufile) -- Stores allocation state of segments
161 4) Data address translation file -- Maps virtual block numbers to usual
162 (DAT) block numbers. This file serves to
163 make on-disk blocks relocatable.
164
165The following figure shows a typical organization of the logs:
166
167 _________________________________________________________________________
168 | Summary | regular file | file | ... | ifile | cpfile | sufile | DAT |SR|
169 |_blocks__|_or_directory_|_______|_____|_______|________|________|_____|__|
170
171
172To stride over segment boundaries, this sequence of files may be split
173into multiple logs. The sequence of logs that should be treated as
174logically one log, is delimited with flags marked in the segment
175summary. The recovery code of nilfs2 looks this boundary information
176to ensure atomicity of updates.
177
178The super root block is inserted for every checkpoints. It includes
179three special inodes, inodes for the DAT, cpfile, and sufile. Inodes
180of regular files, directories, symlinks and other special files, are
181included in the ifile. The inode of ifile itself is included in the
182corresponding checkpoint entry in the cpfile. Thus, the hierarchy
183among NILFS2 files can be depicted as follows:
184
185 Super block (SB)
186 |
187 v
188 Super root block (the latest cno=xx)
189 |-- DAT
190 |-- sufile
191 `-- cpfile
192 |-- ifile (cno=c1)
193 |-- ifile (cno=c2) ---- file (ino=i1)
194 : : |-- file (ino=i2)
195 `-- ifile (cno=xx) |-- file (ino=i3)
196 : :
197 `-- file (ino=yy)
198 ( regular file, directory, or symlink )
199
200For detail on the format of each file, please see include/linux/nilfs2_fs.h.
diff --git a/Documentation/filesystems/pohmelfs/design_notes.txt b/Documentation/filesystems/pohmelfs/design_notes.txt
new file mode 100644
index 000000000000..dcf833587162
--- /dev/null
+++ b/Documentation/filesystems/pohmelfs/design_notes.txt
@@ -0,0 +1,71 @@
1POHMELFS: Parallel Optimized Host Message Exchange Layered File System.
2
3 Evgeniy Polyakov <zbr@ioremap.net>
4
5Homepage: http://www.ioremap.net/projects/pohmelfs
6
7POHMELFS first began as a network filesystem with coherent local data and
8metadata caches but is now evolving into a parallel distributed filesystem.
9
10Main features of this FS include:
11 * Locally coherent cache for data and metadata with (potentially) byte-range locks.
12 Since all Linux filesystems lock the whole inode during writing, algorithm
13 is very simple and does not use byte-ranges, although they are sent in
14 locking messages.
15 * Completely async processing of all events except creation of hard and symbolic
16 links, and rename events.
17 Object creation and data reading and writing are processed asynchronously.
18 * Flexible object architecture optimized for network processing.
19 Ability to create long paths to objects and remove arbitrarily huge
20 directories with a single network command.
21 (like removing the whole kernel tree via a single network command).
22 * Very high performance.
23 * Fast and scalable multithreaded userspace server. Being in userspace it works
24 with any underlying filesystem and still is much faster than async in-kernel NFS one.
25 * Client is able to switch between different servers (if one goes down, client
26 automatically reconnects to second and so on).
27 * Transactions support. Full failover for all operations.
28 Resending transactions to different servers on timeout or error.
29 * Read request (data read, directory listing, lookup requests) balancing between multiple servers.
30 * Write requests are replicated to multiple servers and completed only when all of them are acked.
31 * Ability to add and/or remove servers from the working set at run-time.
32 * Strong authentification and possible data encryption in network channel.
33 * Extended attributes support.
34
35POHMELFS is based on transactions, which are potentially long-standing objects that live
36in the client's memory. Each transaction contains all the information needed to process a given
37command (or set of commands, which is frequently used during data writing: single transactions
38can contain creation and data writing commands). Transactions are committed by all the servers
39to which they are sent and, in case of failures, are eventually resent or dropped with an error.
40For example, reading will return an error if no servers are available.
41
42POHMELFS uses a asynchronous approach to data processing. Courtesy of transactions, it is
43possible to detach replies from requests and, if the command requires data to be received, the
44caller sleeps waiting for it. Thus, it is possible to issue multiple read commands to different
45servers and async threads will pick up replies in parallel, find appropriate transactions in the
46system and put the data where it belongs (like the page or inode cache).
47
48The main feature of POHMELFS is writeback data and the metadata cache.
49Only a few non-performance critical operations use the write-through cache and
50are synchronous: hard and symbolic link creation, and object rename. Creation,
51removal of objects and data writing are asynchronous and are sent to
52the server during system writeback. Only one writer at a time is allowed for any
53given inode, which is guarded by an appropriate locking protocol.
54Because of this feature, POHMELFS is extremely fast at metadata intensive
55workloads and can fully utilize the bandwidth to the servers when doing bulk
56data transfers.
57
58POHMELFS clients operate with a working set of servers and are capable of balancing read-only
59operations (like lookups or directory listings) between them according to IO priorities.
60Administrators can add or remove servers from the set at run-time via special commands (described
61in Documentation/pohmelfs/info.txt file). Writes are replicated to all servers, which are connected
62with write permission turned on. IO priority and permissions can be changed in run-time.
63
64POHMELFS is capable of full data channel encryption and/or strong crypto hashing.
65One can select any kernel supported cipher, encryption mode, hash type and operation mode
66(hmac or digest). It is also possible to use both or neither (default). Crypto configuration
67is checked during mount time and, if the server does not support it, appropriate capabilities
68will be disabled or mount will fail (if 'crypto_fail_unsupported' mount option is specified).
69Crypto performance heavily depends on the number of crypto threads, which asynchronously perform
70crypto operations and send the resulting data to server or submit it up the stack. This number
71can be controlled via a mount option.
diff --git a/Documentation/filesystems/pohmelfs/info.txt b/Documentation/filesystems/pohmelfs/info.txt
new file mode 100644
index 000000000000..db2e41393626
--- /dev/null
+++ b/Documentation/filesystems/pohmelfs/info.txt
@@ -0,0 +1,99 @@
1POHMELFS usage information.
2
3Mount options.
4All but index, number of crypto threads and maximum IO size can changed via remount.
5
6idx=%u
7 Each mountpoint is associated with a special index via this option.
8 Administrator can add or remove servers from the given index, so all mounts,
9 which were attached to it, are updated.
10 Default it is 0.
11
12trans_scan_timeout=%u
13 This timeout, expressed in milliseconds, specifies time to scan transaction
14 trees looking for stale requests, which have to be resent, or if number of
15 retries exceed specified limit, dropped with error.
16 Default is 5 seconds.
17
18drop_scan_timeout=%u
19 Internal timeout, expressed in milliseconds, which specifies how frequently
20 inodes marked to be dropped are freed. It also specifies how frequently
21 the system checks that servers have to be added or removed from current working set.
22 Default is 1 second.
23
24wait_on_page_timeout=%u
25 Number of milliseconds to wait for reply from remote server for data reading command.
26 If this timeout is exceeded, reading returns an error.
27 Default is 5 seconds.
28
29trans_retries=%u
30 This is the number of times that a transaction will be resent to a server that did
31 not answer for the last @trans_scan_timeout milliseconds.
32 When the number of resends exceeds this limit, the transaction is completed with error.
33 Default is 5 resends.
34
35crypto_thread_num=%u
36 Number of crypto processing threads. Threads are used both for RX and TX traffic.
37 Default is 2, or no threads if crypto operations are not supported.
38
39trans_max_pages=%u
40 Maximum number of pages in a single transaction. This parameter also controls
41 the number of pages, allocated for crypto processing (each crypto thread has
42 pool of pages, the number of which is equal to 'trans_max_pages'.
43 Default is 100 pages.
44
45crypto_fail_unsupported
46 If specified, mount will fail if the server does not support requested crypto operations.
47 By default mount will disable non-matching crypto operations.
48
49mcache_timeout=%u
50 Maximum number of milliseconds to wait for the mcache objects to be processed.
51 Mcache includes locks (given lock should be granted by server), attributes (they should be
52 fully received in the given timeframe).
53 Default is 5 seconds.
54
55Usage examples.
56
57Add server server1.net:1025 into the working set with index $idx
58with appropriate hash algorithm and key file and cipher algorithm, mode and key file:
59$cfg A add -a server1.net -p 1025 -i $idx -K $hash_key -k $cipher_key
60
61Mount filesystem with given index $idx to /mnt mountpoint.
62Client will connect to all servers specified in the working set via previous command:
63mount -t pohmel -o idx=$idx q /mnt
64
65Change permissions to read-only (-I 1 option, '-I 2' - write-only, 3 - rw):
66$cfg A modify -a server1.net -p 1025 -i $idx -I 1
67
68Change IO priority to 123 (node with the highest priority gets read requests).
69$cfg A modify -a server1.net -p 1025 -i $idx -P 123
70
71One can check currect status of all connections in the mountstats file:
72# cat /proc/$PID/mountstats
73...
74device none mounted on /mnt with fstype pohmel
75idx addr(:port) socket_type protocol active priority permissions
760 server1.net:1026 1 6 1 250 1
770 server2.net:1025 1 6 1 123 3
78
79Server installation.
80
81Creating a server, which listens at port 1025 and 0.0.0.0 address.
82Working root directory (note, that server chroots there, so you have to have appropriate permissions)
83is set to /mnt, server will negotiate hash/cipher with client, in case client requested it, there
84are appropriate key files.
85Number of working threads is set to 10.
86
87# ./fserver -a 0.0.0.0 -p 1025 -r /mnt -w 10 -K hash_key -k cipher_key
88
89 -A 6 - listen on ipv6 address. Default: Disabled.
90 -r root - path to root directory. Default: /tmp.
91 -a addr - listen address. Default: 0.0.0.0.
92 -p port - listen port. Default: 1025.
93 -w workers - number of workers per connected client. Default: 1.
94 -K file - hash key size. Default: none.
95 -k file - cipher key size. Default: none.
96 -h - this help.
97
98Number of worker threads specifies how many workers will be created for each client.
99Bulk single-client transafers usually are better handled with smaller number (like 1-3).
diff --git a/Documentation/filesystems/pohmelfs/network_protocol.txt b/Documentation/filesystems/pohmelfs/network_protocol.txt
new file mode 100644
index 000000000000..40ea6c295afb
--- /dev/null
+++ b/Documentation/filesystems/pohmelfs/network_protocol.txt
@@ -0,0 +1,227 @@
1POHMELFS network protocol.
2
3Basic structure used in network communication is following command:
4
5struct netfs_cmd
6{
7 __u16 cmd; /* Command number */
8 __u16 csize; /* Attached crypto information size */
9 __u16 cpad; /* Attached padding size */
10 __u16 ext; /* External flags */
11 __u32 size; /* Size of the attached data */
12 __u32 trans; /* Transaction id */
13 __u64 id; /* Object ID to operate on. Used for feedback.*/
14 __u64 start; /* Start of the object. */
15 __u64 iv; /* IV sequence */
16 __u8 data[0];
17};
18
19Commands can be embedded into transaction command (which in turn has own command),
20so one can extend protocol as needed without breaking backward compatibility as long
21as old commands are supported. All string lengths include tail 0 byte.
22
23All commans are transfered over the network in big-endian. CPU endianess is used at the end peers.
24
25@cmd - command number, which specifies command to be processed. Following
26 commands are used currently:
27
28 NETFS_READDIR = 1, /* Read directory for given inode number */
29 NETFS_READ_PAGE, /* Read data page from the server */
30 NETFS_WRITE_PAGE, /* Write data page to the server */
31 NETFS_CREATE, /* Create directory entry */
32 NETFS_REMOVE, /* Remove directory entry */
33 NETFS_LOOKUP, /* Lookup single object */
34 NETFS_LINK, /* Create a link */
35 NETFS_TRANS, /* Transaction */
36 NETFS_OPEN, /* Open intent */
37 NETFS_INODE_INFO, /* Metadata cache coherency synchronization message */
38 NETFS_PAGE_CACHE, /* Page cache invalidation message */
39 NETFS_READ_PAGES, /* Read multiple contiguous pages in one go */
40 NETFS_RENAME, /* Rename object */
41 NETFS_CAPABILITIES, /* Capabilities of the client, for example supported crypto */
42 NETFS_LOCK, /* Distributed lock message */
43 NETFS_XATTR_SET, /* Set extended attribute */
44 NETFS_XATTR_GET, /* Get extended attribute */
45
46@ext - external flags. Used by different commands to specify some extra arguments
47 like partial size of the embedded objects or creation flags.
48
49@size - size of the attached data. For NETFS_READ_PAGE and NETFS_READ_PAGES no data is attached,
50 but size of the requested data is incorporated here. It does not include size of the command
51 header (struct netfs_cmd) itself.
52
53@id - id of the object this command operates on. Each command can use it for own purpose.
54
55@start - start of the object this command operates on. Each command can use it for own purpose.
56
57@csize, @cpad - size and padding size of the (attached if needed) crypto information.
58
59Command specifications.
60
61@NETFS_READDIR
62This command is used to sync content of the remote dir to the client.
63
64@ext - length of the path to object.
65@size - the same.
66@id - local inode number of the directory to read.
67@start - zero.
68
69
70@NETFS_READ_PAGE
71This command is used to read data from remote server.
72Data size does not exceed local page cache size.
73
74@id - inode number.
75@start - first byte offset.
76@size - number of bytes to read plus length of the path to object.
77@ext - object path length.
78
79
80@NETFS_CREATE
81Used to create object.
82It does not require that all directories on top of the object were
83already created, it will create them automatically. Each object has
84associated @netfs_path_entry data structure, which contains creation
85mode (permissions and type) and length of the name as long as name itself.
86
87@start - 0
88@size - size of the all data structures needed to create a path
89@id - local inode number
90@ext - 0
91
92
93@NETFS_REMOVE
94Used to remove object.
95
96@ext - length of the path to object.
97@size - the same.
98@id - local inode number.
99@start - zero.
100
101
102@NETFS_LOOKUP
103Lookup information about object on server.
104
105@ext - length of the path to object.
106@size - the same.
107@id - local inode number of the directory to look object in.
108@start - local inode number of the object to look at.
109
110
111@NETFS_LINK
112Create hard of symlink.
113Command is sent as "object_path|target_path".
114
115@size - size of the above string.
116@id - parent local inode number.
117@start - 1 for symlink, 0 for hardlink.
118@ext - size of the "object_path" above.
119
120
121@NETFS_TRANS
122Transaction header.
123
124@size - incorporates all embedded command sizes including theirs header sizes.
125@start - transaction generation number - unique id used to find transaction.
126@ext - transaction flags. Unused at the moment.
127@id - 0.
128
129
130@NETFS_OPEN
131Open intent for given transaction.
132
133@id - local inode number.
134@start - 0.
135@size - path length to the object.
136@ext - open flags (O_RDWR and so on).
137
138
139@NETFS_INODE_INFO
140Metadata update command.
141It is sent to servers when attributes of the object are changed and received
142when data or metadata were updated. It operates with the following structure:
143
144struct netfs_inode_info
145{
146 unsigned int mode;
147 unsigned int nlink;
148 unsigned int uid;
149 unsigned int gid;
150 unsigned int blocksize;
151 unsigned int padding;
152 __u64 ino;
153 __u64 blocks;
154 __u64 rdev;
155 __u64 size;
156 __u64 version;
157};
158
159It effectively mirrors stat(2) returned data.
160
161
162@ext - path length to the object.
163@size - the same plus size of the netfs_inode_info structure.
164@id - local inode number.
165@start - 0.
166
167
168@NETFS_PAGE_CACHE
169Command is only received by clients. It contains information about
170page to be marked as not up-to-date.
171
172@id - client's inode number.
173@start - last byte of the page to be invalidated. If it is not equal to
174 current inode size, it will be vmtruncated().
175@size - 0
176@ext - 0
177
178
179@NETFS_READ_PAGES
180Used to read multiple contiguous pages in one go.
181
182@start - first byte of the contiguous region to read.
183@size - contains of two fields: lower 8 bits are used to represent page cache shift
184 used by client, another 3 bytes are used to get number of pages.
185@id - local inode number.
186@ext - path length to the object.
187
188
189@NETFS_RENAME
190Used to rename object.
191Attached data is formed into following string: "old_path|new_path".
192
193@id - local inode number.
194@start - parent inode number.
195@size - length of the above string.
196@ext - length of the old path part.
197
198
199@NETFS_CAPABILITIES
200Used to exchange crypto capabilities with server.
201If crypto capabilities are not supported by server, then client will disable it
202or fail (if 'crypto_fail_unsupported' mount options was specified).
203
204@id - superblock index. Used to specify crypto information for group of servers.
205@size - size of the attached capabilities structure.
206@start - 0.
207@size - 0.
208@scsize - 0.
209
210@NETFS_LOCK
211Used to send lock request/release messages. Although it sends byte range request
212and is capable of flushing pages based on that, it is not used, since all Linux
213filesystems lock the whole inode.
214
215@id - lock generation number.
216@start - start of the locked range.
217@size - size of the locked range.
218@ext - lock type: read/write. Not used actually. 15'th bit is used to determine,
219 if it is lock request (1) or release (0).
220
221@NETFS_XATTR_SET
222@NETFS_XATTR_GET
223Used to set/get extended attributes for given inode.
224@id - attribute generation number or xattr setting type
225@start - size of the attribute (request or attached)
226@size - name length, path len and data size for given attribute
227@ext - path length for given object
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index a87be42f8211..ce84cfc9eae0 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -5,6 +5,7 @@
5 Bodo Bauer <bb@ricochet.net> 5 Bodo Bauer <bb@ricochet.net>
6 6
72.4.x update Jorge Nerin <comandante@zaralinux.com> November 14 2000 72.4.x update Jorge Nerin <comandante@zaralinux.com> November 14 2000
8move /proc/sys Shen Feng <shen@cn.fujitsu.com> April 1 2009
8------------------------------------------------------------------------------ 9------------------------------------------------------------------------------
9Version 1.3 Kernel version 2.2.12 10Version 1.3 Kernel version 2.2.12
10 Kernel version 2.4.0-test11-pre4 11 Kernel version 2.4.0-test11-pre4
@@ -26,25 +27,17 @@ Table of Contents
26 1.6 Parallel port info in /proc/parport 27 1.6 Parallel port info in /proc/parport
27 1.7 TTY info in /proc/tty 28 1.7 TTY info in /proc/tty
28 1.8 Miscellaneous kernel statistics in /proc/stat 29 1.8 Miscellaneous kernel statistics in /proc/stat
30 1.9 Ext4 file system parameters
29 31
30 2 Modifying System Parameters 32 2 Modifying System Parameters
31 2.1 /proc/sys/fs - File system data 33
32 2.2 /proc/sys/fs/binfmt_misc - Miscellaneous binary formats 34 3 Per-Process Parameters
33 2.3 /proc/sys/kernel - general kernel parameters 35 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score
34 2.4 /proc/sys/vm - The virtual memory subsystem 36 3.2 /proc/<pid>/oom_score - Display current oom-killer score
35 2.5 /proc/sys/dev - Device specific parameters 37 3.3 /proc/<pid>/io - Display the IO accounting fields
36 2.6 /proc/sys/sunrpc - Remote procedure calls 38 3.4 /proc/<pid>/coredump_filter - Core dump filtering settings
37 2.7 /proc/sys/net - Networking stuff 39 3.5 /proc/<pid>/mountinfo - Information about mounts
38 2.8 /proc/sys/net/ipv4 - IPV4 settings 40
39 2.9 Appletalk
40 2.10 IPX
41 2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem
42 2.12 /proc/<pid>/oom_adj - Adjust the oom-killer score
43 2.13 /proc/<pid>/oom_score - Display current oom-killer score
44 2.14 /proc/<pid>/io - Display the IO accounting fields
45 2.15 /proc/<pid>/coredump_filter - Core dump filtering settings
46 2.16 /proc/<pid>/mountinfo - Information about mounts
47 2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface
48 41
49------------------------------------------------------------------------------ 42------------------------------------------------------------------------------
50Preface 43Preface
@@ -940,27 +933,6 @@ Table 1-10: Files in /proc/fs/ext4/<devname>
940 File Content 933 File Content
941 mb_groups details of multiblock allocator buddy cache of free blocks 934 mb_groups details of multiblock allocator buddy cache of free blocks
942 mb_history multiblock allocation history 935 mb_history multiblock allocation history
943 stats controls whether the multiblock allocator should start
944 collecting statistics, which are shown during the unmount
945 group_prealloc the multiblock allocator will round up allocation
946 requests to a multiple of this tuning parameter if the
947 stripe size is not set in the ext4 superblock
948 max_to_scan The maximum number of extents the multiblock allocator
949 will search to find the best extent
950 min_to_scan The minimum number of extents the multiblock allocator
951 will search to find the best extent
952 order2_req Tuning parameter which controls the minimum size for
953 requests (as a power of 2) where the buddy cache is
954 used
955 stream_req Files which have fewer blocks than this tunable
956 parameter will have their blocks allocated out of a
957 block group specific preallocation pool, so that small
958 files are packed closely together. Each large file
959 will have its blocks allocated out of its own unique
960 preallocation pool.
961inode_readahead Tuning parameter which controls the maximum number of
962 inode table blocks that ext4's inode table readahead
963 algorithm will pre-read into the buffer cache
964.............................................................................. 936..............................................................................
965 937
966 938
@@ -1011,1014 +983,24 @@ review the kernel documentation in the directory /usr/src/linux/Documentation.
1011This chapter is heavily based on the documentation included in the pre 2.2 983This chapter is heavily based on the documentation included in the pre 2.2
1012kernels, and became part of it in version 2.2.1 of the Linux kernel. 984kernels, and became part of it in version 2.2.1 of the Linux kernel.
1013 985
10142.1 /proc/sys/fs - File system data 986Please see: Documentation/sysctls/ directory for descriptions of these
1015-----------------------------------
1016
1017This subdirectory contains specific file system, file handle, inode, dentry
1018and quota information.
1019
1020Currently, these files are in /proc/sys/fs:
1021
1022dentry-state
1023------------
1024
1025Status of the directory cache. Since directory entries are dynamically
1026allocated and deallocated, this file indicates the current status. It holds
1027six values, in which the last two are not used and are always zero. The others
1028are listed in table 2-1.
1029
1030
1031Table 2-1: Status files of the directory cache
1032..............................................................................
1033 File Content
1034 nr_dentry Almost always zero
1035 nr_unused Number of unused cache entries
1036 age_limit
1037 in seconds after the entry may be reclaimed, when memory is short
1038 want_pages internally
1039..............................................................................
1040
1041dquot-nr and dquot-max
1042----------------------
1043
1044The file dquot-max shows the maximum number of cached disk quota entries.
1045
1046The file dquot-nr shows the number of allocated disk quota entries and the
1047number of free disk quota entries.
1048
1049If the number of available cached disk quotas is very low and you have a large
1050number of simultaneous system users, you might want to raise the limit.
1051
1052file-nr and file-max
1053--------------------
1054
1055The kernel allocates file handles dynamically, but doesn't free them again at
1056this time.
1057
1058The value in file-max denotes the maximum number of file handles that the
1059Linux kernel will allocate. When you get a lot of error messages about running
1060out of file handles, you might want to raise this limit. The default value is
106110% of RAM in kilobytes. To change it, just write the new number into the
1062file:
1063
1064 # cat /proc/sys/fs/file-max
1065 4096
1066 # echo 8192 > /proc/sys/fs/file-max
1067 # cat /proc/sys/fs/file-max
1068 8192
1069
1070
1071This method of revision is useful for all customizable parameters of the
1072kernel - simply echo the new value to the corresponding file.
1073
1074Historically, the three values in file-nr denoted the number of allocated file
1075handles, the number of allocated but unused file handles, and the maximum
1076number of file handles. Linux 2.6 always reports 0 as the number of free file
1077handles -- this is not an error, it just means that the number of allocated
1078file handles exactly matches the number of used file handles.
1079
1080Attempts to allocate more file descriptors than file-max are reported with
1081printk, look for "VFS: file-max limit <number> reached".
1082
1083inode-state and inode-nr
1084------------------------
1085
1086The file inode-nr contains the first two items from inode-state, so we'll skip
1087to that file...
1088
1089inode-state contains two actual numbers and five dummy values. The numbers
1090are nr_inodes and nr_free_inodes (in order of appearance).
1091
1092nr_inodes
1093~~~~~~~~~
1094
1095Denotes the number of inodes the system has allocated. This number will
1096grow and shrink dynamically.
1097
1098nr_open
1099-------
1100
1101Denotes the maximum number of file-handles a process can
1102allocate. Default value is 1024*1024 (1048576) which should be
1103enough for most machines. Actual limit depends on RLIMIT_NOFILE
1104resource limit.
1105
1106nr_free_inodes
1107--------------
1108
1109Represents the number of free inodes. Ie. The number of inuse inodes is
1110(nr_inodes - nr_free_inodes).
1111
1112aio-nr and aio-max-nr
1113---------------------
1114
1115aio-nr is the running total of the number of events specified on the
1116io_setup system call for all currently active aio contexts. If aio-nr
1117reaches aio-max-nr then io_setup will fail with EAGAIN. Note that
1118raising aio-max-nr does not result in the pre-allocation or re-sizing
1119of any kernel data structures.
1120
11212.2 /proc/sys/fs/binfmt_misc - Miscellaneous binary formats
1122-----------------------------------------------------------
1123
1124Besides these files, there is the subdirectory /proc/sys/fs/binfmt_misc. This
1125handles the kernel support for miscellaneous binary formats.
1126
1127Binfmt_misc provides the ability to register additional binary formats to the
1128Kernel without compiling an additional module/kernel. Therefore, binfmt_misc
1129needs to know magic numbers at the beginning or the filename extension of the
1130binary.
1131
1132It works by maintaining a linked list of structs that contain a description of
1133a binary format, including a magic with size (or the filename extension),
1134offset and mask, and the interpreter name. On request it invokes the given
1135interpreter with the original program as argument, as binfmt_java and
1136binfmt_em86 and binfmt_mz do. Since binfmt_misc does not define any default
1137binary-formats, you have to register an additional binary-format.
1138
1139There are two general files in binfmt_misc and one file per registered format.
1140The two general files are register and status.
1141
1142Registering a new binary format
1143-------------------------------
1144
1145To register a new binary format you have to issue the command
1146
1147 echo :name:type:offset:magic:mask:interpreter: > /proc/sys/fs/binfmt_misc/register
1148
1149
1150
1151with appropriate name (the name for the /proc-dir entry), offset (defaults to
11520, if omitted), magic, mask (which can be omitted, defaults to all 0xff) and
1153last but not least, the interpreter that is to be invoked (for example and
1154testing /bin/echo). Type can be M for usual magic matching or E for filename
1155extension matching (give extension in place of magic).
1156
1157Check or reset the status of the binary format handler
1158------------------------------------------------------
1159
1160If you do a cat on the file /proc/sys/fs/binfmt_misc/status, you will get the
1161current status (enabled/disabled) of binfmt_misc. Change the status by echoing
11620 (disables) or 1 (enables) or -1 (caution: this clears all previously
1163registered binary formats) to status. For example echo 0 > status to disable
1164binfmt_misc (temporarily).
1165
1166Status of a single handler
1167--------------------------
1168
1169Each registered handler has an entry in /proc/sys/fs/binfmt_misc. These files
1170perform the same function as status, but their scope is limited to the actual
1171binary format. By cating this file, you also receive all related information
1172about the interpreter/magic of the binfmt.
1173
1174Example usage of binfmt_misc (emulate binfmt_java)
1175--------------------------------------------------
1176
1177 cd /proc/sys/fs/binfmt_misc
1178 echo ':Java:M::\xca\xfe\xba\xbe::/usr/local/java/bin/javawrapper:' > register
1179 echo ':HTML:E::html::/usr/local/java/bin/appletviewer:' > register
1180 echo ':Applet:M::<!--applet::/usr/local/java/bin/appletviewer:' > register
1181 echo ':DEXE:M::\x0eDEX::/usr/bin/dosexec:' > register
1182
1183
1184These four lines add support for Java executables and Java applets (like
1185binfmt_java, additionally recognizing the .html extension with no need to put
1186<!--applet> to every applet file). You have to install the JDK and the
1187shell-script /usr/local/java/bin/javawrapper too. It works around the
1188brokenness of the Java filename handling. To add a Java binary, just create a
1189link to the class-file somewhere in the path.
1190
11912.3 /proc/sys/kernel - general kernel parameters
1192------------------------------------------------
1193
1194This directory reflects general kernel behaviors. As I've said before, the
1195contents depend on your configuration. Here you'll find the most important
1196files, along with descriptions of what they mean and how to use them.
1197
1198acct
1199----
1200
1201The file contains three values; highwater, lowwater, and frequency.
1202
1203It exists only when BSD-style process accounting is enabled. These values
1204control its behavior. If the free space on the file system where the log lives
1205goes below lowwater percentage, accounting suspends. If it goes above
1206highwater percentage, accounting resumes. Frequency determines how often you
1207check the amount of free space (value is in seconds). Default settings are: 4,
12082, and 30. That is, suspend accounting if there is less than 2 percent free;
1209resume it if we have a value of 3 or more percent; consider information about
1210the amount of free space valid for 30 seconds
1211
1212ctrl-alt-del
1213------------
1214
1215When the value in this file is 0, ctrl-alt-del is trapped and sent to the init
1216program to handle a graceful restart. However, when the value is greater that
1217zero, Linux's reaction to this key combination will be an immediate reboot,
1218without syncing its dirty buffers.
1219
1220[NOTE]
1221 When a program (like dosemu) has the keyboard in raw mode, the
1222 ctrl-alt-del is intercepted by the program before it ever reaches the
1223 kernel tty layer, and it is up to the program to decide what to do with
1224 it.
1225
1226domainname and hostname
1227-----------------------
1228
1229These files can be controlled to set the NIS domainname and hostname of your
1230box. For the classic darkstar.frop.org a simple:
1231
1232 # echo "darkstar" > /proc/sys/kernel/hostname
1233 # echo "frop.org" > /proc/sys/kernel/domainname
1234
1235
1236would suffice to set your hostname and NIS domainname.
1237
1238osrelease, ostype and version
1239-----------------------------
1240
1241The names make it pretty obvious what these fields contain:
1242
1243 > cat /proc/sys/kernel/osrelease
1244 2.2.12
1245
1246 > cat /proc/sys/kernel/ostype
1247 Linux
1248
1249 > cat /proc/sys/kernel/version
1250 #4 Fri Oct 1 12:41:14 PDT 1999
1251
1252
1253The files osrelease and ostype should be clear enough. Version needs a little
1254more clarification. The #4 means that this is the 4th kernel built from this
1255source base and the date after it indicates the time the kernel was built. The
1256only way to tune these values is to rebuild the kernel.
1257
1258panic
1259-----
1260
1261The value in this file represents the number of seconds the kernel waits
1262before rebooting on a panic. When you use the software watchdog, the
1263recommended setting is 60. If set to 0, the auto reboot after a kernel panic
1264is disabled, which is the default setting.
1265
1266printk
1267------
1268
1269The four values in printk denote
1270* console_loglevel,
1271* default_message_loglevel,
1272* minimum_console_loglevel and
1273* default_console_loglevel
1274respectively.
1275
1276These values influence printk() behavior when printing or logging error
1277messages, which come from inside the kernel. See syslog(2) for more
1278information on the different log levels.
1279
1280console_loglevel
1281----------------
1282
1283Messages with a higher priority than this will be printed to the console.
1284
1285default_message_level
1286---------------------
1287
1288Messages without an explicit priority will be printed with this priority.
1289
1290minimum_console_loglevel
1291------------------------
1292
1293Minimum (highest) value to which the console_loglevel can be set.
1294
1295default_console_loglevel
1296------------------------
1297
1298Default value for console_loglevel.
1299
1300sg-big-buff
1301-----------
1302
1303This file shows the size of the generic SCSI (sg) buffer. At this point, you
1304can't tune it yet, but you can change it at compile time by editing
1305include/scsi/sg.h and changing the value of SG_BIG_BUFF.
1306
1307If you use a scanner with SANE (Scanner Access Now Easy) you might want to set
1308this to a higher value. Refer to the SANE documentation on this issue.
1309
1310modprobe
1311--------
1312
1313The location where the modprobe binary is located. The kernel uses this
1314program to load modules on demand.
1315
1316unknown_nmi_panic
1317-----------------
1318
1319The value in this file affects behavior of handling NMI. When the value is
1320non-zero, unknown NMI is trapped and then panic occurs. At that time, kernel
1321debugging information is displayed on console.
1322
1323NMI switch that most IA32 servers have fires unknown NMI up, for example.
1324If a system hangs up, try pressing the NMI switch.
1325
1326panic_on_unrecovered_nmi
1327------------------------
1328
1329The default Linux behaviour on an NMI of either memory or unknown is to continue
1330operation. For many environments such as scientific computing it is preferable
1331that the box is taken out and the error dealt with than an uncorrected
1332parity/ECC error get propogated.
1333
1334A small number of systems do generate NMI's for bizarre random reasons such as
1335power management so the default is off. That sysctl works like the existing
1336panic controls already in that directory.
1337
1338nmi_watchdog
1339------------
1340
1341Enables/Disables the NMI watchdog on x86 systems. When the value is non-zero
1342the NMI watchdog is enabled and will continuously test all online cpus to
1343determine whether or not they are still functioning properly. Currently,
1344passing "nmi_watchdog=" parameter at boot time is required for this function
1345to work.
1346
1347If LAPIC NMI watchdog method is in use (nmi_watchdog=2 kernel parameter), the
1348NMI watchdog shares registers with oprofile. By disabling the NMI watchdog,
1349oprofile may have more registers to utilize.
1350
1351msgmni
1352------
1353
1354Maximum number of message queue ids on the system.
1355This value scales to the amount of lowmem. It is automatically recomputed
1356upon memory add/remove or ipc namespace creation/removal.
1357When a value is written into this file, msgmni's value becomes fixed, i.e. it
1358is not recomputed anymore when one of the above events occurs.
1359Use auto_msgmni to change this behavior.
1360
1361auto_msgmni
1362-----------
1363
1364Enables/Disables automatic recomputing of msgmni upon memory add/remove or
1365upon ipc namespace creation/removal (see the msgmni description above).
1366Echoing "1" into this file enables msgmni automatic recomputing.
1367Echoing "0" turns it off.
1368auto_msgmni default value is 1.
1369
1370
13712.4 /proc/sys/vm - The virtual memory subsystem
1372-----------------------------------------------
1373
1374Please see: Documentation/sysctls/vm.txt for a description of these
1375entries. 987entries.
1376 988
989------------------------------------------------------------------------------
990Summary
991------------------------------------------------------------------------------
992Certain aspects of kernel behavior can be modified at runtime, without the
993need to recompile the kernel, or even to reboot the system. The files in the
994/proc/sys tree can not only be read, but also modified. You can use the echo
995command to write value into these files, thereby changing the default settings
996of the kernel.
997------------------------------------------------------------------------------
1377 998
13782.5 /proc/sys/dev - Device specific parameters 999------------------------------------------------------------------------------
1379---------------------------------------------- 1000CHAPTER 3: PER-PROCESS PARAMETERS
1380 1001------------------------------------------------------------------------------
1381Currently there is only support for CDROM drives, and for those, there is only
1382one read-only file containing information about the CD-ROM drives attached to
1383the system:
1384
1385 >cat /proc/sys/dev/cdrom/info
1386 CD-ROM information, Id: cdrom.c 2.55 1999/04/25
1387
1388 drive name: sr0 hdb
1389 drive speed: 32 40
1390 drive # of slots: 1 0
1391 Can close tray: 1 1
1392 Can open tray: 1 1
1393 Can lock tray: 1 1
1394 Can change speed: 1 1
1395 Can select disk: 0 1
1396 Can read multisession: 1 1
1397 Can read MCN: 1 1
1398 Reports media changed: 1 1
1399 Can play audio: 1 1
1400
1401
1402You see two drives, sr0 and hdb, along with a list of their features.
1403
14042.6 /proc/sys/sunrpc - Remote procedure calls
1405---------------------------------------------
1406
1407This directory contains four files, which enable or disable debugging for the
1408RPC functions NFS, NFS-daemon, RPC and NLM. The default values are 0. They can
1409be set to one to turn debugging on. (The default value is 0 for each)
1410
14112.7 /proc/sys/net - Networking stuff
1412------------------------------------
1413
1414The interface to the networking parts of the kernel is located in
1415/proc/sys/net. Table 2-3 shows all possible subdirectories. You may see only
1416some of them, depending on your kernel's configuration.
1417
1418
1419Table 2-3: Subdirectories in /proc/sys/net
1420..............................................................................
1421 Directory Content Directory Content
1422 core General parameter appletalk Appletalk protocol
1423 unix Unix domain sockets netrom NET/ROM
1424 802 E802 protocol ax25 AX25
1425 ethernet Ethernet protocol rose X.25 PLP layer
1426 ipv4 IP version 4 x25 X.25 protocol
1427 ipx IPX token-ring IBM token ring
1428 bridge Bridging decnet DEC net
1429 ipv6 IP version 6
1430..............................................................................
1431
1432We will concentrate on IP networking here. Since AX15, X.25, and DEC Net are
1433only minor players in the Linux world, we'll skip them in this chapter. You'll
1434find some short info on Appletalk and IPX further on in this chapter. Review
1435the online documentation and the kernel source to get a detailed view of the
1436parameters for those protocols. In this section we'll discuss the
1437subdirectories printed in bold letters in the table above. As default values
1438are suitable for most needs, there is no need to change these values.
1439
1440/proc/sys/net/core - Network core options
1441-----------------------------------------
1442
1443rmem_default
1444------------
1445
1446The default setting of the socket receive buffer in bytes.
1447
1448rmem_max
1449--------
1450
1451The maximum receive socket buffer size in bytes.
1452
1453wmem_default
1454------------
1455
1456The default setting (in bytes) of the socket send buffer.
1457
1458wmem_max
1459--------
1460
1461The maximum send socket buffer size in bytes.
1462
1463message_burst and message_cost
1464------------------------------
1465
1466These parameters are used to limit the warning messages written to the kernel
1467log from the networking code. They enforce a rate limit to make a
1468denial-of-service attack impossible. A higher message_cost factor, results in
1469fewer messages that will be written. Message_burst controls when messages will
1470be dropped. The default settings limit warning messages to one every five
1471seconds.
1472
1473warnings
1474--------
1475
1476This controls console messages from the networking stack that can occur because
1477of problems on the network like duplicate address or bad checksums. Normally,
1478this should be enabled, but if the problem persists the messages can be
1479disabled.
1480
1481
1482netdev_max_backlog
1483------------------
1484
1485Maximum number of packets, queued on the INPUT side, when the interface
1486receives packets faster than kernel can process them.
1487
1488optmem_max
1489----------
1490
1491Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence
1492of struct cmsghdr structures with appended data.
1493
1494/proc/sys/net/unix - Parameters for Unix domain sockets
1495-------------------------------------------------------
1496
1497There are only two files in this subdirectory. They control the delays for
1498deleting and destroying socket descriptors.
1499
15002.8 /proc/sys/net/ipv4 - IPV4 settings
1501--------------------------------------
1502
1503IP version 4 is still the most used protocol in Unix networking. It will be
1504replaced by IP version 6 in the next couple of years, but for the moment it's
1505the de facto standard for the internet and is used in most networking
1506environments around the world. Because of the importance of this protocol,
1507we'll have a deeper look into the subtree controlling the behavior of the IPv4
1508subsystem of the Linux kernel.
1509
1510Let's start with the entries in /proc/sys/net/ipv4.
1511
1512ICMP settings
1513-------------
1514
1515icmp_echo_ignore_all and icmp_echo_ignore_broadcasts
1516----------------------------------------------------
1517
1518Turn on (1) or off (0), if the kernel should ignore all ICMP ECHO requests, or
1519just those to broadcast and multicast addresses.
1520
1521Please note that if you accept ICMP echo requests with a broadcast/multi\-cast
1522destination address your network may be used as an exploder for denial of
1523service packet flooding attacks to other hosts.
1524
1525icmp_destunreach_rate, icmp_echoreply_rate, icmp_paramprob_rate and icmp_timeexeed_rate
1526---------------------------------------------------------------------------------------
1527
1528Sets limits for sending ICMP packets to specific targets. A value of zero
1529disables all limiting. Any positive value sets the maximum package rate in
1530hundredth of a second (on Intel systems).
1531
1532IP settings
1533-----------
1534
1535ip_autoconfig
1536-------------
1537
1538This file contains the number one if the host received its IP configuration by
1539RARP, BOOTP, DHCP or a similar mechanism. Otherwise it is zero.
1540
1541ip_default_ttl
1542--------------
1543
1544TTL (Time To Live) for IPv4 interfaces. This is simply the maximum number of
1545hops a packet may travel.
1546
1547ip_dynaddr
1548----------
1549
1550Enable dynamic socket address rewriting on interface address change. This is
1551useful for dialup interface with changing IP addresses.
1552
1553ip_forward
1554----------
1555
1556Enable or disable forwarding of IP packages between interfaces. Changing this
1557value resets all other parameters to their default values. They differ if the
1558kernel is configured as host or router.
1559
1560ip_local_port_range
1561-------------------
1562
1563Range of ports used by TCP and UDP to choose the local port. Contains two
1564numbers, the first number is the lowest port, the second number the highest
1565local port. Default is 1024-4999. Should be changed to 32768-61000 for
1566high-usage systems.
1567
1568ip_no_pmtu_disc
1569---------------
1570
1571Global switch to turn path MTU discovery off. It can also be set on a per
1572socket basis by the applications or on a per route basis.
1573
1574ip_masq_debug
1575-------------
1576
1577Enable/disable debugging of IP masquerading.
1578
1579IP fragmentation settings
1580-------------------------
1581
1582ipfrag_high_trash and ipfrag_low_trash
1583--------------------------------------
1584
1585Maximum memory used to reassemble IP fragments. When ipfrag_high_thresh bytes
1586of memory is allocated for this purpose, the fragment handler will toss
1587packets until ipfrag_low_thresh is reached.
1588
1589ipfrag_time
1590-----------
1591
1592Time in seconds to keep an IP fragment in memory.
1593
1594TCP settings
1595------------
1596
1597tcp_ecn
1598-------
1599
1600This file controls the use of the ECN bit in the IPv4 headers. This is a new
1601feature about Explicit Congestion Notification, but some routers and firewalls
1602block traffic that has this bit set, so it could be necessary to echo 0 to
1603/proc/sys/net/ipv4/tcp_ecn if you want to talk to these sites. For more info
1604you could read RFC2481.
1605
1606tcp_retrans_collapse
1607--------------------
1608
1609Bug-to-bug compatibility with some broken printers. On retransmit, try to send
1610larger packets to work around bugs in certain TCP stacks. Can be turned off by
1611setting it to zero.
1612
1613tcp_keepalive_probes
1614--------------------
1615
1616Number of keep alive probes TCP sends out, until it decides that the
1617connection is broken.
1618
1619tcp_keepalive_time
1620------------------
1621
1622How often TCP sends out keep alive messages, when keep alive is enabled. The
1623default is 2 hours.
1624
1625tcp_syn_retries
1626---------------
1627
1628Number of times initial SYNs for a TCP connection attempt will be
1629retransmitted. Should not be higher than 255. This is only the timeout for
1630outgoing connections, for incoming connections the number of retransmits is
1631defined by tcp_retries1.
1632
1633tcp_sack
1634--------
1635
1636Enable select acknowledgments after RFC2018.
1637
1638tcp_timestamps
1639--------------
1640
1641Enable timestamps as defined in RFC1323.
1642
1643tcp_stdurg
1644----------
1645
1646Enable the strict RFC793 interpretation of the TCP urgent pointer field. The
1647default is to use the BSD compatible interpretation of the urgent pointer
1648pointing to the first byte after the urgent data. The RFC793 interpretation is
1649to have it point to the last byte of urgent data. Enabling this option may
1650lead to interoperability problems. Disabled by default.
1651
1652tcp_syncookies
1653--------------
1654
1655Only valid when the kernel was compiled with CONFIG_SYNCOOKIES. Send out
1656syncookies when the syn backlog queue of a socket overflows. This is to ward
1657off the common 'syn flood attack'. Disabled by default.
1658
1659Note that the concept of a socket backlog is abandoned. This means the peer
1660may not receive reliable error messages from an over loaded server with
1661syncookies enabled.
1662
1663tcp_window_scaling
1664------------------
1665
1666Enable window scaling as defined in RFC1323.
1667
1668tcp_fin_timeout
1669---------------
1670
1671The length of time in seconds it takes to receive a final FIN before the
1672socket is always closed. This is strictly a violation of the TCP
1673specification, but required to prevent denial-of-service attacks.
1674
1675tcp_max_ka_probes
1676-----------------
1677
1678Indicates how many keep alive probes are sent per slow timer run. Should not
1679be set too high to prevent bursts.
1680
1681tcp_max_syn_backlog
1682-------------------
1683
1684Length of the per socket backlog queue. Since Linux 2.2 the backlog specified
1685in listen(2) only specifies the length of the backlog queue of already
1686established sockets. When more connection requests arrive Linux starts to drop
1687packets. When syncookies are enabled the packets are still answered and the
1688maximum queue is effectively ignored.
1689
1690tcp_retries1
1691------------
1692
1693Defines how often an answer to a TCP connection request is retransmitted
1694before giving up.
1695
1696tcp_retries2
1697------------
1698
1699Defines how often a TCP packet is retransmitted before giving up.
1700
1701Interface specific settings
1702---------------------------
1703
1704In the directory /proc/sys/net/ipv4/conf you'll find one subdirectory for each
1705interface the system knows about and one directory calls all. Changes in the
1706all subdirectory affect all interfaces, whereas changes in the other
1707subdirectories affect only one interface. All directories have the same
1708entries:
1709
1710accept_redirects
1711----------------
1712
1713This switch decides if the kernel accepts ICMP redirect messages or not. The
1714default is 'yes' if the kernel is configured for a regular host and 'no' for a
1715router configuration.
1716
1717accept_source_route
1718-------------------
1719
1720Should source routed packages be accepted or declined. The default is
1721dependent on the kernel configuration. It's 'yes' for routers and 'no' for
1722hosts.
1723
1724bootp_relay
1725~~~~~~~~~~~
1726
1727Accept packets with source address 0.b.c.d with destinations not to this host
1728as local ones. It is supposed that a BOOTP relay daemon will catch and forward
1729such packets.
1730
1731The default is 0, since this feature is not implemented yet (kernel version
17322.2.12).
1733
1734forwarding
1735----------
1736
1737Enable or disable IP forwarding on this interface.
1738
1739log_martians
1740------------
1741
1742Log packets with source addresses with no known route to kernel log.
1743
1744mc_forwarding
1745-------------
1746
1747Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE and a
1748multicast routing daemon is required.
1749
1750proxy_arp
1751---------
1752
1753Does (1) or does not (0) perform proxy ARP.
1754
1755rp_filter
1756---------
1757
1758Integer value determines if a source validation should be made. 1 means yes, 0
1759means no. Disabled by default, but local/broadcast address spoofing is always
1760on.
1761
1762If you set this to 1 on a router that is the only connection for a network to
1763the net, it will prevent spoofing attacks against your internal networks
1764(external addresses can still be spoofed), without the need for additional
1765firewall rules.
1766
1767secure_redirects
1768----------------
1769
1770Accept ICMP redirect messages only for gateways, listed in default gateway
1771list. Enabled by default.
1772
1773shared_media
1774------------
1775
1776If it is not set the kernel does not assume that different subnets on this
1777device can communicate directly. Default setting is 'yes'.
1778
1779send_redirects
1780--------------
1781
1782Determines whether to send ICMP redirects to other hosts.
1783
1784Routing settings
1785----------------
1786
1787The directory /proc/sys/net/ipv4/route contains several file to control
1788routing issues.
1789
1790error_burst and error_cost
1791--------------------------
1792
1793These parameters are used to limit how many ICMP destination unreachable to
1794send from the host in question. ICMP destination unreachable messages are
1795sent when we cannot reach the next hop while trying to transmit a packet.
1796It will also print some error messages to kernel logs if someone is ignoring
1797our ICMP redirects. The higher the error_cost factor is, the fewer
1798destination unreachable and error messages will be let through. Error_burst
1799controls when destination unreachable messages and error messages will be
1800dropped. The default settings limit warning messages to five every second.
1801
1802flush
1803-----
1804
1805Writing to this file results in a flush of the routing cache.
1806
1807gc_elasticity, gc_interval, gc_min_interval_ms, gc_timeout, gc_thresh
1808---------------------------------------------------------------------
1809
1810Values to control the frequency and behavior of the garbage collection
1811algorithm for the routing cache. gc_min_interval is deprecated and replaced
1812by gc_min_interval_ms.
1813
1814
1815max_size
1816--------
1817
1818Maximum size of the routing cache. Old entries will be purged once the cache
1819reached has this size.
1820
1821redirect_load, redirect_number
1822------------------------------
1823
1824Factors which determine if more ICPM redirects should be sent to a specific
1825host. No redirects will be sent once the load limit or the maximum number of
1826redirects has been reached.
1827
1828redirect_silence
1829----------------
1830
1831Timeout for redirects. After this period redirects will be sent again, even if
1832this has been stopped, because the load or number limit has been reached.
1833
1834Network Neighbor handling
1835-------------------------
1836
1837Settings about how to handle connections with direct neighbors (nodes attached
1838to the same link) can be found in the directory /proc/sys/net/ipv4/neigh.
1839
1840As we saw it in the conf directory, there is a default subdirectory which
1841holds the default values, and one directory for each interface. The contents
1842of the directories are identical, with the single exception that the default
1843settings contain additional options to set garbage collection parameters.
1844
1845In the interface directories you'll find the following entries:
1846
1847base_reachable_time, base_reachable_time_ms
1848-------------------------------------------
1849
1850A base value used for computing the random reachable time value as specified
1851in RFC2461.
1852
1853Expression of base_reachable_time, which is deprecated, is in seconds.
1854Expression of base_reachable_time_ms is in milliseconds.
1855
1856retrans_time, retrans_time_ms
1857-----------------------------
1858
1859The time between retransmitted Neighbor Solicitation messages.
1860Used for address resolution and to determine if a neighbor is
1861unreachable.
1862
1863Expression of retrans_time, which is deprecated, is in 1/100 seconds (for
1864IPv4) or in jiffies (for IPv6).
1865Expression of retrans_time_ms is in milliseconds.
1866
1867unres_qlen
1868----------
1869
1870Maximum queue length for a pending arp request - the number of packets which
1871are accepted from other layers while the ARP address is still resolved.
1872
1873anycast_delay
1874-------------
1875
1876Maximum for random delay of answers to neighbor solicitation messages in
1877jiffies (1/100 sec). Not yet implemented (Linux does not have anycast support
1878yet).
1879
1880ucast_solicit
1881-------------
1882
1883Maximum number of retries for unicast solicitation.
1884
1885mcast_solicit
1886-------------
1887
1888Maximum number of retries for multicast solicitation.
1889
1890delay_first_probe_time
1891----------------------
1892
1893Delay for the first time probe if the neighbor is reachable. (see
1894gc_stale_time)
1895
1896locktime
1897--------
1898
1899An ARP/neighbor entry is only replaced with a new one if the old is at least
1900locktime old. This prevents ARP cache thrashing.
1901
1902proxy_delay
1903-----------
1904
1905Maximum time (real time is random [0..proxytime]) before answering to an ARP
1906request for which we have an proxy ARP entry. In some cases, this is used to
1907prevent network flooding.
1908
1909proxy_qlen
1910----------
1911
1912Maximum queue length of the delayed proxy arp timer. (see proxy_delay).
1913
1914app_solicit
1915----------
1916
1917Determines the number of requests to send to the user level ARP daemon. Use 0
1918to turn off.
1919
1920gc_stale_time
1921-------------
1922
1923Determines how often to check for stale ARP entries. After an ARP entry is
1924stale it will be resolved again (which is useful when an IP address migrates
1925to another machine). When ucast_solicit is greater than 0 it first tries to
1926send an ARP packet directly to the known host When that fails and
1927mcast_solicit is greater than 0, an ARP request is broadcasted.
1928
19292.9 Appletalk
1930-------------
1931
1932The /proc/sys/net/appletalk directory holds the Appletalk configuration data
1933when Appletalk is loaded. The configurable parameters are:
1934
1935aarp-expiry-time
1936----------------
1937
1938The amount of time we keep an ARP entry before expiring it. Used to age out
1939old hosts.
1940
1941aarp-resolve-time
1942-----------------
1943
1944The amount of time we will spend trying to resolve an Appletalk address.
1945
1946aarp-retransmit-limit
1947---------------------
1948
1949The number of times we will retransmit a query before giving up.
1950
1951aarp-tick-time
1952--------------
1953
1954Controls the rate at which expires are checked.
1955
1956The directory /proc/net/appletalk holds the list of active Appletalk sockets
1957on a machine.
1958
1959The fields indicate the DDP type, the local address (in network:node format)
1960the remote address, the size of the transmit pending queue, the size of the
1961received queue (bytes waiting for applications to read) the state and the uid
1962owning the socket.
1963
1964/proc/net/atalk_iface lists all the interfaces configured for appletalk.It
1965shows the name of the interface, its Appletalk address, the network range on
1966that address (or network number for phase 1 networks), and the status of the
1967interface.
1968
1969/proc/net/atalk_route lists each known network route. It lists the target
1970(network) that the route leads to, the router (may be directly connected), the
1971route flags, and the device the route is using.
1972
19732.10 IPX
1974--------
1975
1976The IPX protocol has no tunable values in proc/sys/net.
1977
1978The IPX protocol does, however, provide proc/net/ipx. This lists each IPX
1979socket giving the local and remote addresses in Novell format (that is
1980network:node:port). In accordance with the strange Novell tradition,
1981everything but the port is in hex. Not_Connected is displayed for sockets that
1982are not tied to a specific remote address. The Tx and Rx queue sizes indicate
1983the number of bytes pending for transmission and reception. The state
1984indicates the state the socket is in and the uid is the owning uid of the
1985socket.
1986
1987The /proc/net/ipx_interface file lists all IPX interfaces. For each interface
1988it gives the network number, the node number, and indicates if the network is
1989the primary network. It also indicates which device it is bound to (or
1990Internal for internal networks) and the Frame Type if appropriate. Linux
1991supports 802.3, 802.2, 802.2 SNAP and DIX (Blue Book) ethernet framing for
1992IPX.
1993
1994The /proc/net/ipx_route table holds a list of IPX routes. For each route it
1995gives the destination network, the router node (or Directly) and the network
1996address of the router (or Connected) for internal networks.
1997
19982.11 /proc/sys/fs/mqueue - POSIX message queues filesystem
1999----------------------------------------------------------
2000
2001The "mqueue" filesystem provides the necessary kernel features to enable the
2002creation of a user space library that implements the POSIX message queues
2003API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System
2004Interfaces specification.)
2005
2006The "mqueue" filesystem contains values for determining/setting the amount of
2007resources used by the file system.
2008
2009/proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the
2010maximum number of message queues allowed on the system.
2011
2012/proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the
2013maximum number of messages in a queue value. In fact it is the limiting value
2014for another (user) limit which is set in mq_open invocation. This attribute of
2015a queue must be less or equal then msg_max.
2016
2017/proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the
2018maximum message size value (it is every message queue's attribute set during
2019its creation).
2020 1002
20212.12 /proc/<pid>/oom_adj - Adjust the oom-killer score 10033.1 /proc/<pid>/oom_adj - Adjust the oom-killer score
2022------------------------------------------------------ 1004------------------------------------------------------
2023 1005
2024This file can be used to adjust the score used to select which processes 1006This file can be used to adjust the score used to select which processes
@@ -2055,25 +1037,15 @@ The task with the highest badness score is then selected and its children
2055are killed, process itself will be killed in an OOM situation when it does 1037are killed, process itself will be killed in an OOM situation when it does
2056not have children or some of them disabled oom like described above. 1038not have children or some of them disabled oom like described above.
2057 1039
20582.13 /proc/<pid>/oom_score - Display current oom-killer score 10403.2 /proc/<pid>/oom_score - Display current oom-killer score
2059------------------------------------------------------------- 1041-------------------------------------------------------------
2060 1042
2061------------------------------------------------------------------------------
2062This file can be used to check the current score used by the oom-killer is for 1043This file can be used to check the current score used by the oom-killer is for
2063any given <pid>. Use it together with /proc/<pid>/oom_adj to tune which 1044any given <pid>. Use it together with /proc/<pid>/oom_adj to tune which
2064process should be killed in an out-of-memory situation. 1045process should be killed in an out-of-memory situation.
2065 1046
2066------------------------------------------------------------------------------
2067Summary
2068------------------------------------------------------------------------------
2069Certain aspects of kernel behavior can be modified at runtime, without the
2070need to recompile the kernel, or even to reboot the system. The files in the
2071/proc/sys tree can not only be read, but also modified. You can use the echo
2072command to write value into these files, thereby changing the default settings
2073of the kernel.
2074------------------------------------------------------------------------------
2075 1047
20762.14 /proc/<pid>/io - Display the IO accounting fields 10483.3 /proc/<pid>/io - Display the IO accounting fields
2077------------------------------------------------------- 1049-------------------------------------------------------
2078 1050
2079This file contains IO statistics for each running process 1051This file contains IO statistics for each running process
@@ -2175,7 +1147,7 @@ those 64-bit counters, process A could see an intermediate result.
2175More information about this can be found within the taskstats documentation in 1147More information about this can be found within the taskstats documentation in
2176Documentation/accounting. 1148Documentation/accounting.
2177 1149
21782.15 /proc/<pid>/coredump_filter - Core dump filtering settings 11503.4 /proc/<pid>/coredump_filter - Core dump filtering settings
2179--------------------------------------------------------------- 1151---------------------------------------------------------------
2180When a process is dumped, all anonymous memory is written to a core file as 1152When a process is dumped, all anonymous memory is written to a core file as
2181long as the size of the core file isn't limited. But sometimes we don't want 1153long as the size of the core file isn't limited. But sometimes we don't want
@@ -2219,7 +1191,7 @@ For example:
2219 $ echo 0x7 > /proc/self/coredump_filter 1191 $ echo 0x7 > /proc/self/coredump_filter
2220 $ ./some_program 1192 $ ./some_program
2221 1193
22222.16 /proc/<pid>/mountinfo - Information about mounts 11943.5 /proc/<pid>/mountinfo - Information about mounts
2223-------------------------------------------------------- 1195--------------------------------------------------------
2224 1196
2225This file contains lines of the form: 1197This file contains lines of the form:
@@ -2256,30 +1228,3 @@ For more information on mount propagation see:
2256 1228
2257 Documentation/filesystems/sharedsubtree.txt 1229 Documentation/filesystems/sharedsubtree.txt
2258 1230
22592.17 /proc/sys/fs/epoll - Configuration options for the epoll interface
2260--------------------------------------------------------
2261
2262This directory contains configuration options for the epoll(7) interface.
2263
2264max_user_instances
2265------------------
2266
2267This is the maximum number of epoll file descriptors that a single user can
2268have open at a given time. The default value is 128, and should be enough
2269for normal users.
2270
2271max_user_watches
2272----------------
2273
2274Every epoll file descriptor can store a number of files to be monitored
2275for event readiness. Each one of these monitored files constitutes a "watch".
2276This configuration option sets the maximum number of "watches" that are
2277allowed for each user.
2278Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
2279on a 64bit one.
2280The current default value for max_user_watches is the 1/32 of the available
2281low memory, divided for the "watch" cost in bytes.
2282
2283
2284------------------------------------------------------------------------------
2285
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt
index 3e79e4a7a392..b324c033035a 100644
--- a/Documentation/filesystems/squashfs.txt
+++ b/Documentation/filesystems/squashfs.txt
@@ -22,7 +22,7 @@ Squashfs filesystem features versus Cramfs:
22 22
23 Squashfs Cramfs 23 Squashfs Cramfs
24 24
25Max filesystem size: 2^64 16 MiB 25Max filesystem size: 2^64 256 MiB
26Max file size: ~ 2 TiB 16 MiB 26Max file size: ~ 2 TiB 16 MiB
27Max files: unlimited unlimited 27Max files: unlimited unlimited
28Max directories: unlimited unlimited 28Max directories: unlimited unlimited
diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt
index 9f8740ca3f3b..26e4b8bc53ee 100644
--- a/Documentation/filesystems/sysfs-pci.txt
+++ b/Documentation/filesystems/sysfs-pci.txt
@@ -12,6 +12,7 @@ that support it. For example, a given bus might look like this:
12 | |-- enable 12 | |-- enable
13 | |-- irq 13 | |-- irq
14 | |-- local_cpus 14 | |-- local_cpus
15 | |-- remove
15 | |-- resource 16 | |-- resource
16 | |-- resource0 17 | |-- resource0
17 | |-- resource1 18 | |-- resource1
@@ -36,6 +37,7 @@ files, each with their own function.
36 enable Whether the device is enabled (ascii, rw) 37 enable Whether the device is enabled (ascii, rw)
37 irq IRQ number (ascii, ro) 38 irq IRQ number (ascii, ro)
38 local_cpus nearby CPU mask (cpumask, ro) 39 local_cpus nearby CPU mask (cpumask, ro)
40 remove remove device from kernel's list (ascii, wo)
39 resource PCI resource host addresses (ascii, ro) 41 resource PCI resource host addresses (ascii, ro)
40 resource0..N PCI resource N, if present (binary, mmap) 42 resource0..N PCI resource N, if present (binary, mmap)
41 resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) 43 resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap)
@@ -46,6 +48,7 @@ files, each with their own function.
46 48
47 ro - read only file 49 ro - read only file
48 rw - file is readable and writable 50 rw - file is readable and writable
51 wo - write only file
49 mmap - file is mmapable 52 mmap - file is mmapable
50 ascii - file contains ascii text 53 ascii - file contains ascii text
51 binary - file contains binary data 54 binary - file contains binary data
@@ -73,6 +76,13 @@ that the device must be enabled for a rom read to return data succesfully.
73In the event a driver is not bound to the device, it can be enabled using the 76In the event a driver is not bound to the device, it can be enabled using the
74'enable' file, documented above. 77'enable' file, documented above.
75 78
79The 'remove' file is used to remove the PCI device, by writing a non-zero
80integer to the file. This does not involve any kind of hot-plug functionality,
81e.g. powering off the device. The device is removed from the kernel's list of
82PCI devices, the sysfs directory for it is removed, and the device will be
83removed from any drivers attached to it. Removal of PCI root buses is
84disallowed.
85
76Accessing legacy resources through sysfs 86Accessing legacy resources through sysfs
77---------------------------------------- 87----------------------------------------
78 88
diff --git a/Documentation/filesystems/udf.txt b/Documentation/filesystems/udf.txt
index fde829a756e6..902b95d0ee51 100644
--- a/Documentation/filesystems/udf.txt
+++ b/Documentation/filesystems/udf.txt
@@ -24,6 +24,8 @@ The following mount options are supported:
24 24
25 gid= Set the default group. 25 gid= Set the default group.
26 umask= Set the default umask. 26 umask= Set the default umask.
27 mode= Set the default file permissions.
28 dmode= Set the default directory permissions.
27 uid= Set the default user. 29 uid= Set the default user.
28 bs= Set the block size. 30 bs= Set the block size.
29 unhide Show otherwise hidden files. 31 unhide Show otherwise hidden files.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index deeeed0faa8f..f49eecf2e573 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -277,8 +277,7 @@ or bottom half).
277 unfreeze_fs: called when VFS is unlocking a filesystem and making it writable 277 unfreeze_fs: called when VFS is unlocking a filesystem and making it writable
278 again. 278 again.
279 279
280 statfs: called when the VFS needs to get filesystem statistics. This 280 statfs: called when the VFS needs to get filesystem statistics.
281 is called with the kernel lock held
282 281
283 remount_fs: called when the filesystem is remounted. This is called 282 remount_fs: called when the filesystem is remounted. This is called
284 with the kernel lock held 283 with the kernel lock held
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index b1b988701247..145c25a170c7 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -123,7 +123,10 @@ platform-specific implementation issue.
123 123
124Using GPIOs 124Using GPIOs
125----------- 125-----------
126One of the first things to do with a GPIO, often in board setup code when 126The first thing a system should do with a GPIO is allocate it, using
127the gpio_request() call; see later.
128
129One of the next things to do with a GPIO, often in board setup code when
127setting up a platform_device using the GPIO, is mark its direction: 130setting up a platform_device using the GPIO, is mark its direction:
128 131
129 /* set as input or output, returning 0 or negative errno */ 132 /* set as input or output, returning 0 or negative errno */
@@ -141,8 +144,8 @@ This helps avoid signal glitching during system startup.
141 144
142For compatibility with legacy interfaces to GPIOs, setting the direction 145For compatibility with legacy interfaces to GPIOs, setting the direction
143of a GPIO implicitly requests that GPIO (see below) if it has not been 146of a GPIO implicitly requests that GPIO (see below) if it has not been
144requested already. That compatibility may be removed in the future; 147requested already. That compatibility is being removed from the optional
145explicitly requesting GPIOs is strongly preferred. 148gpiolib framework.
146 149
147Setting the direction can fail if the GPIO number is invalid, or when 150Setting the direction can fail if the GPIO number is invalid, or when
148that particular GPIO can't be used in that mode. It's generally a bad 151that particular GPIO can't be used in that mode. It's generally a bad
@@ -195,7 +198,7 @@ This requires sleeping, which can't be done from inside IRQ handlers.
195 198
196Platforms that support this type of GPIO distinguish them from other GPIOs 199Platforms that support this type of GPIO distinguish them from other GPIOs
197by returning nonzero from this call (which requires a valid GPIO number, 200by returning nonzero from this call (which requires a valid GPIO number,
198either explicitly or implicitly requested): 201which should have been previously allocated with gpio_request):
199 202
200 int gpio_cansleep(unsigned gpio); 203 int gpio_cansleep(unsigned gpio);
201 204
@@ -212,10 +215,9 @@ for GPIOs that can't be accessed from IRQ handlers, these calls act the
212same as the spinlock-safe calls. 215same as the spinlock-safe calls.
213 216
214 217
215Claiming and Releasing GPIOs (OPTIONAL) 218Claiming and Releasing GPIOs
216--------------------------------------- 219----------------------------
217To help catch system configuration errors, two calls are defined. 220To help catch system configuration errors, two calls are defined.
218However, many platforms don't currently support this mechanism.
219 221
220 /* request GPIO, returning 0 or negative errno. 222 /* request GPIO, returning 0 or negative errno.
221 * non-null labels may be useful for diagnostics. 223 * non-null labels may be useful for diagnostics.
@@ -244,13 +246,6 @@ Some platforms may also use knowledge about what GPIOs are active for
244power management, such as by powering down unused chip sectors and, more 246power management, such as by powering down unused chip sectors and, more
245easily, gating off unused clocks. 247easily, gating off unused clocks.
246 248
247These two calls are optional because not not all current Linux platforms
248offer such functionality in their GPIO support; a valid implementation
249could return success for all gpio_request() calls. Unlike the other calls,
250the state they represent doesn't normally match anything from a hardware
251register; it's just a software bitmap which clearly is not necessary for
252correct operation of hardware or (bug free) drivers.
253
254Note that requesting a GPIO does NOT cause it to be configured in any 249Note that requesting a GPIO does NOT cause it to be configured in any
255way; it just marks that GPIO as in use. Separate code must handle any 250way; it just marks that GPIO as in use. Separate code must handle any
256pin setup (e.g. controlling which pin the GPIO uses, pullup/pulldown). 251pin setup (e.g. controlling which pin the GPIO uses, pullup/pulldown).
diff --git a/Documentation/hwmon/ds1621 b/Documentation/hwmon/ds1621
index 1fee6f1e6bc5..5e97f333c4df 100644
--- a/Documentation/hwmon/ds1621
+++ b/Documentation/hwmon/ds1621
@@ -49,12 +49,9 @@ of up to +/- 0.5 degrees even when compared against precise temperature
49readings. Be sure to have a high vs. low temperature limit gap of al least 49readings. Be sure to have a high vs. low temperature limit gap of al least
501.0 degree Celsius to avoid Tout "bouncing", though! 501.0 degree Celsius to avoid Tout "bouncing", though!
51 51
52As for alarms, you can read the alarm status of the DS1621 via the 'alarms' 52The alarm bits are set when the high or low limits are met or exceeded and
53/sys file interface. The result consists mainly of bit 6 and 5 of the 53are reset by the module as soon as the respective temperature ranges are
54configuration register of the chip; bit 6 (0x40 or 64) is the high alarm 54left.
55bit and bit 5 (0x20 or 32) the low one. These bits are set when the high or
56low limits are met or exceeded and are reset by the module as soon as the
57respective temperature ranges are left.
58 55
59The alarm registers are in no way suitable to find out about the actual 56The alarm registers are in no way suitable to find out about the actual
60status of Tout. They will only tell you about its history, whether or not 57status of Tout. They will only tell you about its history, whether or not
@@ -64,45 +61,3 @@ with neither of the alarms set.
64 61
65Temperature conversion of the DS1621 takes up to 1000ms; internal access to 62Temperature conversion of the DS1621 takes up to 1000ms; internal access to
66non-volatile registers may last for 10ms or below. 63non-volatile registers may last for 10ms or below.
67
68High Accuracy Temperature Reading
69---------------------------------
70
71As said before, the temperature issued via the 9-bit i2c-bus data is
72somewhat arbitrary. Internally, the temperature conversion is of a
73different kind that is explained (not so...) well in the DS1621 data sheet.
74To cut the long story short: Inside the DS1621 there are two oscillators,
75both of them biassed by a temperature coefficient.
76
77Higher resolution of the temperature reading can be achieved using the
78internal projection, which means taking account of REG_COUNT and REG_SLOPE
79(the driver manages them):
80
81Taken from Dallas Semiconductors App Note 068: 'Increasing Temperature
82Resolution on the DS1620' and App Note 105: 'High Resolution Temperature
83Measurement with Dallas Direct-to-Digital Temperature Sensors'
84
85- Read the 9-bit temperature and strip the LSB (Truncate the .5 degs)
86- The resulting value is TEMP_READ.
87- Then, read REG_COUNT.
88- And then, REG_SLOPE.
89
90 TEMP = TEMP_READ - 0.25 + ((REG_SLOPE - REG_COUNT) / REG_SLOPE)
91
92Note that this is what the DONE bit in the DS1621 configuration register is
93good for: Internally, one temperature conversion takes up to 1000ms. Before
94that conversion is complete you will not be able to read valid things out
95of REG_COUNT and REG_SLOPE. The DONE bit, as you may have guessed by now,
96tells you whether the conversion is complete ("done", in plain English) and
97thus, whether the values you read are good or not.
98
99The DS1621 has two modes of operation: "Continuous" conversion, which can
100be understood as the default stand-alone mode where the chip gets the
101temperature and controls external devices via its Tout pin or tells other
102i2c's about it if they care. The other mode is called "1SHOT", that means
103that it only figures out about the temperature when it is explicitly told
104to do so; this can be seen as power saving mode.
105
106Now if you want to read REG_COUNT and REG_SLOPE, you have to either stop
107the continuous conversions until the contents of these registers are valid,
108or, in 1SHOT mode, you have to have one conversion made.
diff --git a/Documentation/hwmon/g760a b/Documentation/hwmon/g760a
new file mode 100644
index 000000000000..e032eeb75629
--- /dev/null
+++ b/Documentation/hwmon/g760a
@@ -0,0 +1,36 @@
1Kernel driver g760a
2===================
3
4Supported chips:
5 * Global Mixed-mode Technology Inc. G760A
6 Prefix: 'g760a'
7 Datasheet: Publicly available at the GMT website
8 http://www.gmt.com.tw/datasheet/g760a.pdf
9
10Author: Herbert Valerio Riedel <hvr@gnu.org>
11
12Description
13-----------
14
15The GMT G760A Fan Speed PWM Controller is connected directly to a fan
16and performs closed-loop control of the fan speed.
17
18The fan speed is programmed by setting the period via 'pwm1' of two
19consecutive speed pulses. The period is defined in terms of clock
20cycle counts of an assumed 32kHz clock source.
21
22Setting a period of 0 stops the fan; setting the period to 255 sets
23fan to maximum speed.
24
25The measured fan rotation speed returned via 'fan1_input' is derived
26from the measured speed pulse period by assuming again a 32kHz clock
27source and a 2 pulse-per-revolution fan.
28
29The 'alarms' file provides access to the two alarm bits provided by
30the G760A chip's status register: Bit 0 is set when the actual fan
31speed differs more than 20% with respect to the programmed fan speed;
32bit 1 is set when fan speed is below 1920 RPM.
33
34The g760a driver will not update its values more frequently than every
35other second; reading them more often will do no harm, but will return
36'old' values.
diff --git a/Documentation/hwmon/lis3lv02d b/Documentation/hwmon/lis3lv02d
index 287f8c902656..effe949a7282 100644
--- a/Documentation/hwmon/lis3lv02d
+++ b/Documentation/hwmon/lis3lv02d
@@ -1,11 +1,11 @@
1Kernel driver lis3lv02d 1Kernel driver lis3lv02d
2================== 2=======================
3 3
4Supported chips: 4Supported chips:
5 5
6 * STMicroelectronics LIS3LV02DL and LIS3LV02DQ 6 * STMicroelectronics LIS3LV02DL and LIS3LV02DQ
7 7
8Author: 8Authors:
9 Yan Burman <burman.yan@gmail.com> 9 Yan Burman <burman.yan@gmail.com>
10 Eric Piel <eric.piel@tremplin-utc.net> 10 Eric Piel <eric.piel@tremplin-utc.net>
11 11
@@ -15,7 +15,7 @@ Description
15 15
16This driver provides support for the accelerometer found in various HP 16This driver provides support for the accelerometer found in various HP
17laptops sporting the feature officially called "HP Mobile Data 17laptops sporting the feature officially called "HP Mobile Data
18Protection System 3D" or "HP 3D DriveGuard". It detect automatically 18Protection System 3D" or "HP 3D DriveGuard". It detects automatically
19laptops with this sensor. Known models (for now the HP 2133, nc6420, 19laptops with this sensor. Known models (for now the HP 2133, nc6420,
20nc2510, nc8510, nc84x0, nw9440 and nx9420) will have their axis 20nc2510, nc8510, nc84x0, nw9440 and nx9420) will have their axis
21automatically oriented on standard way (eg: you can directly play 21automatically oriented on standard way (eg: you can directly play
@@ -27,7 +27,7 @@ position - 3D position that the accelerometer reports. Format: "(x,y,z)"
27calibrate - read: values (x, y, z) that are used as the base for input 27calibrate - read: values (x, y, z) that are used as the base for input
28 class device operation. 28 class device operation.
29 write: forces the base to be recalibrated with the current 29 write: forces the base to be recalibrated with the current
30 position. 30 position.
31rate - reports the sampling rate of the accelerometer device in HZ 31rate - reports the sampling rate of the accelerometer device in HZ
32 32
33This driver also provides an absolute input class device, allowing 33This driver also provides an absolute input class device, allowing
@@ -48,7 +48,7 @@ For better compatibility between the various laptops. The values reported by
48the accelerometer are converted into a "standard" organisation of the axes 48the accelerometer are converted into a "standard" organisation of the axes
49(aka "can play neverball out of the box"): 49(aka "can play neverball out of the box"):
50 * When the laptop is horizontal the position reported is about 0 for X and Y 50 * When the laptop is horizontal the position reported is about 0 for X and Y
51and a positive value for Z 51 and a positive value for Z
52 * If the left side is elevated, X increases (becomes positive) 52 * If the left side is elevated, X increases (becomes positive)
53 * If the front side (where the touchpad is) is elevated, Y decreases 53 * If the front side (where the touchpad is) is elevated, Y decreases
54 (becomes negative) 54 (becomes negative)
@@ -59,3 +59,13 @@ email to the authors to add it to the database. When reporting a new
59laptop, please include the output of "dmidecode" plus the value of 59laptop, please include the output of "dmidecode" plus the value of
60/sys/devices/platform/lis3lv02d/position in these four cases. 60/sys/devices/platform/lis3lv02d/position in these four cases.
61 61
62Q&A
63---
64
65Q: How do I safely simulate freefall? I have an HP "portable
66workstation" which has about 3.5kg and a plastic case, so letting it
67fall to the ground is out of question...
68
69A: The sensor is pretty sensitive, so your hands can do it. Lift it
70into free space, follow the fall with your hands for like 10
71centimeters. That should be enough to trigger the detection.
diff --git a/Documentation/hwmon/lm90 b/Documentation/hwmon/lm90
index 0e8411710238..93d8e3d55150 100644
--- a/Documentation/hwmon/lm90
+++ b/Documentation/hwmon/lm90
@@ -42,6 +42,11 @@ Supported chips:
42 Addresses scanned: I2C 0x4e 42 Addresses scanned: I2C 0x4e
43 Datasheet: Publicly available at the Maxim website 43 Datasheet: Publicly available at the Maxim website
44 http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3497 44 http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3497
45 * Maxim MAX6648
46 Prefix: 'max6646'
47 Addresses scanned: I2C 0x4c
48 Datasheet: Publicly available at the Maxim website
49 http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3500
45 * Maxim MAX6649 50 * Maxim MAX6649
46 Prefix: 'max6646' 51 Prefix: 'max6646'
47 Addresses scanned: I2C 0x4c 52 Addresses scanned: I2C 0x4c
@@ -74,6 +79,11 @@ Supported chips:
74 0x4c, 0x4d and 0x4e 79 0x4c, 0x4d and 0x4e
75 Datasheet: Publicly available at the Maxim website 80 Datasheet: Publicly available at the Maxim website
76 http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3370 81 http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3370
82 * Maxim MAX6692
83 Prefix: 'max6646'
84 Addresses scanned: I2C 0x4c
85 Datasheet: Publicly available at the Maxim website
86 http://www.maxim-ic.com/quick_view2.cfm/qv_pk/3500
77 87
78 88
79Author: Jean Delvare <khali@linux-fr.org> 89Author: Jean Delvare <khali@linux-fr.org>
diff --git a/Documentation/hwmon/ltc4215 b/Documentation/hwmon/ltc4215
new file mode 100644
index 000000000000..2e6a21eb656c
--- /dev/null
+++ b/Documentation/hwmon/ltc4215
@@ -0,0 +1,50 @@
1Kernel driver ltc4215
2=====================
3
4Supported chips:
5 * Linear Technology LTC4215
6 Prefix: 'ltc4215'
7 Addresses scanned: 0x44
8 Datasheet:
9 http://www.linear.com/pc/downloadDocument.do?navId=H0,C1,C1003,C1006,C1163,P17572,D12697
10
11Author: Ira W. Snyder <iws@ovro.caltech.edu>
12
13
14Description
15-----------
16
17The LTC4215 controller allows a board to be safely inserted and removed
18from a live backplane.
19
20
21Usage Notes
22-----------
23
24This driver does not probe for LTC4215 devices, due to the fact that some
25of the possible addresses are unfriendly to probing. You will need to use
26the "force" parameter to tell the driver where to find the device.
27
28Example: the following will load the driver for an LTC4215 at address 0x44
29on I2C bus #0:
30$ modprobe ltc4215 force=0,0x44
31
32
33Sysfs entries
34-------------
35
36The LTC4215 has built-in limits for overvoltage, undervoltage, and
37undercurrent warnings. This makes it very likely that the reference
38circuit will be used.
39
40in1_input input voltage
41in2_input output voltage
42
43in1_min_alarm input undervoltage alarm
44in1_max_alarm input overvoltage alarm
45
46curr1_input current
47curr1_max_alarm overcurrent alarm
48
49power1_input power usage
50power1_alarm power bad alarm
diff --git a/Documentation/i2c/chips/pcf8591 b/Documentation/hwmon/pcf8591
index 5628fcf4207f..5628fcf4207f 100644
--- a/Documentation/i2c/chips/pcf8591
+++ b/Documentation/hwmon/pcf8591
diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface
index 6dbfd5efd991..2f10ce6a879f 100644
--- a/Documentation/hwmon/sysfs-interface
+++ b/Documentation/hwmon/sysfs-interface
@@ -365,6 +365,7 @@ energy[1-*]_input Cumulative energy use
365 Unit: microJoule 365 Unit: microJoule
366 RO 366 RO
367 367
368
368********** 369**********
369* Alarms * 370* Alarms *
370********** 371**********
@@ -453,6 +454,27 @@ beep_mask Bitmask for beep.
453 RW 454 RW
454 455
455 456
457***********************
458* Intrusion detection *
459***********************
460
461intrusion[0-*]_alarm
462 Chassis intrusion detection
463 0: OK
464 1: intrusion detected
465 RW
466 Contrary to regular alarm flags which clear themselves
467 automatically when read, this one sticks until cleared by
468 the user. This is done by writing 0 to the file. Writing
469 other values is unsupported.
470
471intrusion[0-*]_beep
472 Chassis intrusion beep
473 0: disable
474 1: enable
475 RW
476
477
456sysfs attribute writes interpretation 478sysfs attribute writes interpretation
457------------------------------------- 479-------------------------------------
458 480
diff --git a/Documentation/hwmon/w83627ehf b/Documentation/hwmon/w83627ehf
index d6e1ae30fa6e..b6eb59384bb3 100644
--- a/Documentation/hwmon/w83627ehf
+++ b/Documentation/hwmon/w83627ehf
@@ -2,30 +2,40 @@ Kernel driver w83627ehf
2======================= 2=======================
3 3
4Supported chips: 4Supported chips:
5 * Winbond W83627EHF/EHG/DHG (ISA access ONLY) 5 * Winbond W83627EHF/EHG (ISA access ONLY)
6 Prefix: 'w83627ehf' 6 Prefix: 'w83627ehf'
7 Addresses scanned: ISA address retrieved from Super I/O registers 7 Addresses scanned: ISA address retrieved from Super I/O registers
8 Datasheet: 8 Datasheet:
9 http://www.winbond-usa.com/products/winbond_products/pdfs/PCIC/W83627EHF_%20W83627EHGb.pdf 9 http://www.nuvoton.com.tw/NR/rdonlyres/A6A258F0-F0C9-4F97-81C0-C4D29E7E943E/0/W83627EHF.pdf
10 DHG datasheet confidential. 10 * Winbond W83627DHG
11 Prefix: 'w83627dhg'
12 Addresses scanned: ISA address retrieved from Super I/O registers
13 Datasheet:
14 http://www.nuvoton.com.tw/NR/rdonlyres/7885623D-A487-4CF9-A47F-30C5F73D6FE6/0/W83627DHG.pdf
15 * Winbond W83667HG
16 Prefix: 'w83667hg'
17 Addresses scanned: ISA address retrieved from Super I/O registers
18 Datasheet: not available
11 19
12Authors: 20Authors:
13 Jean Delvare <khali@linux-fr.org> 21 Jean Delvare <khali@linux-fr.org>
14 Yuan Mu (Winbond) 22 Yuan Mu (Winbond)
15 Rudolf Marek <r.marek@assembler.cz> 23 Rudolf Marek <r.marek@assembler.cz>
16 David Hubbard <david.c.hubbard@gmail.com> 24 David Hubbard <david.c.hubbard@gmail.com>
25 Gong Jun <JGong@nuvoton.com>
17 26
18Description 27Description
19----------- 28-----------
20 29
21This driver implements support for the Winbond W83627EHF, W83627EHG, and 30This driver implements support for the Winbond W83627EHF, W83627EHG,
22W83627DHG super I/O chips. We will refer to them collectively as Winbond chips. 31W83627DHG and W83667HG super I/O chips. We will refer to them collectively
32as Winbond chips.
23 33
24The chips implement three temperature sensors, five fan rotation 34The chips implement three temperature sensors, five fan rotation
25speed sensors, ten analog voltage sensors (only nine for the 627DHG), one 35speed sensors, ten analog voltage sensors (only nine for the 627DHG), one
26VID (6 pins for the 627EHF/EHG, 8 pins for the 627DHG), alarms with beep 36VID (6 pins for the 627EHF/EHG, 8 pins for the 627DHG and 667HG), alarms
27warnings (control unimplemented), and some automatic fan regulation 37with beep warnings (control unimplemented), and some automatic fan
28strategies (plus manual fan control mode). 38regulation strategies (plus manual fan control mode).
29 39
30Temperatures are measured in degrees Celsius and measurement resolution is 1 40Temperatures are measured in degrees Celsius and measurement resolution is 1
31degC for temp1 and 0.5 degC for temp2 and temp3. An alarm is triggered when 41degC for temp1 and 0.5 degC for temp2 and temp3. An alarm is triggered when
@@ -54,7 +64,8 @@ follows:
54temp1 -> pwm1 64temp1 -> pwm1
55temp2 -> pwm2 65temp2 -> pwm2
56temp3 -> pwm3 66temp3 -> pwm3
57prog -> pwm4 (the programmable setting is not supported by the driver) 67prog -> pwm4 (not on 667HG; the programmable setting is not supported by
68 the driver)
58 69
59/sys files 70/sys files
60---------- 71----------
diff --git a/Documentation/i2c/busses/i2c-nforce2 b/Documentation/i2c/busses/i2c-nforce2
index fae3495bcbaf..9698c396b830 100644
--- a/Documentation/i2c/busses/i2c-nforce2
+++ b/Documentation/i2c/busses/i2c-nforce2
@@ -7,10 +7,14 @@ Supported adapters:
7 * nForce3 250Gb MCP 10de:00E4 7 * nForce3 250Gb MCP 10de:00E4
8 * nForce4 MCP 10de:0052 8 * nForce4 MCP 10de:0052
9 * nForce4 MCP-04 10de:0034 9 * nForce4 MCP-04 10de:0034
10 * nForce4 MCP51 10de:0264 10 * nForce MCP51 10de:0264
11 * nForce4 MCP55 10de:0368 11 * nForce MCP55 10de:0368
12 * nForce4 MCP61 10de:03EB 12 * nForce MCP61 10de:03EB
13 * nForce4 MCP65 10de:0446 13 * nForce MCP65 10de:0446
14 * nForce MCP67 10de:0542
15 * nForce MCP73 10de:07D8
16 * nForce MCP78S 10de:0752
17 * nForce MCP79 10de:0AA2
14 18
15Datasheet: not publicly available, but seems to be similar to the 19Datasheet: not publicly available, but seems to be similar to the
16 AMD-8111 SMBus 2.0 adapter. 20 AMD-8111 SMBus 2.0 adapter.
diff --git a/Documentation/i2c/busses/i2c-piix4 b/Documentation/i2c/busses/i2c-piix4
index ef1efa79b1df..f889481762b5 100644
--- a/Documentation/i2c/busses/i2c-piix4
+++ b/Documentation/i2c/busses/i2c-piix4
@@ -4,7 +4,7 @@ Supported adapters:
4 * Intel 82371AB PIIX4 and PIIX4E 4 * Intel 82371AB PIIX4 and PIIX4E
5 * Intel 82443MX (440MX) 5 * Intel 82443MX (440MX)
6 Datasheet: Publicly available at the Intel website 6 Datasheet: Publicly available at the Intel website
7 * ServerWorks OSB4, CSB5, CSB6 and HT-1000 southbridges 7 * ServerWorks OSB4, CSB5, CSB6, HT-1000 and HT-1100 southbridges
8 Datasheet: Only available via NDA from ServerWorks 8 Datasheet: Only available via NDA from ServerWorks
9 * ATI IXP200, IXP300, IXP400, SB600, SB700 and SB800 southbridges 9 * ATI IXP200, IXP300, IXP400, SB600, SB700 and SB800 southbridges
10 Datasheet: Not publicly available 10 Datasheet: Not publicly available
diff --git a/Documentation/i2c/instantiating-devices b/Documentation/i2c/instantiating-devices
new file mode 100644
index 000000000000..b55ce57a84db
--- /dev/null
+++ b/Documentation/i2c/instantiating-devices
@@ -0,0 +1,167 @@
1How to instantiate I2C devices
2==============================
3
4Unlike PCI or USB devices, I2C devices are not enumerated at the hardware
5level. Instead, the software must know which devices are connected on each
6I2C bus segment, and what address these devices are using. For this
7reason, the kernel code must instantiate I2C devices explicitly. There are
8several ways to achieve this, depending on the context and requirements.
9
10
11Method 1: Declare the I2C devices by bus number
12-----------------------------------------------
13
14This method is appropriate when the I2C bus is a system bus as is the case
15for many embedded systems. On such systems, each I2C bus has a number
16which is known in advance. It is thus possible to pre-declare the I2C
17devices which live on this bus. This is done with an array of struct
18i2c_board_info which is registered by calling i2c_register_board_info().
19
20Example (from omap2 h4):
21
22static struct i2c_board_info __initdata h4_i2c_board_info[] = {
23 {
24 I2C_BOARD_INFO("isp1301_omap", 0x2d),
25 .irq = OMAP_GPIO_IRQ(125),
26 },
27 { /* EEPROM on mainboard */
28 I2C_BOARD_INFO("24c01", 0x52),
29 .platform_data = &m24c01,
30 },
31 { /* EEPROM on cpu card */
32 I2C_BOARD_INFO("24c01", 0x57),
33 .platform_data = &m24c01,
34 },
35};
36
37static void __init omap_h4_init(void)
38{
39 (...)
40 i2c_register_board_info(1, h4_i2c_board_info,
41 ARRAY_SIZE(h4_i2c_board_info));
42 (...)
43}
44
45The above code declares 3 devices on I2C bus 1, including their respective
46addresses and custom data needed by their drivers. When the I2C bus in
47question is registered, the I2C devices will be instantiated automatically
48by i2c-core.
49
50The devices will be automatically unbound and destroyed when the I2C bus
51they sit on goes away (if ever.)
52
53
54Method 2: Instantiate the devices explicitly
55--------------------------------------------
56
57This method is appropriate when a larger device uses an I2C bus for
58internal communication. A typical case is TV adapters. These can have a
59tuner, a video decoder, an audio decoder, etc. usually connected to the
60main chip by the means of an I2C bus. You won't know the number of the I2C
61bus in advance, so the method 1 described above can't be used. Instead,
62you can instantiate your I2C devices explicitly. This is done by filling
63a struct i2c_board_info and calling i2c_new_device().
64
65Example (from the sfe4001 network driver):
66
67static struct i2c_board_info sfe4001_hwmon_info = {
68 I2C_BOARD_INFO("max6647", 0x4e),
69};
70
71int sfe4001_init(struct efx_nic *efx)
72{
73 (...)
74 efx->board_info.hwmon_client =
75 i2c_new_device(&efx->i2c_adap, &sfe4001_hwmon_info);
76
77 (...)
78}
79
80The above code instantiates 1 I2C device on the I2C bus which is on the
81network adapter in question.
82
83A variant of this is when you don't know for sure if an I2C device is
84present or not (for example for an optional feature which is not present
85on cheap variants of a board but you have no way to tell them apart), or
86it may have different addresses from one board to the next (manufacturer
87changing its design without notice). In this case, you can call
88i2c_new_probed_device() instead of i2c_new_device().
89
90Example (from the pnx4008 OHCI driver):
91
92static const unsigned short normal_i2c[] = { 0x2c, 0x2d, I2C_CLIENT_END };
93
94static int __devinit usb_hcd_pnx4008_probe(struct platform_device *pdev)
95{
96 (...)
97 struct i2c_adapter *i2c_adap;
98 struct i2c_board_info i2c_info;
99
100 (...)
101 i2c_adap = i2c_get_adapter(2);
102 memset(&i2c_info, 0, sizeof(struct i2c_board_info));
103 strlcpy(i2c_info.name, "isp1301_pnx", I2C_NAME_SIZE);
104 isp1301_i2c_client = i2c_new_probed_device(i2c_adap, &i2c_info,
105 normal_i2c);
106 i2c_put_adapter(i2c_adap);
107 (...)
108}
109
110The above code instantiates up to 1 I2C device on the I2C bus which is on
111the OHCI adapter in question. It first tries at address 0x2c, if nothing
112is found there it tries address 0x2d, and if still nothing is found, it
113simply gives up.
114
115The driver which instantiated the I2C device is responsible for destroying
116it on cleanup. This is done by calling i2c_unregister_device() on the
117pointer that was earlier returned by i2c_new_device() or
118i2c_new_probed_device().
119
120
121Method 3: Probe an I2C bus for certain devices
122----------------------------------------------
123
124Sometimes you do not have enough information about an I2C device, not even
125to call i2c_new_probed_device(). The typical case is hardware monitoring
126chips on PC mainboards. There are several dozen models, which can live
127at 25 different addresses. Given the huge number of mainboards out there,
128it is next to impossible to build an exhaustive list of the hardware
129monitoring chips being used. Fortunately, most of these chips have
130manufacturer and device ID registers, so they can be identified by
131probing.
132
133In that case, I2C devices are neither declared nor instantiated
134explicitly. Instead, i2c-core will probe for such devices as soon as their
135drivers are loaded, and if any is found, an I2C device will be
136instantiated automatically. In order to prevent any misbehavior of this
137mechanism, the following restrictions apply:
138* The I2C device driver must implement the detect() method, which
139 identifies a supported device by reading from arbitrary registers.
140* Only buses which are likely to have a supported device and agree to be
141 probed, will be probed. For example this avoids probing for hardware
142 monitoring chips on a TV adapter.
143
144Example:
145See lm90_driver and lm90_detect() in drivers/hwmon/lm90.c
146
147I2C devices instantiated as a result of such a successful probe will be
148destroyed automatically when the driver which detected them is removed,
149or when the underlying I2C bus is itself destroyed, whichever happens
150first.
151
152Those of you familiar with the i2c subsystem of 2.4 kernels and early 2.6
153kernels will find out that this method 3 is essentially similar to what
154was done there. Two significant differences are:
155* Probing is only one way to instantiate I2C devices now, while it was the
156 only way back then. Where possible, methods 1 and 2 should be preferred.
157 Method 3 should only be used when there is no other way, as it can have
158 undesirable side effects.
159* I2C buses must now explicitly say which I2C driver classes can probe
160 them (by the means of the class bitfield), while all I2C buses were
161 probed by default back then. The default is an empty class which means
162 that no probing happens. The purpose of the class bitfield is to limit
163 the aforementioned undesirable side effects.
164
165Once again, method 3 should be avoided wherever possible. Explicit device
166instantiation (methods 1 and 2) is much preferred for it is safer and
167faster.
diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients
index 6b9af7d479c2..c1a06f989cf7 100644
--- a/Documentation/i2c/writing-clients
+++ b/Documentation/i2c/writing-clients
@@ -207,15 +207,26 @@ You simply have to define a detect callback which will attempt to
207identify supported devices (returning 0 for supported ones and -ENODEV 207identify supported devices (returning 0 for supported ones and -ENODEV
208for unsupported ones), a list of addresses to probe, and a device type 208for unsupported ones), a list of addresses to probe, and a device type
209(or class) so that only I2C buses which may have that type of device 209(or class) so that only I2C buses which may have that type of device
210connected (and not otherwise enumerated) will be probed. The i2c 210connected (and not otherwise enumerated) will be probed. For example,
211core will then call you back as needed and will instantiate a device 211a driver for a hardware monitoring chip for which auto-detection is
212for you for every successful detection. 212needed would set its class to I2C_CLASS_HWMON, and only I2C adapters
213with a class including I2C_CLASS_HWMON would be probed by this driver.
214Note that the absence of matching classes does not prevent the use of
215a device of that type on the given I2C adapter. All it prevents is
216auto-detection; explicit instantiation of devices is still possible.
213 217
214Note that this mechanism is purely optional and not suitable for all 218Note that this mechanism is purely optional and not suitable for all
215devices. You need some reliable way to identify the supported devices 219devices. You need some reliable way to identify the supported devices
216(typically using device-specific, dedicated identification registers), 220(typically using device-specific, dedicated identification registers),
217otherwise misdetections are likely to occur and things can get wrong 221otherwise misdetections are likely to occur and things can get wrong
218quickly. 222quickly. Keep in mind that the I2C protocol doesn't include any
223standard way to detect the presence of a chip at a given address, let
224alone a standard way to identify devices. Even worse is the lack of
225semantics associated to bus transfers, which means that the same
226transfer can be seen as a read operation by a chip and as a write
227operation by another chip. For these reasons, explicit device
228instantiation should always be preferred to auto-detection where
229possible.
219 230
220 231
221Device Deletion 232Device Deletion
diff --git a/Documentation/ia64/kvm.txt b/Documentation/ia64/kvm.txt
index 84f7cb3d5bec..ffb5c80bec3e 100644
--- a/Documentation/ia64/kvm.txt
+++ b/Documentation/ia64/kvm.txt
@@ -42,7 +42,7 @@ Note: For step 2, please make sure that host page size == TARGET_PAGE_SIZE of qe
42 hg clone http://xenbits.xensource.com/ext/efi-vfirmware.hg 42 hg clone http://xenbits.xensource.com/ext/efi-vfirmware.hg
43 you can get the firmware's binary in the directory of efi-vfirmware.hg/binaries. 43 you can get the firmware's binary in the directory of efi-vfirmware.hg/binaries.
44 44
45 (3) Rename the firware you owned to Flash.fd, and copy it to /usr/local/share/qemu 45 (3) Rename the firmware you owned to Flash.fd, and copy it to /usr/local/share/qemu
46 46
474. Boot up Linux or Windows guests: 474. Boot up Linux or Windows guests:
48 4.1 Create or install a image for guest boot. If you have xen experience, it should be easy. 48 4.1 Create or install a image for guest boot. If you have xen experience, it should be easy.
diff --git a/Documentation/infiniband/ipoib.txt b/Documentation/infiniband/ipoib.txt
index 864ff3283780..6d40f00b358c 100644
--- a/Documentation/infiniband/ipoib.txt
+++ b/Documentation/infiniband/ipoib.txt
@@ -24,6 +24,49 @@ Partitions and P_Keys
24 The P_Key for any interface is given by the "pkey" file, and the 24 The P_Key for any interface is given by the "pkey" file, and the
25 main interface for a subinterface is in "parent." 25 main interface for a subinterface is in "parent."
26 26
27Datagram vs Connected modes
28
29 The IPoIB driver supports two modes of operation: datagram and
30 connected. The mode is set and read through an interface's
31 /sys/class/net/<intf name>/mode file.
32
33 In datagram mode, the IB UD (Unreliable Datagram) transport is used
34 and so the interface MTU has is equal to the IB L2 MTU minus the
35 IPoIB encapsulation header (4 bytes). For example, in a typical IB
36 fabric with a 2K MTU, the IPoIB MTU will be 2048 - 4 = 2044 bytes.
37
38 In connected mode, the IB RC (Reliable Connected) transport is used.
39 Connected mode is to takes advantage of the connected nature of the
40 IB transport and allows an MTU up to the maximal IP packet size of
41 64K, which reduces the number of IP packets needed for handling
42 large UDP datagrams, TCP segments, etc and increases the performance
43 for large messages.
44
45 In connected mode, the interface's UD QP is still used for multicast
46 and communication with peers that don't support connected mode. In
47 this case, RX emulation of ICMP PMTU packets is used to cause the
48 networking stack to use the smaller UD MTU for these neighbours.
49
50Stateless offloads
51
52 If the IB HW supports IPoIB stateless offloads, IPoIB advertises
53 TCP/IP checksum and/or Large Send (LSO) offloading capability to the
54 network stack.
55
56 Large Receive (LRO) offloading is also implemented and may be turned
57 on/off using ethtool calls. Currently LRO is supported only for
58 checksum offload capable devices.
59
60 Stateless offloads are supported only in datagram mode.
61
62Interrupt moderation
63
64 If the underlying IB device supports CQ event moderation, one can
65 use ethtool to set interrupt mitigation parameters and thus reduce
66 the overhead incurred by handling interrupts. The main code path of
67 IPoIB doesn't use events for TX completion signaling so only RX
68 moderation is supported.
69
27Debugging Information 70Debugging Information
28 71
29 By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set 72 By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set
@@ -55,3 +98,5 @@ References
55 http://ietf.org/rfc/rfc4391.txt 98 http://ietf.org/rfc/rfc4391.txt
56 IP over InfiniBand (IPoIB) Architecture (RFC 4392) 99 IP over InfiniBand (IPoIB) Architecture (RFC 4392)
57 http://ietf.org/rfc/rfc4392.txt 100 http://ietf.org/rfc/rfc4392.txt
101 IP over InfiniBand: Connected Mode (RFC 4755)
102 http://ietf.org/rfc/rfc4755.txt
diff --git a/Documentation/input/bcm5974.txt b/Documentation/input/bcm5974.txt
new file mode 100644
index 000000000000..5e22dcf6d48d
--- /dev/null
+++ b/Documentation/input/bcm5974.txt
@@ -0,0 +1,65 @@
1BCM5974 Driver (bcm5974)
2------------------------
3 Copyright (C) 2008-2009 Henrik Rydberg <rydberg@euromail.se>
4
5The USB initialization and package decoding was made by Scott Shawcroft as
6part of the touchd user-space driver project:
7 Copyright (C) 2008 Scott Shawcroft (scott.shawcroft@gmail.com)
8
9The BCM5974 driver is based on the appletouch driver:
10 Copyright (C) 2001-2004 Greg Kroah-Hartman (greg@kroah.com)
11 Copyright (C) 2005 Johannes Berg (johannes@sipsolutions.net)
12 Copyright (C) 2005 Stelian Pop (stelian@popies.net)
13 Copyright (C) 2005 Frank Arnold (frank@scirocco-5v-turbo.de)
14 Copyright (C) 2005 Peter Osterlund (petero2@telia.com)
15 Copyright (C) 2005 Michael Hanselmann (linux-kernel@hansmi.ch)
16 Copyright (C) 2006 Nicolas Boichat (nicolas@boichat.ch)
17
18This driver adds support for the multi-touch trackpad on the new Apple
19Macbook Air and Macbook Pro laptops. It replaces the appletouch driver on
20those computers, and integrates well with the synaptics driver of the Xorg
21system.
22
23Known to work on Macbook Air, Macbook Pro Penryn and the new unibody
24Macbook 5 and Macbook Pro 5.
25
26Usage
27-----
28
29The driver loads automatically for the supported usb device ids, and
30becomes available both as an event device (/dev/input/event*) and as a
31mouse via the mousedev driver (/dev/input/mice).
32
33USB Race
34--------
35
36The Apple multi-touch trackpads report both mouse and keyboard events via
37different interfaces of the same usb device. This creates a race condition
38with the HID driver, which, if not told otherwise, will find the standard
39HID mouse and keyboard, and claim the whole device. To remedy, the usb
40product id must be listed in the mouse_ignore list of the hid driver.
41
42Debug output
43------------
44
45To ease the development for new hardware version, verbose packet output can
46be switched on with the debug kernel module parameter. The range [1-9]
47yields different levels of verbosity. Example (as root):
48
49echo -n 9 > /sys/module/bcm5974/parameters/debug
50
51tail -f /var/log/debug
52
53echo -n 0 > /sys/module/bcm5974/parameters/debug
54
55Trivia
56------
57
58The driver was developed at the ubuntu forums in June 2008 [1], and now has
59a more permanent home at bitmath.org [2].
60
61Links
62-----
63
64[1] http://ubuntuforums.org/showthread.php?t=840040
65[2] http://http://bitmath.org/code/
diff --git a/Documentation/input/multi-touch-protocol.txt b/Documentation/input/multi-touch-protocol.txt
new file mode 100644
index 000000000000..9f09557aea39
--- /dev/null
+++ b/Documentation/input/multi-touch-protocol.txt
@@ -0,0 +1,140 @@
1Multi-touch (MT) Protocol
2-------------------------
3 Copyright (C) 2009 Henrik Rydberg <rydberg@euromail.se>
4
5
6Introduction
7------------
8
9In order to utilize the full power of the new multi-touch devices, a way to
10report detailed finger data to user space is needed. This document
11describes the multi-touch (MT) protocol which allows kernel drivers to
12report details for an arbitrary number of fingers.
13
14
15Usage
16-----
17
18Anonymous finger details are sent sequentially as separate packets of ABS
19events. Only the ABS_MT events are recognized as part of a finger
20packet. The end of a packet is marked by calling the input_mt_sync()
21function, which generates a SYN_MT_REPORT event. The end of multi-touch
22transfer is marked by calling the usual input_sync() function.
23
24A set of ABS_MT events with the desired properties is defined. The events
25are divided into categories, to allow for partial implementation. The
26minimum set consists of ABS_MT_TOUCH_MAJOR, ABS_MT_POSITION_X and
27ABS_MT_POSITION_Y, which allows for multiple fingers to be tracked. If the
28device supports it, the ABS_MT_WIDTH_MAJOR may be used to provide the size
29of the approaching finger. Anisotropy and direction may be specified with
30ABS_MT_TOUCH_MINOR, ABS_MT_WIDTH_MINOR and ABS_MT_ORIENTATION. Devices with
31more granular information may specify general shapes as blobs, i.e., as a
32sequence of rectangular shapes grouped together by an
33ABS_MT_BLOB_ID. Finally, the ABS_MT_TOOL_TYPE may be used to specify
34whether the touching tool is a finger or a pen or something else.
35
36
37Event Semantics
38---------------
39
40The word "contact" is used to describe a tool which is in direct contact
41with the surface. A finger, a pen or a rubber all classify as contacts.
42
43ABS_MT_TOUCH_MAJOR
44
45The length of the major axis of the contact. The length should be given in
46surface units. If the surface has an X times Y resolution, the largest
47possible value of ABS_MT_TOUCH_MAJOR is sqrt(X^2 + Y^2), the diagonal.
48
49ABS_MT_TOUCH_MINOR
50
51The length, in surface units, of the minor axis of the contact. If the
52contact is circular, this event can be omitted.
53
54ABS_MT_WIDTH_MAJOR
55
56The length, in surface units, of the major axis of the approaching
57tool. This should be understood as the size of the tool itself. The
58orientation of the contact and the approaching tool are assumed to be the
59same.
60
61ABS_MT_WIDTH_MINOR
62
63The length, in surface units, of the minor axis of the approaching
64tool. Omit if circular.
65
66The above four values can be used to derive additional information about
67the contact. The ratio ABS_MT_TOUCH_MAJOR / ABS_MT_WIDTH_MAJOR approximates
68the notion of pressure. The fingers of the hand and the palm all have
69different characteristic widths [1].
70
71ABS_MT_ORIENTATION
72
73The orientation of the ellipse. The value should describe half a revolution
74clockwise around the touch center. The scale of the value is arbitrary, but
75zero should be returned for an ellipse aligned along the Y axis of the
76surface. As an example, an index finger placed straight onto the axis could
77return zero orientation, something negative when twisted to the left, and
78something positive when twisted to the right. This value can be omitted if
79the touching object is circular, or if the information is not available in
80the kernel driver.
81
82ABS_MT_POSITION_X
83
84The surface X coordinate of the center of the touching ellipse.
85
86ABS_MT_POSITION_Y
87
88The surface Y coordinate of the center of the touching ellipse.
89
90ABS_MT_TOOL_TYPE
91
92The type of approaching tool. A lot of kernel drivers cannot distinguish
93between different tool types, such as a finger or a pen. In such cases, the
94event should be omitted. The protocol currently supports MT_TOOL_FINGER and
95MT_TOOL_PEN [2].
96
97ABS_MT_BLOB_ID
98
99The BLOB_ID groups several packets together into one arbitrarily shaped
100contact. This is a low-level anonymous grouping, and should not be confused
101with the high-level contactID, explained below. Most kernel drivers will
102not have this capability, and can safely omit the event.
103
104
105Finger Tracking
106---------------
107
108The kernel driver should generate an arbitrary enumeration of the set of
109anonymous contacts currently on the surface. The order in which the packets
110appear in the event stream is not important.
111
112The process of finger tracking, i.e., to assign a unique contactID to each
113initiated contact on the surface, is left to user space; preferably the
114multi-touch X driver [3]. In that driver, the contactID stays the same and
115unique until the contact vanishes (when the finger leaves the surface). The
116problem of assigning a set of anonymous fingers to a set of identified
117fingers is a euclidian bipartite matching problem at each event update, and
118relies on a sufficiently rapid update rate.
119
120Notes
121-----
122
123In order to stay compatible with existing applications, the data
124reported in a finger packet must not be recognized as single-touch
125events. In addition, all finger data must bypass input filtering,
126since subsequent events of the same type refer to different fingers.
127
128The first kernel driver to utilize the MT protocol is the bcm5974 driver,
129where examples can be found.
130
131[1] With the extension ABS_MT_APPROACH_X and ABS_MT_APPROACH_Y, the
132difference between the contact position and the approaching tool position
133could be used to derive tilt.
134[2] The list can of course be extended.
135[3] The multi-touch X driver is currently in the prototyping stage. At the
136time of writing (April 2009), the MT protocol is not yet merged, and the
137prototype implements finger matching, basic mouse support and two-finger
138scrolling. The project aims at improving the quality of current multi-touch
139functionality available in the synaptics X driver, and in addition
140implement more advanced gestures.
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index f1d639903325..1f779a25c703 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -122,10 +122,8 @@ Code Seq# Include File Comments
122'c' 00-7F linux/coda.h conflict! 122'c' 00-7F linux/coda.h conflict!
123'c' 80-9F arch/s390/include/asm/chsc.h 123'c' 80-9F arch/s390/include/asm/chsc.h
124'd' 00-FF linux/char/drm/drm/h conflict! 124'd' 00-FF linux/char/drm/drm/h conflict!
125'd' 00-DF linux/video_decoder.h conflict!
126'd' F0-FF linux/digi1.h 125'd' F0-FF linux/digi1.h
127'e' all linux/digi1.h conflict! 126'e' all linux/digi1.h conflict!
128'e' 00-1F linux/video_encoder.h conflict!
129'e' 00-1F net/irda/irtty.h conflict! 127'e' 00-1F net/irda/irtty.h conflict!
130'f' 00-1F linux/ext2_fs.h 128'f' 00-1F linux/ext2_fs.h
131'h' 00-7F Charon filesystem 129'h' 00-7F Charon filesystem
diff --git a/Documentation/isdn/00-INDEX b/Documentation/isdn/00-INDEX
index 9fee5f2e5c62..5a2d69989a8c 100644
--- a/Documentation/isdn/00-INDEX
+++ b/Documentation/isdn/00-INDEX
@@ -2,8 +2,14 @@
2 - this file (info on ISDN implementation for Linux) 2 - this file (info on ISDN implementation for Linux)
3CREDITS 3CREDITS
4 - list of the kind folks that brought you this stuff. 4 - list of the kind folks that brought you this stuff.
5HiSax.cert
6 - information about the ITU approval certification of the HiSax driver.
5INTERFACE 7INTERFACE
6 - description of Linklevel and Hardwarelevel ISDN interface. 8 - description of isdn4linux Link Level and Hardware Level interfaces.
9INTERFACE.fax
10 - description of the fax subinterface of isdn4linux.
11INTERFACE.CAPI
12 - description of kernel CAPI Link Level to Hardware Level interface.
7README 13README
8 - general info on what you need and what to do for Linux ISDN. 14 - general info on what you need and what to do for Linux ISDN.
9README.FAQ 15README.FAQ
@@ -12,6 +18,8 @@ README.audio
12 - info for running audio over ISDN. 18 - info for running audio over ISDN.
13README.fax 19README.fax
14 - info for using Fax over ISDN. 20 - info for using Fax over ISDN.
21README.gigaset
22 - info on the drivers for Siemens Gigaset ISDN adapters.
15README.icn 23README.icn
16 - info on the ICN-ISDN-card and its driver. 24 - info on the ICN-ISDN-card and its driver.
17README.HiSax 25README.HiSax
@@ -37,7 +45,8 @@ README.diversion
37README.sc 45README.sc
38 - info on driver for Spellcaster cards. 46 - info on driver for Spellcaster cards.
39README.x25 47README.x25
40 _ info for running X.25 over ISDN. 48 - info for running X.25 over ISDN.
41README.hysdn 49README.hysdn
42 - info on driver for Hypercope active HYSDN cards 50 - info on driver for Hypercope active HYSDN cards
43 51README.mISDN
52 - info on the Modular ISDN subsystem (mISDN).
diff --git a/Documentation/isdn/INTERFACE.CAPI b/Documentation/isdn/INTERFACE.CAPI
new file mode 100644
index 000000000000..786d619b36e5
--- /dev/null
+++ b/Documentation/isdn/INTERFACE.CAPI
@@ -0,0 +1,213 @@
1Kernel CAPI Interface to Hardware Drivers
2-----------------------------------------
3
41. Overview
5
6From the CAPI 2.0 specification:
7COMMON-ISDN-API (CAPI) is an application programming interface standard used
8to access ISDN equipment connected to basic rate interfaces (BRI) and primary
9rate interfaces (PRI).
10
11Kernel CAPI operates as a dispatching layer between CAPI applications and CAPI
12hardware drivers. Hardware drivers register ISDN devices (controllers, in CAPI
13lingo) with Kernel CAPI to indicate their readiness to provide their service
14to CAPI applications. CAPI applications also register with Kernel CAPI,
15requesting association with a CAPI device. Kernel CAPI then dispatches the
16application registration to an available device, forwarding it to the
17corresponding hardware driver. Kernel CAPI then forwards CAPI messages in both
18directions between the application and the hardware driver.
19
20Format and semantics of CAPI messages are specified in the CAPI 2.0 standard.
21This standard is freely available from http://www.capi.org.
22
23
242. Driver and Device Registration
25
26CAPI drivers optionally register themselves with Kernel CAPI by calling the
27Kernel CAPI function register_capi_driver() with a pointer to a struct
28capi_driver. This structure must be filled with the name and revision of the
29driver, and optionally a pointer to a callback function, add_card(). The
30registration can be revoked by calling the function unregister_capi_driver()
31with a pointer to the same struct capi_driver.
32
33CAPI drivers must register each of the ISDN devices they control with Kernel
34CAPI by calling the Kernel CAPI function attach_capi_ctr() with a pointer to a
35struct capi_ctr before they can be used. This structure must be filled with
36the names of the driver and controller, and a number of callback function
37pointers which are subsequently used by Kernel CAPI for communicating with the
38driver. The registration can be revoked by calling the function
39detach_capi_ctr() with a pointer to the same struct capi_ctr.
40
41Before the device can be actually used, the driver must fill in the device
42information fields 'manu', 'version', 'profile' and 'serial' in the capi_ctr
43structure of the device, and signal its readiness by calling capi_ctr_ready().
44From then on, Kernel CAPI may call the registered callback functions for the
45device.
46
47If the device becomes unusable for any reason (shutdown, disconnect ...), the
48driver has to call capi_ctr_reseted(). This will prevent further calls to the
49callback functions by Kernel CAPI.
50
51
523. Application Registration and Communication
53
54Kernel CAPI forwards registration requests from applications (calls to CAPI
55operation CAPI_REGISTER) to an appropriate hardware driver by calling its
56register_appl() callback function. A unique Application ID (ApplID, u16) is
57allocated by Kernel CAPI and passed to register_appl() along with the
58parameter structure provided by the application. This is analogous to the
59open() operation on regular files or character devices.
60
61After a successful return from register_appl(), CAPI messages from the
62application may be passed to the driver for the device via calls to the
63send_message() callback function. The CAPI message to send is stored in the
64data portion of an skb. Conversely, the driver may call Kernel CAPI's
65capi_ctr_handle_message() function to pass a received CAPI message to Kernel
66CAPI for forwarding to an application, specifying its ApplID.
67
68Deregistration requests (CAPI operation CAPI_RELEASE) from applications are
69forwarded as calls to the release_appl() callback function, passing the same
70ApplID as with register_appl(). After return from release_appl(), no CAPI
71messages for that application may be passed to or from the device anymore.
72
73
744. Data Structures
75
764.1 struct capi_driver
77
78This structure describes a Kernel CAPI driver itself. It is used in the
79register_capi_driver() and unregister_capi_driver() functions, and contains
80the following non-private fields, all to be set by the driver before calling
81register_capi_driver():
82
83char name[32]
84 the name of the driver, as a zero-terminated ASCII string
85char revision[32]
86 the revision number of the driver, as a zero-terminated ASCII string
87int (*add_card)(struct capi_driver *driver, capicardparams *data)
88 a callback function pointer (may be NULL)
89
90
914.2 struct capi_ctr
92
93This structure describes an ISDN device (controller) handled by a Kernel CAPI
94driver. After registration via the attach_capi_ctr() function it is passed to
95all controller specific lower layer interface and callback functions to
96identify the controller to operate on.
97
98It contains the following non-private fields:
99
100- to be set by the driver before calling attach_capi_ctr():
101
102struct module *owner
103 pointer to the driver module owning the device
104
105void *driverdata
106 an opaque pointer to driver specific data, not touched by Kernel CAPI
107
108char name[32]
109 the name of the controller, as a zero-terminated ASCII string
110
111char *driver_name
112 the name of the driver, as a zero-terminated ASCII string
113
114int (*load_firmware)(struct capi_ctr *ctrlr, capiloaddata *ldata)
115 (optional) pointer to a callback function for sending firmware and
116 configuration data to the device
117
118void (*reset_ctr)(struct capi_ctr *ctrlr)
119 pointer to a callback function for performing a reset on the device,
120 releasing all registered applications
121
122void (*register_appl)(struct capi_ctr *ctrlr, u16 applid,
123 capi_register_params *rparam)
124void (*release_appl)(struct capi_ctr *ctrlr, u16 applid)
125 pointers to callback functions for registration and deregistration of
126 applications with the device
127
128u16 (*send_message)(struct capi_ctr *ctrlr, struct sk_buff *skb)
129 pointer to a callback function for sending a CAPI message to the
130 device
131
132char *(*procinfo)(struct capi_ctr *ctrlr)
133 pointer to a callback function returning the entry for the device in
134 the CAPI controller info table, /proc/capi/controller
135
136read_proc_t *ctr_read_proc
137 pointer to the read_proc callback function for the device's proc file
138 system entry, /proc/capi/controllers/<n>; will be called with a
139 pointer to the device's capi_ctr structure as the last (data) argument
140
141- to be filled in before calling capi_ctr_ready():
142
143u8 manu[CAPI_MANUFACTURER_LEN]
144 value to return for CAPI_GET_MANUFACTURER
145
146capi_version version
147 value to return for CAPI_GET_VERSION
148
149capi_profile profile
150 value to return for CAPI_GET_PROFILE
151
152u8 serial[CAPI_SERIAL_LEN]
153 value to return for CAPI_GET_SERIAL
154
155
1565. Lower Layer Interface Functions
157
158(declared in <linux/isdn/capilli.h>)
159
160void register_capi_driver(struct capi_driver *drvr)
161void unregister_capi_driver(struct capi_driver *drvr)
162 register/unregister a driver with Kernel CAPI
163
164int attach_capi_ctr(struct capi_ctr *ctrlr)
165int detach_capi_ctr(struct capi_ctr *ctrlr)
166 register/unregister a device (controller) with Kernel CAPI
167
168void capi_ctr_ready(struct capi_ctr *ctrlr)
169void capi_ctr_reseted(struct capi_ctr *ctrlr)
170 signal controller ready/not ready
171
172void capi_ctr_suspend_output(struct capi_ctr *ctrlr)
173void capi_ctr_resume_output(struct capi_ctr *ctrlr)
174 signal suspend/resume
175
176void capi_ctr_handle_message(struct capi_ctr * ctrlr, u16 applid,
177 struct sk_buff *skb)
178 pass a received CAPI message to Kernel CAPI
179 for forwarding to the specified application
180
181
1826. Helper Functions and Macros
183
184Library functions (from <linux/isdn/capilli.h>):
185
186void capilib_new_ncci(struct list_head *head, u16 applid,
187 u32 ncci, u32 winsize)
188void capilib_free_ncci(struct list_head *head, u16 applid, u32 ncci)
189void capilib_release_appl(struct list_head *head, u16 applid)
190void capilib_release(struct list_head *head)
191void capilib_data_b3_conf(struct list_head *head, u16 applid,
192 u32 ncci, u16 msgid)
193u16 capilib_data_b3_req(struct list_head *head, u16 applid,
194 u32 ncci, u16 msgid)
195
196
197Macros to extract/set element values from/in a CAPI message header
198(from <linux/isdn/capiutil.h>):
199
200Get Macro Set Macro Element (Type)
201
202CAPIMSG_LEN(m) CAPIMSG_SETLEN(m, len) Total Length (u16)
203CAPIMSG_APPID(m) CAPIMSG_SETAPPID(m, applid) ApplID (u16)
204CAPIMSG_COMMAND(m) CAPIMSG_SETCOMMAND(m,cmd) Command (u8)
205CAPIMSG_SUBCOMMAND(m) CAPIMSG_SETSUBCOMMAND(m, cmd) Subcommand (u8)
206CAPIMSG_CMD(m) - Command*256
207 + Subcommand (u16)
208CAPIMSG_MSGID(m) CAPIMSG_SETMSGID(m, msgid) Message Number (u16)
209
210CAPIMSG_CONTROL(m) CAPIMSG_SETCONTROL(m, contr) Controller/PLCI/NCCI
211 (u32)
212CAPIMSG_DATALEN(m) CAPIMSG_SETDATALEN(m, len) Data Length (u16)
213
diff --git a/Documentation/isdn/README.gigaset b/Documentation/isdn/README.gigaset
index 55b2852904a4..02c0e9341dd8 100644
--- a/Documentation/isdn/README.gigaset
+++ b/Documentation/isdn/README.gigaset
@@ -61,24 +61,28 @@ GigaSet 307x Device Driver
61 --------------------- 61 ---------------------
622.1. Modules 622.1. Modules
63 ------- 63 -------
64 To get the device working, you have to load the proper kernel module. You 64 For the devices to work, the proper kernel modules have to be loaded.
65 can do this using 65 This normally happens automatically when the system detects the USB
66 modprobe modulename 66 device (base, M105) or when the line discipline is attached (M101). It
67 where modulename is ser_gigaset (M101), usb_gigaset (M105), or 67 can also be triggered manually using the modprobe(8) command, for example
68 bas_gigaset (direct USB connection to the base). 68 for troubleshooting or to pass module parameters.
69 69
70 The module ser_gigaset provides a serial line discipline N_GIGASET_M101 70 The module ser_gigaset provides a serial line discipline N_GIGASET_M101
71 which drives the device through the regular serial line driver. To use it, 71 which drives the device through the regular serial line driver. It must
72 run the Gigaset M101 daemon "gigasetm101d" (also available from 72 be attached to the serial line to which the M101 is connected with the
73 http://sourceforge.net/projects/gigaset307x/) with the device file of the 73 ldattach(8) command (requires util-linux-ng release 2.14 or later), for
74 RS232 port to the M101 as an argument, for example: 74 example:
75 gigasetm101d /dev/ttyS1 75 ldattach GIGASET_M101 /dev/ttyS1
76 This will open the device file, set its line discipline to N_GIGASET_M101, 76 This will open the device file, attach the line discipline to it, and
77 and then sleep in the background, keeping the device open so that the 77 then sleep in the background, keeping the device open so that the line
78 line discipline remains active. To deactivate it, kill the daemon, for 78 discipline remains active. To deactivate it, kill the daemon, for example
79 example with 79 with
80 killall gigasetm101d 80 killall ldattach
81 before disconnecting the device. 81 before disconnecting the device. To have this happen automatically at
82 system startup/shutdown on an LSB compatible system, create and activate
83 an appropriate LSB startup script /etc/init.d/gigaset. (The init name
84 'gigaset' is officially assigned to this project by LANANA.)
85 Alternatively, just add the 'ldattach' command line to /etc/rc.local.
82 86
832.2. Device nodes for user space programs 872.2. Device nodes for user space programs
84 ------------------------------------ 88 ------------------------------------
@@ -194,10 +198,11 @@ GigaSet 307x Device Driver
194 operation (for wireless access to the base), but are needed for access 198 operation (for wireless access to the base), but are needed for access
195 to the M105's own configuration mode (registration to the base, baudrate 199 to the M105's own configuration mode (registration to the base, baudrate
196 and line format settings, device status queries) via the gigacontr 200 and line format settings, device status queries) via the gigacontr
197 utility. Their use is disabled in the driver by default for safety 201 utility. Their use is controlled by the kernel configuration option
198 reasons but can be enabled by setting the kernel configuration option 202 "Support for undocumented USB requests" (CONFIG_GIGASET_UNDOCREQ). If you
199 "Support for undocumented USB requests" (GIGASET_UNDOCREQ) to "Y" and 203 encounter error code -ENOTTY when trying to use some features of the
200 recompiling. 204 M105, try setting that option to "y" via 'make {x,menu}config' and
205 recompiling the driver.
201 206
202 207
2033. Troubleshooting 2083. Troubleshooting
@@ -228,6 +233,13 @@ GigaSet 307x Device Driver
228 Solution: 233 Solution:
229 Select Unimodem mode for all DECT data adapters. (see section 2.4.) 234 Select Unimodem mode for all DECT data adapters. (see section 2.4.)
230 235
236 Problem:
237 You want to configure your USB DECT data adapter (M105) but gigacontr
238 reports an error: "/dev/ttyGU0: Inappropriate ioctl for device".
239 Solution:
240 Recompile the usb_gigaset driver with the kernel configuration option
241 CONFIG_GIGASET_UNDOCREQ set to 'y'. (see section 2.6.)
242
2313.2. Telling the driver to provide more information 2433.2. Telling the driver to provide more information
232 ---------------------------------------------- 244 ----------------------------------------------
233 Building the driver with the "Gigaset debugging" kernel configuration 245 Building the driver with the "Gigaset debugging" kernel configuration
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt
index 51104f9194a5..d76cfd8712e1 100644
--- a/Documentation/kbuild/makefiles.txt
+++ b/Documentation/kbuild/makefiles.txt
@@ -40,10 +40,16 @@ This document describes the Linux kernel Makefiles.
40 --- 6.7 Custom kbuild commands 40 --- 6.7 Custom kbuild commands
41 --- 6.8 Preprocessing linker scripts 41 --- 6.8 Preprocessing linker scripts
42 42
43 === 7 Kbuild Variables 43 === 7 Kbuild syntax for exported headers
44 === 8 Makefile language 44 --- 7.1 header-y
45 === 9 Credits 45 --- 7.2 objhdr-y
46 === 10 TODO 46 --- 7.3 destination-y
47 --- 7.4 unifdef-y (deprecated)
48
49 === 8 Kbuild Variables
50 === 9 Makefile language
51 === 10 Credits
52 === 11 TODO
47 53
48=== 1 Overview 54=== 1 Overview
49 55
@@ -310,6 +316,16 @@ more details, with real examples.
310 #arch/m68k/fpsp040/Makefile 316 #arch/m68k/fpsp040/Makefile
311 ldflags-y := -x 317 ldflags-y := -x
312 318
319 subdir-ccflags-y, subdir-asflags-y
320 The two flags listed above are similar to ccflags-y and as-falgs-y.
321 The difference is that the subdir- variants has effect for the kbuild
322 file where tey are present and all subdirectories.
323 Options specified using subdir-* are added to the commandline before
324 the options specified using the non-subdir variants.
325
326 Example:
327 subdir-ccflags-y := -Werror
328
313 CFLAGS_$@, AFLAGS_$@ 329 CFLAGS_$@, AFLAGS_$@
314 330
315 CFLAGS_$@ and AFLAGS_$@ only apply to commands in current 331 CFLAGS_$@ and AFLAGS_$@ only apply to commands in current
@@ -1143,8 +1159,69 @@ When kbuild executes, the following steps are followed (roughly):
1143 The kbuild infrastructure for *lds file are used in several 1159 The kbuild infrastructure for *lds file are used in several
1144 architecture-specific files. 1160 architecture-specific files.
1145 1161
1162=== 7 Kbuild syntax for exported headers
1163
1164The kernel include a set of headers that is exported to userspace.
1165Many headers can be exported as-is but other headers requires a
1166minimal pre-processing before they are ready for user-space.
1167The pre-processing does:
1168- drop kernel specific annotations
1169- drop include of compiler.h
1170- drop all sections that is kernel internat (guarded by ifdef __KERNEL__)
1171
1172Each relevant directory contain a file name "Kbuild" which specify the
1173headers to be exported.
1174See subsequent chapter for the syntax of the Kbuild file.
1175
1176 --- 7.1 header-y
1177
1178 header-y specify header files to be exported.
1179
1180 Example:
1181 #include/linux/Kbuild
1182 header-y += usb/
1183 header-y += aio_abi.h
1184
1185 The convention is to list one file per line and
1186 preferably in alphabetic order.
1187
1188 header-y also specify which subdirectories to visit.
1189 A subdirectory is identified by a trailing '/' which
1190 can be seen in the example above for the usb subdirectory.
1191
1192 Subdirectories are visited before their parent directories.
1193
1194 --- 7.2 objhdr-y
1195
1196 objhdr-y specifies generated files to be exported.
1197 Generated files are special as they need to be looked
1198 up in another directory when doing 'make O=...' builds.
1199
1200 Example:
1201 #include/linux/Kbuild
1202 objhdr-y += version.h
1203
1204 --- 7.3 destination-y
1205
1206 When an architecture have a set of exported headers that needs to be
1207 exported to a different directory destination-y is used.
1208 destination-y specify the destination directory for all exported
1209 headers in the file where it is present.
1210
1211 Example:
1212 #arch/xtensa/platforms/s6105/include/platform/Kbuild
1213 destination-y := include/linux
1214
1215 In the example above all exported headers in the Kbuild file
1216 will be located in the directory "include/linux" when exported.
1217
1218
1219 --- 7.4 unifdef-y (deprecated)
1220
1221 unifdef-y is deprecated. A direct replacement is header-y.
1222
1146 1223
1147=== 7 Kbuild Variables 1224=== 8 Kbuild Variables
1148 1225
1149The top Makefile exports the following variables: 1226The top Makefile exports the following variables:
1150 1227
@@ -1206,7 +1283,7 @@ The top Makefile exports the following variables:
1206 INSTALL_MOD_STRIP will used as the option(s) to the strip command. 1283 INSTALL_MOD_STRIP will used as the option(s) to the strip command.
1207 1284
1208 1285
1209=== 8 Makefile language 1286=== 9 Makefile language
1210 1287
1211The kernel Makefiles are designed to be run with GNU Make. The Makefiles 1288The kernel Makefiles are designed to be run with GNU Make. The Makefiles
1212use only the documented features of GNU Make, but they do use many 1289use only the documented features of GNU Make, but they do use many
@@ -1225,14 +1302,14 @@ time the left-hand side is used.
1225There are some cases where "=" is appropriate. Usually, though, ":=" 1302There are some cases where "=" is appropriate. Usually, though, ":="
1226is the right choice. 1303is the right choice.
1227 1304
1228=== 9 Credits 1305=== 10 Credits
1229 1306
1230Original version made by Michael Elizabeth Chastain, <mailto:mec@shout.net> 1307Original version made by Michael Elizabeth Chastain, <mailto:mec@shout.net>
1231Updates by Kai Germaschewski <kai@tp1.ruhr-uni-bochum.de> 1308Updates by Kai Germaschewski <kai@tp1.ruhr-uni-bochum.de>
1232Updates by Sam Ravnborg <sam@ravnborg.org> 1309Updates by Sam Ravnborg <sam@ravnborg.org>
1233Language QA by Jan Engelhardt <jengelh@gmx.de> 1310Language QA by Jan Engelhardt <jengelh@gmx.de>
1234 1311
1235=== 10 TODO 1312=== 11 TODO
1236 1313
1237- Describe how kbuild supports shipped files with _shipped. 1314- Describe how kbuild supports shipped files with _shipped.
1238- Generating offset header files. 1315- Generating offset header files.
diff --git a/Documentation/kernel-doc-nano-HOWTO.txt b/Documentation/kernel-doc-nano-HOWTO.txt
index 026ec7d57384..4d04572b6549 100644
--- a/Documentation/kernel-doc-nano-HOWTO.txt
+++ b/Documentation/kernel-doc-nano-HOWTO.txt
@@ -269,7 +269,10 @@ Use the argument mechanism to document members or constants.
269 269
270Inside a struct description, you can use the "private:" and "public:" 270Inside a struct description, you can use the "private:" and "public:"
271comment tags. Structure fields that are inside a "private:" area 271comment tags. Structure fields that are inside a "private:" area
272are not listed in the generated output documentation. 272are not listed in the generated output documentation. The "private:"
273and "public:" tags must begin immediately following a "/*" comment
274marker. They may optionally include comments between the ":" and the
275ending "*/" marker.
273 276
274Example: 277Example:
275 278
@@ -283,7 +286,7 @@ Example:
283struct my_struct { 286struct my_struct {
284 int a; 287 int a;
285 int b; 288 int b;
286/* private: */ 289/* private: internal use only */
287 int c; 290 int c;
288}; 291};
289 292
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 54f21a5c262b..e87bdbfbcc75 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -17,6 +17,12 @@ are specified on the kernel command line with the module name plus
17 17
18 usbcore.blinkenlights=1 18 usbcore.blinkenlights=1
19 19
20Hyphens (dashes) and underscores are equivalent in parameter names, so
21 log_buf_len=1M print-fatal-signals=1
22can also be entered as
23 log-buf-len=1M print_fatal_signals=1
24
25
20This document may not be entirely up to date and comprehensive. The command 26This document may not be entirely up to date and comprehensive. The command
21"modinfo -p ${modulename}" shows a current list of all parameters of a loadable 27"modinfo -p ${modulename}" shows a current list of all parameters of a loadable
22module. Loadable modules, after being loaded into the running kernel, also 28module. Loadable modules, after being loaded into the running kernel, also
@@ -44,11 +50,13 @@ parameter is applicable:
44 FB The frame buffer device is enabled. 50 FB The frame buffer device is enabled.
45 HW Appropriate hardware is enabled. 51 HW Appropriate hardware is enabled.
46 IA-64 IA-64 architecture is enabled. 52 IA-64 IA-64 architecture is enabled.
53 IMA Integrity measurement architecture is enabled.
47 IOSCHED More than one I/O scheduler is enabled. 54 IOSCHED More than one I/O scheduler is enabled.
48 IP_PNP IP DHCP, BOOTP, or RARP is enabled. 55 IP_PNP IP DHCP, BOOTP, or RARP is enabled.
49 ISAPNP ISA PnP code is enabled. 56 ISAPNP ISA PnP code is enabled.
50 ISDN Appropriate ISDN support is enabled. 57 ISDN Appropriate ISDN support is enabled.
51 JOY Appropriate joystick support is enabled. 58 JOY Appropriate joystick support is enabled.
59 KMEMTRACE kmemtrace is enabled.
52 LIBATA Libata driver is enabled 60 LIBATA Libata driver is enabled
53 LP Printer support is enabled. 61 LP Printer support is enabled.
54 LOOP Loopback device support is enabled. 62 LOOP Loopback device support is enabled.
@@ -132,7 +140,7 @@ and is between 256 and 4096 characters. It is defined in the file
132./include/asm/setup.h as COMMAND_LINE_SIZE. 140./include/asm/setup.h as COMMAND_LINE_SIZE.
133 141
134 142
135 acpi= [HW,ACPI,X86-64,i386] 143 acpi= [HW,ACPI,X86]
136 Advanced Configuration and Power Interface 144 Advanced Configuration and Power Interface
137 Format: { force | off | ht | strict | noirq | rsdt } 145 Format: { force | off | ht | strict | noirq | rsdt }
138 force -- enable ACPI if default was off 146 force -- enable ACPI if default was off
@@ -151,60 +159,6 @@ and is between 256 and 4096 characters. It is defined in the file
151 1,0: use 1st APIC table 159 1,0: use 1st APIC table
152 default: 0 160 default: 0
153 161
154 acpi_sleep= [HW,ACPI] Sleep options
155 Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig,
156 old_ordering, s4_nonvs }
157 See Documentation/power/video.txt for information on
158 s3_bios and s3_mode.
159 s3_beep is for debugging; it makes the PC's speaker beep
160 as soon as the kernel's real-mode entry point is called.
161 s4_nohwsig prevents ACPI hardware signature from being
162 used during resume from hibernation.
163 old_ordering causes the ACPI 1.0 ordering of the _PTS
164 control method, with respect to putting devices into
165 low power states, to be enforced (the ACPI 2.0 ordering
166 of _PTS is used by default).
167 s4_nonvs prevents the kernel from saving/restoring the
168 ACPI NVS memory during hibernation.
169
170 acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode
171 Format: { level | edge | high | low }
172
173 acpi_irq_balance [HW,ACPI]
174 ACPI will balance active IRQs
175 default in APIC mode
176
177 acpi_irq_nobalance [HW,ACPI]
178 ACPI will not move active IRQs (default)
179 default in PIC mode
180
181 acpi_irq_pci= [HW,ACPI] If irq_balance, clear listed IRQs for
182 use by PCI
183 Format: <irq>,<irq>...
184
185 acpi_irq_isa= [HW,ACPI] If irq_balance, mark listed IRQs used by ISA
186 Format: <irq>,<irq>...
187
188 acpi_no_auto_ssdt [HW,ACPI] Disable automatic loading of SSDT
189
190 acpi_os_name= [HW,ACPI] Tell ACPI BIOS the name of the OS
191 Format: To spoof as Windows 98: ="Microsoft Windows"
192
193 acpi_osi= [HW,ACPI] Modify list of supported OS interface strings
194 acpi_osi="string1" # add string1 -- only one string
195 acpi_osi="!string2" # remove built-in string2
196 acpi_osi= # disable all strings
197
198 acpi_serialize [HW,ACPI] force serialization of AML methods
199
200 acpi_skip_timer_override [HW,ACPI]
201 Recognize and ignore IRQ0/pin2 Interrupt Override.
202 For broken nForce2 BIOS resulting in XT-PIC timer.
203 acpi_use_timer_override [HW,ACPI]
204 Use timer override. For some broken Nvidia NF5 boards
205 that require a timer override, but don't have
206 HPET
207
208 acpi_backlight= [HW,ACPI] 162 acpi_backlight= [HW,ACPI]
209 acpi_backlight=vendor 163 acpi_backlight=vendor
210 acpi_backlight=video 164 acpi_backlight=video
@@ -212,11 +166,6 @@ and is between 256 and 4096 characters. It is defined in the file
212 (e.g. thinkpad_acpi, sony_acpi, etc.) instead 166 (e.g. thinkpad_acpi, sony_acpi, etc.) instead
213 of the ACPI video.ko driver. 167 of the ACPI video.ko driver.
214 168
215 acpi_display_output= [HW,ACPI]
216 acpi_display_output=vendor
217 acpi_display_output=video
218 See above.
219
220 acpi.debug_layer= [HW,ACPI,ACPI_DEBUG] 169 acpi.debug_layer= [HW,ACPI,ACPI_DEBUG]
221 acpi.debug_level= [HW,ACPI,ACPI_DEBUG] 170 acpi.debug_level= [HW,ACPI,ACPI_DEBUG]
222 Format: <int> 171 Format: <int>
@@ -245,6 +194,41 @@ and is between 256 and 4096 characters. It is defined in the file
245 unusable. The "log_buf_len" parameter may be useful 194 unusable. The "log_buf_len" parameter may be useful
246 if you need to capture more output. 195 if you need to capture more output.
247 196
197 acpi_display_output= [HW,ACPI]
198 acpi_display_output=vendor
199 acpi_display_output=video
200 See above.
201
202 acpi_irq_balance [HW,ACPI]
203 ACPI will balance active IRQs
204 default in APIC mode
205
206 acpi_irq_nobalance [HW,ACPI]
207 ACPI will not move active IRQs (default)
208 default in PIC mode
209
210 acpi_irq_isa= [HW,ACPI] If irq_balance, mark listed IRQs used by ISA
211 Format: <irq>,<irq>...
212
213 acpi_irq_pci= [HW,ACPI] If irq_balance, clear listed IRQs for
214 use by PCI
215 Format: <irq>,<irq>...
216
217 acpi_no_auto_ssdt [HW,ACPI] Disable automatic loading of SSDT
218
219 acpi_os_name= [HW,ACPI] Tell ACPI BIOS the name of the OS
220 Format: To spoof as Windows 98: ="Microsoft Windows"
221
222 acpi_osi= [HW,ACPI] Modify list of supported OS interface strings
223 acpi_osi="string1" # add string1 -- only one string
224 acpi_osi="!string2" # remove built-in string2
225 acpi_osi= # disable all strings
226
227 acpi_pm_good [X86]
228 Override the pmtimer bug detection: force the kernel
229 to assume that this machine's pmtimer latches its value
230 and always returns good values.
231
248 acpi.power_nocheck= [HW,ACPI] 232 acpi.power_nocheck= [HW,ACPI]
249 Format: 1/0 enable/disable the check of power state. 233 Format: 1/0 enable/disable the check of power state.
250 On some bogus BIOS the _PSC object/_STA object of 234 On some bogus BIOS the _PSC object/_STA object of
@@ -253,30 +237,57 @@ and is between 256 and 4096 characters. It is defined in the file
253 power state again in power transition. 237 power state again in power transition.
254 1 : disable the power state check 238 1 : disable the power state check
255 239
256 acpi_pm_good [X86-32,X86-64] 240 acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode
257 Override the pmtimer bug detection: force the kernel 241 Format: { level | edge | high | low }
258 to assume that this machine's pmtimer latches its value
259 and always returns good values.
260 242
261 agp= [AGP] 243 acpi_serialize [HW,ACPI] force serialization of AML methods
262 { off | try_unsupported }
263 off: disable AGP support
264 try_unsupported: try to drive unsupported chipsets
265 (may crash computer or cause data corruption)
266 244
267 enable_timer_pin_1 [i386,x86-64] 245 acpi_skip_timer_override [HW,ACPI]
268 Enable PIN 1 of APIC timer 246 Recognize and ignore IRQ0/pin2 Interrupt Override.
269 Can be useful to work around chipset bugs 247 For broken nForce2 BIOS resulting in XT-PIC timer.
270 (in particular on some ATI chipsets).
271 The kernel tries to set a reasonable default.
272 248
273 disable_timer_pin_1 [i386,x86-64] 249 acpi_sleep= [HW,ACPI] Sleep options
274 Disable PIN 1 of APIC timer 250 Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig,
275 Can be useful to work around chipset bugs. 251 old_ordering, s4_nonvs }
252 See Documentation/power/video.txt for information on
253 s3_bios and s3_mode.
254 s3_beep is for debugging; it makes the PC's speaker beep
255 as soon as the kernel's real-mode entry point is called.
256 s4_nohwsig prevents ACPI hardware signature from being
257 used during resume from hibernation.
258 old_ordering causes the ACPI 1.0 ordering of the _PTS
259 control method, with respect to putting devices into
260 low power states, to be enforced (the ACPI 2.0 ordering
261 of _PTS is used by default).
262 s4_nonvs prevents the kernel from saving/restoring the
263 ACPI NVS memory during hibernation.
264
265 acpi_use_timer_override [HW,ACPI]
266 Use timer override. For some broken Nvidia NF5 boards
267 that require a timer override, but don't have HPET
268
269 acpi_enforce_resources= [ACPI]
270 { strict | lax | no }
271 Check for resource conflicts between native drivers
272 and ACPI OperationRegions (SystemIO and SystemMemory
273 only). IO ports and memory declared in ACPI might be
274 used by the ACPI subsystem in arbitrary AML code and
275 can interfere with legacy drivers.
276 strict (default): access to resources claimed by ACPI
277 is denied; legacy drivers trying to access reserved
278 resources will fail to bind to device using them.
279 lax: access to resources claimed by ACPI is allowed;
280 legacy drivers trying to access reserved resources
281 will bind successfully but a warning message is logged.
282 no: ACPI OperationRegions are not marked as reserved,
283 no further checks are performed.
276 284
277 ad1848= [HW,OSS] 285 ad1848= [HW,OSS]
278 Format: <io>,<irq>,<dma>,<dma2>,<type> 286 Format: <io>,<irq>,<dma>,<dma2>,<type>
279 287
288 add_efi_memmap [EFI; X86] Include EFI memory map in
289 kernel's map of available physical RAM.
290
280 advansys= [HW,SCSI] 291 advansys= [HW,SCSI]
281 See header of drivers/scsi/advansys.c. 292 See header of drivers/scsi/advansys.c.
282 293
@@ -287,6 +298,12 @@ and is between 256 and 4096 characters. It is defined in the file
287 Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq> 298 Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq>
288 See also header of sound/oss/aedsp16.c. 299 See also header of sound/oss/aedsp16.c.
289 300
301 agp= [AGP]
302 { off | try_unsupported }
303 off: disable AGP support
304 try_unsupported: try to drive unsupported chipsets
305 (may crash computer or cause data corruption)
306
290 aha152x= [HW,SCSI] 307 aha152x= [HW,SCSI]
291 See Documentation/scsi/aha152x.txt. 308 See Documentation/scsi/aha152x.txt.
292 309
@@ -334,7 +351,7 @@ and is between 256 and 4096 characters. It is defined in the file
334 not play well with APC CPU idle - disable it if you have 351 not play well with APC CPU idle - disable it if you have
335 APC and your system crashes randomly. 352 APC and your system crashes randomly.
336 353
337 apic= [APIC,i386] Advanced Programmable Interrupt Controller 354 apic= [APIC,X86-32] Advanced Programmable Interrupt Controller
338 Change the output verbosity whilst booting 355 Change the output verbosity whilst booting
339 Format: { quiet (default) | verbose | debug } 356 Format: { quiet (default) | verbose | debug }
340 Change the amount of debugging information output 357 Change the amount of debugging information output
@@ -414,12 +431,6 @@ and is between 256 and 4096 characters. It is defined in the file
414 possible to determine what the correct size should be. 431 possible to determine what the correct size should be.
415 This option provides an override for these situations. 432 This option provides an override for these situations.
416 433
417 security= [SECURITY] Choose a security module to enable at boot.
418 If this boot parameter is not specified, only the first
419 security module asking for security registration will be
420 loaded. An invalid security module name will be treated
421 as if no module has been chosen.
422
423 capability.disable= 434 capability.disable=
424 [SECURITY] Disable capabilities. This would normally 435 [SECURITY] Disable capabilities. This would normally
425 be used only if an alternative security model is to be 436 be used only if an alternative security model is to be
@@ -486,17 +497,11 @@ and is between 256 and 4096 characters. It is defined in the file
486 Also note the kernel might malfunction if you disable 497 Also note the kernel might malfunction if you disable
487 some critical bits. 498 some critical bits.
488 499
489 code_bytes [IA32/X86_64] How many bytes of object code to print 500 code_bytes [X86] How many bytes of object code to print
490 in an oops report. 501 in an oops report.
491 Range: 0 - 8192 502 Range: 0 - 8192
492 Default: 64 503 Default: 64
493 504
494 hpet= [X86-32,HPET] option to control HPET usage
495 Format: { enable (default) | disable | force }
496 disable: disable HPET and use PIT instead
497 force: allow force enabled of undocumented chips (ICH4,
498 VIA, nVidia)
499
500 com20020= [HW,NET] ARCnet - COM20020 chipset 505 com20020= [HW,NET] ARCnet - COM20020 chipset
501 Format: 506 Format:
502 <io>[,<irq>[,<nodeID>[,<backplane>[,<ckp>[,<timeout>]]]]] 507 <io>[,<irq>[,<nodeID>[,<backplane>[,<ckp>[,<timeout>]]]]]
@@ -540,23 +545,6 @@ and is between 256 and 4096 characters. It is defined in the file
540 console=brl,ttyS0 545 console=brl,ttyS0
541 For now, only VisioBraille is supported. 546 For now, only VisioBraille is supported.
542 547
543 earlycon= [KNL] Output early console device and options.
544 uart[8250],io,<addr>[,options]
545 uart[8250],mmio,<addr>[,options]
546 Start an early, polled-mode console on the 8250/16550
547 UART at the specified I/O port or MMIO address.
548 The options are the same as for ttyS, above.
549
550 no_console_suspend
551 [HW] Never suspend the console
552 Disable suspending of consoles during suspend and
553 hibernate operations. Once disabled, debugging
554 messages can reach various consoles while the rest
555 of the system is being put to sleep (ie, while
556 debugging driver suspend/resume hooks). This may
557 not work reliably with all consoles, but is known
558 to work with serial and VGA consoles.
559
560 coredump_filter= 548 coredump_filter=
561 [KNL] Change the default value for 549 [KNL] Change the default value for
562 /proc/<pid>/coredump_filter. 550 /proc/<pid>/coredump_filter.
@@ -604,36 +592,22 @@ and is between 256 and 4096 characters. It is defined in the file
604 592
605 debug_objects [KNL] Enable object debugging 593 debug_objects [KNL] Enable object debugging
606 594
595 no_debug_objects
596 [KNL] Disable object debugging
597
607 debugpat [X86] Enable PAT debugging 598 debugpat [X86] Enable PAT debugging
608 599
609 decnet.addr= [HW,NET] 600 decnet.addr= [HW,NET]
610 Format: <area>[,<node>] 601 Format: <area>[,<node>]
611 See also Documentation/networking/decnet.txt. 602 See also Documentation/networking/decnet.txt.
612 603
613 vt.default_blu= [VT] 604 default_hugepagesz=
614 Format: <blue0>,<blue1>,<blue2>,...,<blue15> 605 [same as hugepagesz=] The size of the default
615 Change the default blue palette of the console. 606 HugeTLB page size. This is the size represented by
616 This is a 16-member array composed of values 607 the legacy /proc/ hugepages APIs, used for SHM, and
617 ranging from 0-255. 608 default size when mounting hugetlbfs filesystems.
618 609 Defaults to the default architecture's huge page size
619 vt.default_grn= [VT] 610 if not specified.
620 Format: <green0>,<green1>,<green2>,...,<green15>
621 Change the default green palette of the console.
622 This is a 16-member array composed of values
623 ranging from 0-255.
624
625 vt.default_red= [VT]
626 Format: <red0>,<red1>,<red2>,...,<red15>
627 Change the default red palette of the console.
628 This is a 16-member array composed of values
629 ranging from 0-255.
630
631 vt.default_utf8=
632 [VT]
633 Format=<0|1>
634 Set system-wide default UTF-8 mode for all tty's.
635 Default is 1, i.e. UTF-8 mode is enabled for all
636 newly opened terminals.
637 611
638 dhash_entries= [KNL] 612 dhash_entries= [KNL]
639 Set number of hash buckets for dentry cache. 613 Set number of hash buckets for dentry cache.
@@ -646,27 +620,9 @@ and is between 256 and 4096 characters. It is defined in the file
646 Documentation/serial/digiepca.txt. 620 Documentation/serial/digiepca.txt.
647 621
648 disable_mtrr_cleanup [X86] 622 disable_mtrr_cleanup [X86]
649 enable_mtrr_cleanup [X86]
650 The kernel tries to adjust MTRR layout from continuous 623 The kernel tries to adjust MTRR layout from continuous
651 to discrete, to make X server driver able to add WB 624 to discrete, to make X server driver able to add WB
652 entry later. This parameter enables/disables that. 625 entry later. This parameter disables that.
653
654 mtrr_chunk_size=nn[KMG] [X86]
655 used for mtrr cleanup. It is largest continous chunk
656 that could hold holes aka. UC entries.
657
658 mtrr_gran_size=nn[KMG] [X86]
659 Used for mtrr cleanup. It is granularity of mtrr block.
660 Default is 1.
661 Large value could prevent small alignment from
662 using up MTRRs.
663
664 mtrr_spare_reg_nr=n [X86]
665 Format: <integer>
666 Range: 0,7 : spare reg number
667 Default : 1
668 Used for mtrr cleanup. It is spare mtrr entries number.
669 Set to 2 or more if your graphical card needs more.
670 626
671 disable_mtrr_trim [X86, Intel and AMD only] 627 disable_mtrr_trim [X86, Intel and AMD only]
672 By default the kernel will trim any uncacheable 628 By default the kernel will trim any uncacheable
@@ -674,13 +630,39 @@ and is between 256 and 4096 characters. It is defined in the file
674 MTRR settings. This parameter disables that behavior, 630 MTRR settings. This parameter disables that behavior,
675 possibly causing your machine to run very slowly. 631 possibly causing your machine to run very slowly.
676 632
633 disable_timer_pin_1 [X86]
634 Disable PIN 1 of APIC timer
635 Can be useful to work around chipset bugs.
636
677 dmasound= [HW,OSS] Sound subsystem buffers 637 dmasound= [HW,OSS] Sound subsystem buffers
678 638
639 dma_debug=off If the kernel is compiled with DMA_API_DEBUG support,
640 this option disables the debugging code at boot.
641
642 dma_debug_entries=<number>
643 This option allows to tune the number of preallocated
644 entries for DMA-API debugging code. One entry is
645 required per DMA-API allocation. Use this if the
646 DMA-API debugging code disables itself because the
647 architectural default is too low.
648
679 dscc4.setup= [NET] 649 dscc4.setup= [NET]
680 650
681 dtc3181e= [HW,SCSI] 651 dtc3181e= [HW,SCSI]
682 652
683 earlyprintk= [X86-32,X86-64,SH,BLACKFIN] 653 dynamic_printk Enables pr_debug()/dev_dbg() calls if
654 CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled.
655 These can also be switched on/off via
656 <debugfs>/dynamic_printk/modules
657
658 earlycon= [KNL] Output early console device and options.
659 uart[8250],io,<addr>[,options]
660 uart[8250],mmio,<addr>[,options]
661 Start an early, polled-mode console on the 8250/16550
662 UART at the specified I/O port or MMIO address.
663 The options are the same as for ttyS, above.
664
665 earlyprintk= [X86,SH,BLACKFIN]
684 earlyprintk=vga 666 earlyprintk=vga
685 earlyprintk=serial[,ttySn[,baudrate]] 667 earlyprintk=serial[,ttySn[,baudrate]]
686 earlyprintk=dbgp 668 earlyprintk=dbgp
@@ -715,12 +697,23 @@ and is between 256 and 4096 characters. It is defined in the file
715 See Documentation/block/as-iosched.txt and 697 See Documentation/block/as-iosched.txt and
716 Documentation/block/deadline-iosched.txt for details. 698 Documentation/block/deadline-iosched.txt for details.
717 699
718 elfcorehdr= [IA64,PPC,SH,X86-32,X86_64] 700 elfcorehdr= [IA64,PPC,SH,X86]
719 Specifies physical address of start of kernel core 701 Specifies physical address of start of kernel core
720 image elf header. Generally kexec loader will 702 image elf header. Generally kexec loader will
721 pass this option to capture kernel. 703 pass this option to capture kernel.
722 See Documentation/kdump/kdump.txt for details. 704 See Documentation/kdump/kdump.txt for details.
723 705
706 enable_mtrr_cleanup [X86]
707 The kernel tries to adjust MTRR layout from continuous
708 to discrete, to make X server driver able to add WB
709 entry later. This parameter enables that.
710
711 enable_timer_pin_1 [X86]
712 Enable PIN 1 of APIC timer
713 Can be useful to work around chipset bugs
714 (in particular on some ATI chipsets).
715 The kernel tries to set a reasonable default.
716
724 enforcing [SELINUX] Set initial enforcing status. 717 enforcing [SELINUX] Set initial enforcing status.
725 Format: {"0" | "1"} 718 Format: {"0" | "1"}
726 See security/selinux/Kconfig help text. 719 See security/selinux/Kconfig help text.
@@ -788,7 +781,7 @@ and is between 256 and 4096 characters. It is defined in the file
788 781
789 hashdist= [KNL,NUMA] Large hashes allocated during boot 782 hashdist= [KNL,NUMA] Large hashes allocated during boot
790 are distributed across NUMA nodes. Defaults on 783 are distributed across NUMA nodes. Defaults on
791 for IA-64, off otherwise. 784 for 64bit NUMA, off otherwise.
792 Format: 0 | 1 (for off | on) 785 Format: 0 | 1 (for off | on)
793 786
794 hcl= [IA-64] SGI's Hardware Graph compatibility layer 787 hcl= [IA-64] SGI's Hardware Graph compatibility layer
@@ -808,6 +801,16 @@ and is between 256 and 4096 characters. It is defined in the file
808 hisax= [HW,ISDN] 801 hisax= [HW,ISDN]
809 See Documentation/isdn/README.HiSax. 802 See Documentation/isdn/README.HiSax.
810 803
804 hlt [BUGS=ARM,SH]
805
806 hpet= [X86-32,HPET] option to control HPET usage
807 Format: { enable (default) | disable | force |
808 verbose }
809 disable: disable HPET and use PIT instead
810 force: allow force enabled of undocumented chips (ICH4,
811 VIA, nVidia)
812 verbose: show contents of HPET registers during setup
813
811 hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot. 814 hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot.
812 hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages. 815 hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages.
813 On x86-64 and powerpc, this option can be specified 816 On x86-64 and powerpc, this option can be specified
@@ -817,18 +820,18 @@ and is between 256 and 4096 characters. It is defined in the file
817 (when the CPU supports the "pdpe1gb" cpuinfo flag) 820 (when the CPU supports the "pdpe1gb" cpuinfo flag)
818 Note that 1GB pages can only be allocated at boot time 821 Note that 1GB pages can only be allocated at boot time
819 using hugepages= and not freed afterwards. 822 using hugepages= and not freed afterwards.
820 default_hugepagesz=
821 [same as hugepagesz=] The size of the default
822 HugeTLB page size. This is the size represented by
823 the legacy /proc/ hugepages APIs, used for SHM, and
824 default size when mounting hugetlbfs filesystems.
825 Defaults to the default architecture's huge page size
826 if not specified.
827
828 hlt [BUGS=ARM,SH]
829 823
830 hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) 824 hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC)
831 terminal devices. Valid values: 0..8 825 terminal devices. Valid values: 0..8
826 hvc_iucv_allow= [S390] Comma-separated list of z/VM user IDs.
827 If specified, z/VM IUCV HVC accepts connections
828 from listed z/VM user IDs only.
829
830 i2c_bus= [HW] Override the default board specific I2C bus speed
831 or register an additional I2C bus that is not
832 registered from board initialization code.
833 Format:
834 <bus_id>,<clkrate>
832 835
833 i8042.debug [HW] Toggle i8042 debug mode 836 i8042.debug [HW] Toggle i8042 debug mode
834 i8042.direct [HW] Put keyboard port into non-translated mode 837 i8042.direct [HW] Put keyboard port into non-translated mode
@@ -877,6 +880,9 @@ and is between 256 and 4096 characters. It is defined in the file
877 idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed 880 idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed
878 See Documentation/ide/ide.txt. 881 See Documentation/ide/ide.txt.
879 882
883 ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
884 Claim all unknown PCI IDE storage controllers.
885
880 idle= [X86] 886 idle= [X86]
881 Format: idle=poll, idle=mwait, idle=halt, idle=nomwait 887 Format: idle=poll, idle=mwait, idle=halt, idle=nomwait
882 Poll forces a polling idle loop that can slightly 888 Poll forces a polling idle loop that can slightly
@@ -892,9 +898,6 @@ and is between 256 and 4096 characters. It is defined in the file
892 In such case C2/C3 won't be used again. 898 In such case C2/C3 won't be used again.
893 idle=nomwait: Disable mwait for CPU C-states 899 idle=nomwait: Disable mwait for CPU C-states
894 900
895 ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
896 Claim all unknown PCI IDE storage controllers.
897
898 ignore_loglevel [KNL] 901 ignore_loglevel [KNL]
899 Ignore loglevel setting - this will print /all/ 902 Ignore loglevel setting - this will print /all/
900 kernel messages to the console. Useful for debugging. 903 kernel messages to the console. Useful for debugging.
@@ -902,6 +905,15 @@ and is between 256 and 4096 characters. It is defined in the file
902 ihash_entries= [KNL] 905 ihash_entries= [KNL]
903 Set number of hash buckets for inode cache. 906 Set number of hash buckets for inode cache.
904 907
908 ima_audit= [IMA]
909 Format: { "0" | "1" }
910 0 -- integrity auditing messages. (Default)
911 1 -- enable informational integrity auditing messages.
912
913 ima_hash= [IMA]
914 Formt: { "sha1" | "md5" }
915 default: "sha1"
916
905 in2000= [HW,SCSI] 917 in2000= [HW,SCSI]
906 See header of drivers/scsi/in2000.c. 918 See header of drivers/scsi/in2000.c.
907 919
@@ -919,25 +931,6 @@ and is between 256 and 4096 characters. It is defined in the file
919 inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver 931 inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver
920 Format: <irq> 932 Format: <irq>
921 933
922 inttest= [IA64]
923
924 iomem= Disable strict checking of access to MMIO memory
925 strict regions from userspace.
926 relaxed
927
928 iommu= [x86]
929 off
930 force
931 noforce
932 biomerge
933 panic
934 nopanic
935 merge
936 nomerge
937 forcesac
938 soft
939
940
941 intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option 934 intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option
942 on 935 on
943 Enable intel iommu driver. 936 Enable intel iommu driver.
@@ -961,7 +954,29 @@ and is between 256 and 4096 characters. It is defined in the file
961 result in a hardware IOTLB flush operation as opposed 954 result in a hardware IOTLB flush operation as opposed
962 to batching them for performance. 955 to batching them for performance.
963 956
964 io_delay= [X86-32,X86-64] I/O delay method 957 inttest= [IA64]
958
959 iomem= Disable strict checking of access to MMIO memory
960 strict regions from userspace.
961 relaxed
962
963 iommu= [x86]
964 off
965 force
966 noforce
967 biomerge
968 panic
969 nopanic
970 merge
971 nomerge
972 forcesac
973 soft
974
975 io7= [HW] IO7 for Marvel based alpha systems
976 See comment before marvel_specify_io7 in
977 arch/alpha/kernel/core_marvel.c.
978
979 io_delay= [X86] I/O delay method
965 0x80 980 0x80
966 Standard port 0x80 based delay 981 Standard port 0x80 based delay
967 0xed 982 0xed
@@ -971,10 +986,6 @@ and is between 256 and 4096 characters. It is defined in the file
971 none 986 none
972 No delay 987 No delay
973 988
974 io7= [HW] IO7 for Marvel based alpha systems
975 See comment before marvel_specify_io7 in
976 arch/alpha/kernel/core_marvel.c.
977
978 ip= [IP_PNP] 989 ip= [IP_PNP]
979 See Documentation/filesystems/nfsroot.txt. 990 See Documentation/filesystems/nfsroot.txt.
980 991
@@ -985,12 +996,6 @@ and is between 256 and 4096 characters. It is defined in the file
985 ips= [HW,SCSI] Adaptec / IBM ServeRAID controller 996 ips= [HW,SCSI] Adaptec / IBM ServeRAID controller
986 See header of drivers/scsi/ips.c. 997 See header of drivers/scsi/ips.c.
987 998
988 ports= [IP_VS_FTP] IPVS ftp helper module
989 Default is 21.
990 Up to 8 (IP_VS_APP_MAX_PORTS) ports
991 may be specified.
992 Format: <port>,<port>....
993
994 irqfixup [HW] 999 irqfixup [HW]
995 When an interrupt is not handled search all handlers 1000 When an interrupt is not handled search all handlers
996 for it. Intended to get systems with badly broken 1001 for it. Intended to get systems with badly broken
@@ -1031,7 +1036,9 @@ and is between 256 and 4096 characters. It is defined in the file
1031 js= [HW,JOY] Analog joystick 1036 js= [HW,JOY] Analog joystick
1032 See Documentation/input/joystick.txt. 1037 See Documentation/input/joystick.txt.
1033 1038
1034 kernelcore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter 1039 keepinitrd [HW,ARM]
1040
1041 kernelcore=nn[KMG] [KNL,X86,IA-64,PPC] This parameter
1035 specifies the amount of memory usable by the kernel 1042 specifies the amount of memory usable by the kernel
1036 for non-movable allocations. The requested amount is 1043 for non-movable allocations. The requested amount is
1037 spread evenly throughout all nodes in the system. The 1044 spread evenly throughout all nodes in the system. The
@@ -1047,20 +1054,14 @@ and is between 256 and 4096 characters. It is defined in the file
1047 use the HighMem zone if it exists, and the Normal 1054 use the HighMem zone if it exists, and the Normal
1048 zone if it does not. 1055 zone if it does not.
1049 1056
1050 movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter 1057 kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no }
1051 is similar to kernelcore except it specifies the 1058 Controls whether kmemtrace is enabled
1052 amount of memory used for migratable allocations. 1059 at boot-time.
1053 If both kernelcore and movablecore is specified,
1054 then kernelcore will be at *least* the specified
1055 value but may be more. If movablecore on its own
1056 is specified, the administrator must be careful
1057 that the amount of memory usable for all allocations
1058 is not too small.
1059 1060
1060 keepinitrd [HW,ARM] 1061 kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of
1061 1062 subbufs kmemtrace's relay channel has. Set this
1062 kstack=N [X86-32,X86-64] Print N words from the kernel stack 1063 higher than default (KMEMTRACE_N_SUBBUFS in code) if
1063 in oops dumps. 1064 you experience buffer overruns.
1064 1065
1065 kgdboc= [HW] kgdb over consoles. 1066 kgdboc= [HW] kgdb over consoles.
1066 Requires a tty driver that supports console polling. 1067 Requires a tty driver that supports console polling.
@@ -1071,6 +1072,9 @@ and is between 256 and 4096 characters. It is defined in the file
1071 Configure the RouterBoard 532 series on-chip 1072 Configure the RouterBoard 532 series on-chip
1072 Ethernet adapter MAC address. 1073 Ethernet adapter MAC address.
1073 1074
1075 kstack=N [X86] Print N words from the kernel stack
1076 in oops dumps.
1077
1074 l2cr= [PPC] 1078 l2cr= [PPC]
1075 1079
1076 l3cr= [PPC] 1080 l3cr= [PPC]
@@ -1078,7 +1082,7 @@ and is between 256 and 4096 characters. It is defined in the file
1078 lapic [X86-32,APIC] Enable the local APIC even if BIOS 1082 lapic [X86-32,APIC] Enable the local APIC even if BIOS
1079 disabled it. 1083 disabled it.
1080 1084
1081 lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer 1085 lapic_timer_c2_ok [X86,APIC] trust the local apic timer
1082 in C2 power state. 1086 in C2 power state.
1083 1087
1084 libata.dma= [LIBATA] DMA control 1088 libata.dma= [LIBATA] DMA control
@@ -1216,9 +1220,8 @@ and is between 256 and 4096 characters. It is defined in the file
1216 (machvec) in a generic kernel. 1220 (machvec) in a generic kernel.
1217 Example: machvec=hpzx1_swiotlb 1221 Example: machvec=hpzx1_swiotlb
1218 1222
1219 max_loop= [LOOP] Maximum number of loopback devices that can 1223 max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater
1220 be mounted 1224 than or equal to this physical address is ignored.
1221 Format: <1-256>
1222 1225
1223 maxcpus= [SMP] Maximum number of processors that an SMP kernel 1226 maxcpus= [SMP] Maximum number of processors that an SMP kernel
1224 should make use of. maxcpus=n : n >= 0 limits the 1227 should make use of. maxcpus=n : n >= 0 limits the
@@ -1226,8 +1229,9 @@ and is between 256 and 4096 characters. It is defined in the file
1226 it is equivalent to "nosmp", which also disables 1229 it is equivalent to "nosmp", which also disables
1227 the IO APIC. 1230 the IO APIC.
1228 1231
1229 max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater than 1232 max_loop= [LOOP] Maximum number of loopback devices that can
1230 or equal to this physical address is ignored. 1233 be mounted
1234 Format: <1-256>
1231 1235
1232 max_luns= [SCSI] Maximum number of LUNs to probe. 1236 max_luns= [SCSI] Maximum number of LUNs to probe.
1233 Should be between 1 and 2^32-1. 1237 Should be between 1 and 2^32-1.
@@ -1263,7 +1267,7 @@ and is between 256 and 4096 characters. It is defined in the file
1263 [KNL,SH] Allow user to override the default size for 1267 [KNL,SH] Allow user to override the default size for
1264 per-device physically contiguous DMA buffers. 1268 per-device physically contiguous DMA buffers.
1265 1269
1266 memmap=exactmap [KNL,X86-32,X86_64] Enable setting of an exact 1270 memmap=exactmap [KNL,X86] Enable setting of an exact
1267 E820 memory map, as specified by the user. 1271 E820 memory map, as specified by the user.
1268 Such memmap=exactmap lines can be constructed based on 1272 Such memmap=exactmap lines can be constructed based on
1269 BIOS output or other requirements. See the memmap=nn@ss 1273 BIOS output or other requirements. See the memmap=nn@ss
@@ -1310,8 +1314,13 @@ and is between 256 and 4096 characters. It is defined in the file
1310 1314
1311 memtest= [KNL,X86] Enable memtest 1315 memtest= [KNL,X86] Enable memtest
1312 Format: <integer> 1316 Format: <integer>
1313 range: 0,4 : pattern number
1314 default : 0 <disable> 1317 default : 0 <disable>
1318 Specifies the number of memtest passes to be
1319 performed. Each pass selects another test
1320 pattern from a given set of patterns. Memtest
1321 fills the memory with this pattern, validates
1322 memory contents and reserves bad memory
1323 regions that are detected.
1315 1324
1316 meye.*= [HW] Set MotionEye Camera parameters 1325 meye.*= [HW] Set MotionEye Camera parameters
1317 See Documentation/video4linux/meye.txt. 1326 See Documentation/video4linux/meye.txt.
@@ -1349,6 +1358,16 @@ and is between 256 and 4096 characters. It is defined in the file
1349 mousedev.yres= [MOUSE] Vertical screen resolution, used for devices 1358 mousedev.yres= [MOUSE] Vertical screen resolution, used for devices
1350 reporting absolute coordinates, such as tablets 1359 reporting absolute coordinates, such as tablets
1351 1360
1361 movablecore=nn[KMG] [KNL,X86,IA-64,PPC] This parameter
1362 is similar to kernelcore except it specifies the
1363 amount of memory used for migratable allocations.
1364 If both kernelcore and movablecore is specified,
1365 then kernelcore will be at *least* the specified
1366 value but may be more. If movablecore on its own
1367 is specified, the administrator must be careful
1368 that the amount of memory usable for all allocations
1369 is not too small.
1370
1352 mpu401= [HW,OSS] 1371 mpu401= [HW,OSS]
1353 Format: <io>,<irq> 1372 Format: <io>,<irq>
1354 1373
@@ -1370,6 +1389,23 @@ and is between 256 and 4096 characters. It is defined in the file
1370 [HW] Make the MicroTouch USB driver use raw coordinates 1389 [HW] Make the MicroTouch USB driver use raw coordinates
1371 ('y', default) or cooked coordinates ('n') 1390 ('y', default) or cooked coordinates ('n')
1372 1391
1392 mtrr_chunk_size=nn[KMG] [X86]
1393 used for mtrr cleanup. It is largest continous chunk
1394 that could hold holes aka. UC entries.
1395
1396 mtrr_gran_size=nn[KMG] [X86]
1397 Used for mtrr cleanup. It is granularity of mtrr block.
1398 Default is 1.
1399 Large value could prevent small alignment from
1400 using up MTRRs.
1401
1402 mtrr_spare_reg_nr=n [X86]
1403 Format: <integer>
1404 Range: 0,7 : spare reg number
1405 Default : 1
1406 Used for mtrr cleanup. It is spare mtrr entries number.
1407 Set to 2 or more if your graphical card needs more.
1408
1373 n2= [NET] SDL Inc. RISCom/N2 synchronous serial card 1409 n2= [NET] SDL Inc. RISCom/N2 synchronous serial card
1374 1410
1375 NCR_D700= [HW,SCSI] 1411 NCR_D700= [HW,SCSI]
@@ -1424,17 +1460,19 @@ and is between 256 and 4096 characters. It is defined in the file
1424 when a NMI is triggered. 1460 when a NMI is triggered.
1425 Format: [state][,regs][,debounce][,die] 1461 Format: [state][,regs][,debounce][,die]
1426 1462
1427 nmi_watchdog= [KNL,BUGS=X86-32,X86-64] Debugging features for SMP kernels 1463 nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels
1428 Format: [panic,][num] 1464 Format: [panic,][num]
1429 Valid num: 0,1,2 1465 Valid num: 0,1,2
1430 0 - turn nmi_watchdog off 1466 0 - turn nmi_watchdog off
1431 1 - use the IO-APIC timer for the NMI watchdog 1467 1 - use the IO-APIC timer for the NMI watchdog
1432 2 - use the local APIC for the NMI watchdog using 1468 2 - use the local APIC for the NMI watchdog using
1433 a performance counter. Note: This will use one performance 1469 a performance counter. Note: This will use one
1434 counter and the local APIC's performance vector. 1470 performance counter and the local APIC's performance
1435 When panic is specified panic when an NMI watchdog timeout occurs. 1471 vector.
1436 This is useful when you use a panic=... timeout and need the box 1472 When panic is specified, panic when an NMI watchdog
1437 quickly up again. 1473 timeout occurs.
1474 This is useful when you use a panic=... timeout and
1475 need the box quickly up again.
1438 Instead of 1 and 2 it is possible to use the following 1476 Instead of 1 and 2 it is possible to use the following
1439 symbolic names: lapic and ioapic 1477 symbolic names: lapic and ioapic
1440 Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic 1478 Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic
@@ -1443,6 +1481,16 @@ and is between 256 and 4096 characters. It is defined in the file
1443 emulation library even if a 387 maths coprocessor 1481 emulation library even if a 387 maths coprocessor
1444 is present. 1482 is present.
1445 1483
1484 no_console_suspend
1485 [HW] Never suspend the console
1486 Disable suspending of consoles during suspend and
1487 hibernate operations. Once disabled, debugging
1488 messages can reach various consoles while the rest
1489 of the system is being put to sleep (ie, while
1490 debugging driver suspend/resume hooks). This may
1491 not work reliably with all consoles, but is known
1492 to work with serial and VGA consoles.
1493
1446 noaliencache [MM, NUMA, SLAB] Disables the allocation of alien 1494 noaliencache [MM, NUMA, SLAB] Disables the allocation of alien
1447 caches in the slab allocator. Saves per-node memory, 1495 caches in the slab allocator. Saves per-node memory,
1448 but will impact performance. 1496 but will impact performance.
@@ -1457,17 +1505,19 @@ and is between 256 and 4096 characters. It is defined in the file
1457 1505
1458 nocache [ARM] 1506 nocache [ARM]
1459 1507
1508 noclflush [BUGS=X86] Don't use the CLFLUSH instruction
1509
1460 nodelayacct [KNL] Disable per-task delay accounting 1510 nodelayacct [KNL] Disable per-task delay accounting
1461 1511
1462 nodisconnect [HW,SCSI,M68K] Disables SCSI disconnects. 1512 nodisconnect [HW,SCSI,M68K] Disables SCSI disconnects.
1463 1513
1464 nodsp [SH] Disable hardware DSP at boot time. 1514 nodsp [SH] Disable hardware DSP at boot time.
1465 1515
1466 noefi [X86-32,X86-64] Disable EFI runtime services support. 1516 noefi [X86] Disable EFI runtime services support.
1467 1517
1468 noexec [IA-64] 1518 noexec [IA-64]
1469 1519
1470 noexec [X86-32,X86-64] 1520 noexec [X86]
1471 On X86-32 available only on PAE configured kernels. 1521 On X86-32 available only on PAE configured kernels.
1472 noexec=on: enable non-executable mappings (default) 1522 noexec=on: enable non-executable mappings (default)
1473 noexec=off: disable non-executable mappings 1523 noexec=off: disable non-executable mappings
@@ -1485,9 +1535,9 @@ and is between 256 and 4096 characters. It is defined in the file
1485 register save and restore. The kernel will only save 1535 register save and restore. The kernel will only save
1486 legacy floating-point registers on task switch. 1536 legacy floating-point registers on task switch.
1487 1537
1488 noclflush [BUGS=X86] Don't use the CLFLUSH instruction 1538 nohlt [BUGS=ARM,SH] Tells the kernel that the sleep(SH) or
1489 1539 wfi(ARM) instruction doesn't work correctly and not to
1490 nohlt [BUGS=ARM,SH] 1540 use it. This is also useful when using JTAG debugger.
1491 1541
1492 no-hlt [BUGS=X86-32] Tells the kernel that the hlt 1542 no-hlt [BUGS=X86-32] Tells the kernel that the hlt
1493 instruction doesn't work correctly and not to 1543 instruction doesn't work correctly and not to
@@ -1508,10 +1558,12 @@ and is between 256 and 4096 characters. It is defined in the file
1508 Valid arguments: on, off 1558 Valid arguments: on, off
1509 Default: on 1559 Default: on
1510 1560
1561 noiotrap [SH] Disables trapped I/O port accesses.
1562
1511 noirqdebug [X86-32] Disables the code which attempts to detect and 1563 noirqdebug [X86-32] Disables the code which attempts to detect and
1512 disable unhandled interrupt sources. 1564 disable unhandled interrupt sources.
1513 1565
1514 no_timer_check [X86-32,X86_64,APIC] Disables the code which tests for 1566 no_timer_check [X86,APIC] Disables the code which tests for
1515 broken timer IRQ sources. 1567 broken timer IRQ sources.
1516 1568
1517 noisapnp [ISAPNP] Disables ISA PnP code. 1569 noisapnp [ISAPNP] Disables ISA PnP code.
@@ -1527,12 +1579,6 @@ and is between 256 and 4096 characters. It is defined in the file
1527 1579
1528 nolapic_timer [X86-32,APIC] Do not use the local APIC timer. 1580 nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
1529 1581
1530 nox2apic [X86-64,APIC] Do not enable x2APIC mode.
1531
1532 x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
1533 default x2apic cluster mode on platforms
1534 supporting x2apic.
1535
1536 noltlbs [PPC] Do not use large page/tlb entries for kernel 1582 noltlbs [PPC] Do not use large page/tlb entries for kernel
1537 lowmem mapping on PPC40x. 1583 lowmem mapping on PPC40x.
1538 1584
@@ -1543,6 +1589,9 @@ and is between 256 and 4096 characters. It is defined in the file
1543 nomfgpt [X86-32] Disable Multi-Function General Purpose 1589 nomfgpt [X86-32] Disable Multi-Function General Purpose
1544 Timer usage (for AMD Geode machines). 1590 Timer usage (for AMD Geode machines).
1545 1591
1592 norandmaps Don't use address space randomization. Equivalent to
1593 echo 0 > /proc/sys/kernel/randomize_va_space
1594
1546 noreplace-paravirt [X86-32,PV_OPS] Don't patch paravirt_ops 1595 noreplace-paravirt [X86-32,PV_OPS] Don't patch paravirt_ops
1547 1596
1548 noreplace-smp [X86-32,SMP] Don't replace SMP instructions 1597 noreplace-smp [X86-32,SMP] Don't replace SMP instructions
@@ -1567,7 +1616,7 @@ and is between 256 and 4096 characters. It is defined in the file
1567 nosoftlockup [KNL] Disable the soft-lockup detector. 1616 nosoftlockup [KNL] Disable the soft-lockup detector.
1568 1617
1569 noswapaccount [KNL] Disable accounting of swap in memory resource 1618 noswapaccount [KNL] Disable accounting of swap in memory resource
1570 controller. (See Documentation/controllers/memory.txt) 1619 controller. (See Documentation/cgroups/memory.txt)
1571 1620
1572 nosync [HW,M68K] Disables sync negotiation for all devices. 1621 nosync [HW,M68K] Disables sync negotiation for all devices.
1573 1622
@@ -1577,17 +1626,19 @@ and is between 256 and 4096 characters. It is defined in the file
1577 1626
1578 nowb [ARM] 1627 nowb [ARM]
1579 1628
1629 nox2apic [X86-64,APIC] Do not enable x2APIC mode.
1630
1580 nptcg= [IA64] Override max number of concurrent global TLB 1631 nptcg= [IA64] Override max number of concurrent global TLB
1581 purges which is reported from either PAL_VM_SUMMARY or 1632 purges which is reported from either PAL_VM_SUMMARY or
1582 SAL PALO. 1633 SAL PALO.
1583 1634
1635 nr_uarts= [SERIAL] maximum number of UARTs to be registered.
1636
1584 numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. 1637 numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
1585 one of ['zone', 'node', 'default'] can be specified 1638 one of ['zone', 'node', 'default'] can be specified
1586 This can be set from sysctl after boot. 1639 This can be set from sysctl after boot.
1587 See Documentation/sysctl/vm.txt for details. 1640 See Documentation/sysctl/vm.txt for details.
1588 1641
1589 nr_uarts= [SERIAL] maximum number of UARTs to be registered.
1590
1591 ohci1394_dma=early [HW] enable debugging via the ohci1394 driver. 1642 ohci1394_dma=early [HW] enable debugging via the ohci1394 driver.
1592 See Documentation/debugging-via-ohci1394.txt for more 1643 See Documentation/debugging-via-ohci1394.txt for more
1593 info. 1644 info.
@@ -1659,6 +1710,8 @@ and is between 256 and 4096 characters. It is defined in the file
1659 See also Documentation/blockdev/paride.txt. 1710 See also Documentation/blockdev/paride.txt.
1660 1711
1661 pci=option[,option...] [PCI] various PCI subsystem options: 1712 pci=option[,option...] [PCI] various PCI subsystem options:
1713 earlydump [X86] dump PCI config space before the kernel
1714 changes anything
1662 off [X86] don't probe for the PCI bus 1715 off [X86] don't probe for the PCI bus
1663 bios [X86-32] force use of PCI BIOS, don't access 1716 bios [X86-32] force use of PCI BIOS, don't access
1664 the hardware directly. Use this if your machine 1717 the hardware directly. Use this if your machine
@@ -1676,7 +1729,7 @@ and is between 256 and 4096 characters. It is defined in the file
1676 disable the use of PCIE advanced error reporting. 1729 disable the use of PCIE advanced error reporting.
1677 nodomains [PCI] Disable support for multiple PCI 1730 nodomains [PCI] Disable support for multiple PCI
1678 root domains (aka PCI segments, in ACPI-speak). 1731 root domains (aka PCI segments, in ACPI-speak).
1679 nommconf [X86-32,X86_64] Disable use of MMCONFIG for PCI 1732 nommconf [X86] Disable use of MMCONFIG for PCI
1680 Configuration 1733 Configuration
1681 nomsi [MSI] If the PCI_MSI kernel config parameter is 1734 nomsi [MSI] If the PCI_MSI kernel config parameter is
1682 enabled, this kernel boot option can be used to 1735 enabled, this kernel boot option can be used to
@@ -1758,6 +1811,15 @@ and is between 256 and 4096 characters. It is defined in the file
1758 cbmemsize=nn[KMG] The fixed amount of bus space which is 1811 cbmemsize=nn[KMG] The fixed amount of bus space which is
1759 reserved for the CardBus bridge's memory 1812 reserved for the CardBus bridge's memory
1760 window. The default value is 64 megabytes. 1813 window. The default value is 64 megabytes.
1814 resource_alignment=
1815 Format:
1816 [<order of align>@][<domain>:]<bus>:<slot>.<func>[; ...]
1817 Specifies alignment and device to reassign
1818 aligned memory resources.
1819 If <order of align> is not specified,
1820 PAGE_SIZE is used as alignment.
1821 PCI-PCI bridge can be specified, if resource
1822 windows need to be expanded.
1761 1823
1762 pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power 1824 pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power
1763 Management. 1825 Management.
@@ -1816,10 +1878,11 @@ and is between 256 and 4096 characters. It is defined in the file
1816 autoconfiguration. 1878 autoconfiguration.
1817 Ranges are in pairs (memory base and size). 1879 Ranges are in pairs (memory base and size).
1818 1880
1819 dynamic_printk Enables pr_debug()/dev_dbg() calls if 1881 ports= [IP_VS_FTP] IPVS ftp helper module
1820 CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled. 1882 Default is 21.
1821 These can also be switched on/off via 1883 Up to 8 (IP_VS_APP_MAX_PORTS) ports
1822 <debugfs>/dynamic_printk/modules 1884 may be specified.
1885 Format: <port>,<port>....
1823 1886
1824 print-fatal-signals= 1887 print-fatal-signals=
1825 [KNL] debug: print fatal signals 1888 [KNL] debug: print fatal signals
@@ -1830,6 +1893,14 @@ and is between 256 and 4096 characters. It is defined in the file
1830 printk.time= Show timing data prefixed to each printk message line 1893 printk.time= Show timing data prefixed to each printk message line
1831 Format: <bool> (1/Y/y=enable, 0/N/n=disable) 1894 Format: <bool> (1/Y/y=enable, 0/N/n=disable)
1832 1895
1896 processor.max_cstate= [HW,ACPI]
1897 Limit processor to maximum C-state
1898 max_cstate=9 overrides any DMI blacklist limit.
1899
1900 processor.nocst [HW,ACPI]
1901 Ignore the _CST method to determine C-states,
1902 instead using the legacy FADT method
1903
1833 profile= [KNL] Enable kernel profiling via /proc/profile 1904 profile= [KNL] Enable kernel profiling via /proc/profile
1834 Format: [schedule,]<number> 1905 Format: [schedule,]<number>
1835 Param: "schedule" - profile schedule points. 1906 Param: "schedule" - profile schedule points.
@@ -1839,14 +1910,6 @@ and is between 256 and 4096 characters. It is defined in the file
1839 Requires CONFIG_SCHEDSTATS 1910 Requires CONFIG_SCHEDSTATS
1840 Param: "kvm" - profile VM exits. 1911 Param: "kvm" - profile VM exits.
1841 1912
1842 processor.max_cstate= [HW,ACPI]
1843 Limit processor to maximum C-state
1844 max_cstate=9 overrides any DMI blacklist limit.
1845
1846 processor.nocst [HW,ACPI]
1847 Ignore the _CST method to determine C-states,
1848 instead using the legacy FADT method
1849
1850 prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk 1913 prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk
1851 before loading. 1914 before loading.
1852 See Documentation/blockdev/ramdisk.txt. 1915 See Documentation/blockdev/ramdisk.txt.
@@ -1911,7 +1974,7 @@ and is between 256 and 4096 characters. It is defined in the file
1911 1974
1912 relax_domain_level= 1975 relax_domain_level=
1913 [KNL, SMP] Set scheduler's default relax_domain_level. 1976 [KNL, SMP] Set scheduler's default relax_domain_level.
1914 See Documentation/cpusets.txt. 1977 See Documentation/cgroups/cpusets.txt.
1915 1978
1916 reserve= [KNL,BUGS] Force the kernel to ignore some iomem area 1979 reserve= [KNL,BUGS] Force the kernel to ignore some iomem area
1917 1980
@@ -2000,7 +2063,13 @@ and is between 256 and 4096 characters. It is defined in the file
2000 allowing boot to proceed. none ignores them, expecting 2063 allowing boot to proceed. none ignores them, expecting
2001 user space to do the scan. 2064 user space to do the scan.
2002 2065
2003 selinux [SELINUX] Disable or enable SELinux at boot time. 2066 security= [SECURITY] Choose a security module to enable at boot.
2067 If this boot parameter is not specified, only the first
2068 security module asking for security registration will be
2069 loaded. An invalid security module name will be treated
2070 as if no module has been chosen.
2071
2072 selinux= [SELINUX] Disable or enable SELinux at boot time.
2004 Format: { "0" | "1" } 2073 Format: { "0" | "1" }
2005 See security/selinux/Kconfig help text. 2074 See security/selinux/Kconfig help text.
2006 0 -- disable. 2075 0 -- disable.
@@ -2009,15 +2078,6 @@ and is between 256 and 4096 characters. It is defined in the file
2009 If enabled at boot time, /selinux/disable can be used 2078 If enabled at boot time, /selinux/disable can be used
2010 later to disable prior to initial policy load. 2079 later to disable prior to initial policy load.
2011 2080
2012 selinux_compat_net =
2013 [SELINUX] Set initial selinux_compat_net flag value.
2014 Format: { "0" | "1" }
2015 0 -- use new secmark-based packet controls
2016 1 -- use legacy packet controls
2017 Default value is 0 (preferred).
2018 Value can be changed at runtime via
2019 /selinux/compat_net.
2020
2021 serialnumber [BUGS=X86-32] 2081 serialnumber [BUGS=X86-32]
2022 2082
2023 shapers= [NET] 2083 shapers= [NET]
@@ -2329,6 +2389,8 @@ and is between 256 and 4096 characters. It is defined in the file
2329 2389
2330 tp720= [HW,PS2] 2390 tp720= [HW,PS2]
2331 2391
2392 trace_buf_size=nn[KMG] [ftrace] will set tracing buffer size.
2393
2332 trix= [HW,OSS] MediaTrix AudioTrix Pro 2394 trix= [HW,OSS] MediaTrix AudioTrix Pro
2333 Format: 2395 Format:
2334 <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq> 2396 <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
@@ -2364,7 +2426,7 @@ and is between 256 and 4096 characters. It is defined in the file
2364 reported either. 2426 reported either.
2365 2427
2366 unknown_nmi_panic 2428 unknown_nmi_panic
2367 [X86-32,X86-64] 2429 [X86]
2368 Set unknown_nmi_panic=1 early on boot. 2430 Set unknown_nmi_panic=1 early on boot.
2369 2431
2370 usbcore.autosuspend= 2432 usbcore.autosuspend=
@@ -2431,15 +2493,12 @@ and is between 256 and 4096 characters. It is defined in the file
2431 medium is write-protected). 2493 medium is write-protected).
2432 Example: quirks=0419:aaf5:rl,0421:0433:rc 2494 Example: quirks=0419:aaf5:rl,0421:0433:rc
2433 2495
2434 add_efi_memmap [EFI; x86-32,X86-64] Include EFI memory map in 2496 vdso= [X86,SH]
2435 kernel's map of available physical RAM.
2436
2437 vdso= [X86-32,SH,x86-64]
2438 vdso=2: enable compat VDSO (default with COMPAT_VDSO) 2497 vdso=2: enable compat VDSO (default with COMPAT_VDSO)
2439 vdso=1: enable VDSO (default) 2498 vdso=1: enable VDSO (default)
2440 vdso=0: disable VDSO mapping 2499 vdso=0: disable VDSO mapping
2441 2500
2442 vdso32= [X86-32,X86-64] 2501 vdso32= [X86]
2443 vdso32=2: enable compat VDSO (default with COMPAT_VDSO) 2502 vdso32=2: enable compat VDSO (default with COMPAT_VDSO)
2444 vdso32=1: enable 32-bit VDSO (default) 2503 vdso32=1: enable 32-bit VDSO (default)
2445 vdso32=0: disable 32-bit VDSO mapping 2504 vdso32=0: disable 32-bit VDSO mapping
@@ -2472,6 +2531,31 @@ and is between 256 and 4096 characters. It is defined in the file
2472 vmpoff= [KNL,S390] Perform z/VM CP command after power off. 2531 vmpoff= [KNL,S390] Perform z/VM CP command after power off.
2473 Format: <command> 2532 Format: <command>
2474 2533
2534 vt.default_blu= [VT]
2535 Format: <blue0>,<blue1>,<blue2>,...,<blue15>
2536 Change the default blue palette of the console.
2537 This is a 16-member array composed of values
2538 ranging from 0-255.
2539
2540 vt.default_grn= [VT]
2541 Format: <green0>,<green1>,<green2>,...,<green15>
2542 Change the default green palette of the console.
2543 This is a 16-member array composed of values
2544 ranging from 0-255.
2545
2546 vt.default_red= [VT]
2547 Format: <red0>,<red1>,<red2>,...,<red15>
2548 Change the default red palette of the console.
2549 This is a 16-member array composed of values
2550 ranging from 0-255.
2551
2552 vt.default_utf8=
2553 [VT]
2554 Format=<0|1>
2555 Set system-wide default UTF-8 mode for all tty's.
2556 Default is 1, i.e. UTF-8 mode is enabled for all
2557 newly opened terminals.
2558
2475 waveartist= [HW,OSS] 2559 waveartist= [HW,OSS]
2476 Format: <io>,<irq>,<dma>,<dma2> 2560 Format: <io>,<irq>,<dma>,<dma2>
2477 2561
@@ -2484,6 +2568,10 @@ and is between 256 and 4096 characters. It is defined in the file
2484 wdt= [WDT] Watchdog 2568 wdt= [WDT] Watchdog
2485 See Documentation/watchdog/wdt.txt. 2569 See Documentation/watchdog/wdt.txt.
2486 2570
2571 x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
2572 default x2apic cluster mode on platforms
2573 supporting x2apic.
2574
2487 xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. 2575 xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks.
2488 xd_geo= See header of drivers/block/xd.c. 2576 xd_geo= See header of drivers/block/xd.c.
2489 2577
@@ -2491,9 +2579,6 @@ and is between 256 and 4096 characters. It is defined in the file
2491 Format: 2579 Format:
2492 <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]] 2580 <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
2493 2581
2494 norandmaps Don't use address space randomization. Equivalent to
2495 echo 0 > /proc/sys/kernel/randomize_va_space
2496
2497______________________________________________________________________ 2582______________________________________________________________________
2498 2583
2499TODO: 2584TODO:
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 48b3de90eb1e..1e7a769a10f9 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -212,7 +212,9 @@ hit, Kprobes calls kp->pre_handler. After the probed instruction
212is single-stepped, Kprobe calls kp->post_handler. If a fault 212is single-stepped, Kprobe calls kp->post_handler. If a fault
213occurs during execution of kp->pre_handler or kp->post_handler, 213occurs during execution of kp->pre_handler or kp->post_handler,
214or during single-stepping of the probed instruction, Kprobes calls 214or during single-stepping of the probed instruction, Kprobes calls
215kp->fault_handler. Any or all handlers can be NULL. 215kp->fault_handler. Any or all handlers can be NULL. If kp->flags
216is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled,
217so, it's handlers aren't hit until calling enable_kprobe(kp).
216 218
217NOTE: 219NOTE:
2181. With the introduction of the "symbol_name" field to struct kprobe, 2201. With the introduction of the "symbol_name" field to struct kprobe,
@@ -363,6 +365,26 @@ probes) in the specified array, they clear the addr field of those
363incorrect probes. However, other probes in the array are 365incorrect probes. However, other probes in the array are
364unregistered correctly. 366unregistered correctly.
365 367
3684.7 disable_*probe
369
370#include <linux/kprobes.h>
371int disable_kprobe(struct kprobe *kp);
372int disable_kretprobe(struct kretprobe *rp);
373int disable_jprobe(struct jprobe *jp);
374
375Temporarily disables the specified *probe. You can enable it again by using
376enable_*probe(). You must specify the probe which has been registered.
377
3784.8 enable_*probe
379
380#include <linux/kprobes.h>
381int enable_kprobe(struct kprobe *kp);
382int enable_kretprobe(struct kretprobe *rp);
383int enable_jprobe(struct jprobe *jp);
384
385Enables *probe which has been disabled by disable_*probe(). You must specify
386the probe which has been registered.
387
3665. Kprobes Features and Limitations 3885. Kprobes Features and Limitations
367 389
368Kprobes allows multiple probes at the same address. Currently, 390Kprobes allows multiple probes at the same address. Currently,
@@ -500,10 +522,14 @@ the probe. If the probed function belongs to a module, the module name
500is also specified. Following columns show probe status. If the probe is on 522is also specified. Following columns show probe status. If the probe is on
501a virtual address that is no longer valid (module init sections, module 523a virtual address that is no longer valid (module init sections, module
502virtual addresses that correspond to modules that've been unloaded), 524virtual addresses that correspond to modules that've been unloaded),
503such probes are marked with [GONE]. 525such probes are marked with [GONE]. If the probe is temporarily disabled,
526such probes are marked with [DISABLED].
504 527
505/debug/kprobes/enabled: Turn kprobes ON/OFF 528/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly.
506 529
507Provides a knob to globally turn registered kprobes ON or OFF. By default, 530Provides a knob to globally and forcibly turn registered kprobes ON or OFF.
508all kprobes are enabled. By echoing "0" to this file, all registered probes 531By default, all kprobes are enabled. By echoing "0" to this file, all
509will be disarmed, till such time a "1" is echoed to this file. 532registered probes will be disarmed, till such time a "1" is echoed to this
533file. Note that this knob just disarms and arms all kprobes and doesn't
534change each probe's disabling state. This means that disabled kprobes (marked
535[DISABLED]) will be not enabled if you turn ON all kprobes by this knob.
diff --git a/Documentation/laptops/acer-wmi.txt b/Documentation/laptops/acer-wmi.txt
index 2b3a6b5260bf..5ee2a02b3b40 100644
--- a/Documentation/laptops/acer-wmi.txt
+++ b/Documentation/laptops/acer-wmi.txt
@@ -1,9 +1,9 @@
1Acer Laptop WMI Extras Driver 1Acer Laptop WMI Extras Driver
2http://code.google.com/p/aceracpi 2http://code.google.com/p/aceracpi
3Version 0.2 3Version 0.3
418th August 2008 44th April 2009
5 5
6Copyright 2007-2008 Carlos Corbacho <carlos@strangeworlds.co.uk> 6Copyright 2007-2009 Carlos Corbacho <carlos@strangeworlds.co.uk>
7 7
8acer-wmi is a driver to allow you to control various parts of your Acer laptop 8acer-wmi is a driver to allow you to control various parts of your Acer laptop
9hardware under Linux which are exposed via ACPI-WMI. 9hardware under Linux which are exposed via ACPI-WMI.
@@ -36,6 +36,10 @@ not possible in kernel space from a 64 bit OS.
36Supported Hardware 36Supported Hardware
37****************** 37******************
38 38
39NOTE: The Acer Aspire One is not supported hardware. It cannot work with
40acer-wmi until Acer fix their ACPI-WMI implementation on them, so has been
41blacklisted until that happens.
42
39Please see the website for the current list of known working hardare: 43Please see the website for the current list of known working hardare:
40 44
41http://code.google.com/p/aceracpi/wiki/SupportedHardware 45http://code.google.com/p/aceracpi/wiki/SupportedHardware
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt
index 41bc99fa1884..e7e9a69069e1 100644
--- a/Documentation/laptops/thinkpad-acpi.txt
+++ b/Documentation/laptops/thinkpad-acpi.txt
@@ -1,7 +1,7 @@
1 ThinkPad ACPI Extras Driver 1 ThinkPad ACPI Extras Driver
2 2
3 Version 0.22 3 Version 0.23
4 November 23rd, 2008 4 April 10th, 2009
5 5
6 Borislav Deianov <borislav@users.sf.net> 6 Borislav Deianov <borislav@users.sf.net>
7 Henrique de Moraes Holschuh <hmh@hmh.eng.br> 7 Henrique de Moraes Holschuh <hmh@hmh.eng.br>
@@ -20,7 +20,8 @@ moved to the drivers/misc tree and renamed to thinkpad-acpi for kernel
20kernel 2.6.29 and release 0.22. 20kernel 2.6.29 and release 0.22.
21 21
22The driver is named "thinkpad-acpi". In some places, like module 22The driver is named "thinkpad-acpi". In some places, like module
23names, "thinkpad_acpi" is used because of userspace issues. 23names and log messages, "thinkpad_acpi" is used because of userspace
24issues.
24 25
25"tpacpi" is used as a shorthand where "thinkpad-acpi" would be too 26"tpacpi" is used as a shorthand where "thinkpad-acpi" would be too
26long due to length limitations on some Linux kernel versions. 27long due to length limitations on some Linux kernel versions.
@@ -37,7 +38,7 @@ detailed description):
37 - ThinkLight on and off 38 - ThinkLight on and off
38 - limited docking and undocking 39 - limited docking and undocking
39 - UltraBay eject 40 - UltraBay eject
40 - CMOS control 41 - CMOS/UCMS control
41 - LED control 42 - LED control
42 - ACPI sounds 43 - ACPI sounds
43 - temperature sensors 44 - temperature sensors
@@ -46,6 +47,7 @@ detailed description):
46 - Volume control 47 - Volume control
47 - Fan control and monitoring: fan speed, fan enable/disable 48 - Fan control and monitoring: fan speed, fan enable/disable
48 - WAN enable and disable 49 - WAN enable and disable
50 - UWB enable and disable
49 51
50A compatibility table by model and feature is maintained on the web 52A compatibility table by model and feature is maintained on the web
51site, http://ibm-acpi.sf.net/. I appreciate any success or failure 53site, http://ibm-acpi.sf.net/. I appreciate any success or failure
@@ -53,7 +55,7 @@ reports, especially if they add to or correct the compatibility table.
53Please include the following information in your report: 55Please include the following information in your report:
54 56
55 - ThinkPad model name 57 - ThinkPad model name
56 - a copy of your DSDT, from /proc/acpi/dsdt 58 - a copy of your ACPI tables, using the "acpidump" utility
57 - a copy of the output of dmidecode, with serial numbers 59 - a copy of the output of dmidecode, with serial numbers
58 and UUIDs masked off 60 and UUIDs masked off
59 - which driver features work and which don't 61 - which driver features work and which don't
@@ -66,17 +68,18 @@ Installation
66------------ 68------------
67 69
68If you are compiling this driver as included in the Linux kernel 70If you are compiling this driver as included in the Linux kernel
69sources, simply enable the CONFIG_THINKPAD_ACPI option, and optionally 71sources, look for the CONFIG_THINKPAD_ACPI Kconfig option.
70enable the CONFIG_THINKPAD_ACPI_BAY option if you want the 72It is located on the menu path: "Device Drivers" -> "X86 Platform
71thinkpad-specific bay functionality. 73Specific Device Drivers" -> "ThinkPad ACPI Laptop Extras".
74
72 75
73Features 76Features
74-------- 77--------
75 78
76The driver exports two different interfaces to userspace, which can be 79The driver exports two different interfaces to userspace, which can be
77used to access the features it provides. One is a legacy procfs-based 80used to access the features it provides. One is a legacy procfs-based
78interface, which will be removed at some time in the distant future. 81interface, which will be removed at some time in the future. The other
79The other is a new sysfs-based interface which is not complete yet. 82is a new sysfs-based interface which is not complete yet.
80 83
81The procfs interface creates the /proc/acpi/ibm directory. There is a 84The procfs interface creates the /proc/acpi/ibm directory. There is a
82file under that directory for each feature it supports. The procfs 85file under that directory for each feature it supports. The procfs
@@ -111,15 +114,17 @@ The version of thinkpad-acpi's sysfs interface is exported by the driver
111as a driver attribute (see below). 114as a driver attribute (see below).
112 115
113Sysfs driver attributes are on the driver's sysfs attribute space, 116Sysfs driver attributes are on the driver's sysfs attribute space,
114for 2.6.23 this is /sys/bus/platform/drivers/thinkpad_acpi/ and 117for 2.6.23+ this is /sys/bus/platform/drivers/thinkpad_acpi/ and
115/sys/bus/platform/drivers/thinkpad_hwmon/ 118/sys/bus/platform/drivers/thinkpad_hwmon/
116 119
117Sysfs device attributes are on the thinkpad_acpi device sysfs attribute 120Sysfs device attributes are on the thinkpad_acpi device sysfs attribute
118space, for 2.6.23 this is /sys/devices/platform/thinkpad_acpi/. 121space, for 2.6.23+ this is /sys/devices/platform/thinkpad_acpi/.
119 122
120Sysfs device attributes for the sensors and fan are on the 123Sysfs device attributes for the sensors and fan are on the
121thinkpad_hwmon device's sysfs attribute space, but you should locate it 124thinkpad_hwmon device's sysfs attribute space, but you should locate it
122looking for a hwmon device with the name attribute of "thinkpad". 125looking for a hwmon device with the name attribute of "thinkpad", or
126better yet, through libsensors.
127
123 128
124Driver version 129Driver version
125-------------- 130--------------
@@ -129,6 +134,7 @@ sysfs driver attribute: version
129 134
130The driver name and version. No commands can be written to this file. 135The driver name and version. No commands can be written to this file.
131 136
137
132Sysfs interface version 138Sysfs interface version
133----------------------- 139-----------------------
134 140
@@ -160,6 +166,7 @@ expect that an attribute might not be there, and deal with it properly
160(an attribute not being there *is* a valid way to make it clear that a 166(an attribute not being there *is* a valid way to make it clear that a
161feature is not available in sysfs). 167feature is not available in sysfs).
162 168
169
163Hot keys 170Hot keys
164-------- 171--------
165 172
@@ -172,17 +179,14 @@ system. Enabling the hotkey functionality of thinkpad-acpi signals the
172firmware that such a driver is present, and modifies how the ThinkPad 179firmware that such a driver is present, and modifies how the ThinkPad
173firmware will behave in many situations. 180firmware will behave in many situations.
174 181
175The driver enables the hot key feature automatically when loaded. The 182The driver enables the HKEY ("hot key") event reporting automatically
176feature can later be disabled and enabled back at runtime. The driver 183when loaded, and disables it when it is removed.
177will also restore the hot key feature to its previous state and mask
178when it is unloaded.
179 184
180When the hotkey feature is enabled and the hot key mask is set (see 185The driver will report HKEY events in the following format:
181below), the driver will report HKEY events in the following format:
182 186
183 ibm/hotkey HKEY 00000080 0000xxxx 187 ibm/hotkey HKEY 00000080 0000xxxx
184 188
185Some of these events refer to hot key presses, but not all. 189Some of these events refer to hot key presses, but not all of them.
186 190
187The driver will generate events over the input layer for hot keys and 191The driver will generate events over the input layer for hot keys and
188radio switches, and over the ACPI netlink layer for other events. The 192radio switches, and over the ACPI netlink layer for other events. The
@@ -214,13 +218,17 @@ procfs notes:
214 218
215The following commands can be written to the /proc/acpi/ibm/hotkey file: 219The following commands can be written to the /proc/acpi/ibm/hotkey file:
216 220
217 echo enable > /proc/acpi/ibm/hotkey -- enable the hot keys feature
218 echo disable > /proc/acpi/ibm/hotkey -- disable the hot keys feature
219 echo 0xffffffff > /proc/acpi/ibm/hotkey -- enable all hot keys 221 echo 0xffffffff > /proc/acpi/ibm/hotkey -- enable all hot keys
220 echo 0 > /proc/acpi/ibm/hotkey -- disable all possible hot keys 222 echo 0 > /proc/acpi/ibm/hotkey -- disable all possible hot keys
221 ... any other 8-hex-digit mask ... 223 ... any other 8-hex-digit mask ...
222 echo reset > /proc/acpi/ibm/hotkey -- restore the original mask 224 echo reset > /proc/acpi/ibm/hotkey -- restore the original mask
223 225
226The following commands have been deprecated and will cause the kernel
227to log a warning:
228
229 echo enable > /proc/acpi/ibm/hotkey -- does nothing
230 echo disable > /proc/acpi/ibm/hotkey -- returns an error
231
224The procfs interface does not support NVRAM polling control. So as to 232The procfs interface does not support NVRAM polling control. So as to
225maintain maximum bug-to-bug compatibility, it does not report any masks, 233maintain maximum bug-to-bug compatibility, it does not report any masks,
226nor does it allow one to manipulate the hot key mask when the firmware 234nor does it allow one to manipulate the hot key mask when the firmware
@@ -229,12 +237,9 @@ does not support masks at all, even if NVRAM polling is in use.
229sysfs notes: 237sysfs notes:
230 238
231 hotkey_bios_enabled: 239 hotkey_bios_enabled:
232 Returns the status of the hot keys feature when 240 DEPRECATED, WILL BE REMOVED SOON.
233 thinkpad-acpi was loaded. Upon module unload, the hot
234 key feature status will be restored to this value.
235 241
236 0: hot keys were disabled 242 Returns 0.
237 1: hot keys were enabled (unusual)
238 243
239 hotkey_bios_mask: 244 hotkey_bios_mask:
240 Returns the hot keys mask when thinkpad-acpi was loaded. 245 Returns the hot keys mask when thinkpad-acpi was loaded.
@@ -242,13 +247,10 @@ sysfs notes:
242 to this value. 247 to this value.
243 248
244 hotkey_enable: 249 hotkey_enable:
245 Enables/disables the hot keys feature in the ACPI 250 DEPRECATED, WILL BE REMOVED SOON.
246 firmware, and reports current status of the hot keys
247 feature. Has no effect on the NVRAM hot key polling
248 functionality.
249 251
250 0: disables the hot keys feature / feature disabled 252 0: returns -EPERM
251 1: enables the hot keys feature / feature enabled 253 1: does nothing
252 254
253 hotkey_mask: 255 hotkey_mask:
254 bit mask to enable driver-handling (and depending on 256 bit mask to enable driver-handling (and depending on
@@ -618,6 +620,7 @@ For Lenovo models *with* ACPI backlight control:
618 and map them to KEY_BRIGHTNESS_UP and KEY_BRIGHTNESS_DOWN. Process 620 and map them to KEY_BRIGHTNESS_UP and KEY_BRIGHTNESS_DOWN. Process
619 these keys on userspace somehow (e.g. by calling xbacklight). 621 these keys on userspace somehow (e.g. by calling xbacklight).
620 622
623
621Bluetooth 624Bluetooth
622--------- 625---------
623 626
@@ -628,6 +631,9 @@ sysfs rfkill class: switch "tpacpi_bluetooth_sw"
628This feature shows the presence and current state of a ThinkPad 631This feature shows the presence and current state of a ThinkPad
629Bluetooth device in the internal ThinkPad CDC slot. 632Bluetooth device in the internal ThinkPad CDC slot.
630 633
634If the ThinkPad supports it, the Bluetooth state is stored in NVRAM,
635so it is kept across reboots and power-off.
636
631Procfs notes: 637Procfs notes:
632 638
633If Bluetooth is installed, the following commands can be used: 639If Bluetooth is installed, the following commands can be used:
@@ -652,6 +658,7 @@ Sysfs notes:
652 rfkill controller switch "tpacpi_bluetooth_sw": refer to 658 rfkill controller switch "tpacpi_bluetooth_sw": refer to
653 Documentation/rfkill.txt for details. 659 Documentation/rfkill.txt for details.
654 660
661
655Video output control -- /proc/acpi/ibm/video 662Video output control -- /proc/acpi/ibm/video
656-------------------------------------------- 663--------------------------------------------
657 664
@@ -693,11 +700,8 @@ Fn-F7 from working. This also disables the video output switching
693features of this driver, as it uses the same ACPI methods as 700features of this driver, as it uses the same ACPI methods as
694Fn-F7. Video switching on the console should still work. 701Fn-F7. Video switching on the console should still work.
695 702
696UPDATE: There's now a patch for the X.org Radeon driver which 703UPDATE: refer to https://bugs.freedesktop.org/show_bug.cgi?id=2000
697addresses this issue. Some people are reporting success with the patch
698while others are still having problems. For more information:
699 704
700https://bugs.freedesktop.org/show_bug.cgi?id=2000
701 705
702ThinkLight control 706ThinkLight control
703------------------ 707------------------
@@ -720,10 +724,11 @@ The ThinkLight sysfs interface is documented by the LED class
720documentation, in Documentation/leds-class.txt. The ThinkLight LED name 724documentation, in Documentation/leds-class.txt. The ThinkLight LED name
721is "tpacpi::thinklight". 725is "tpacpi::thinklight".
722 726
723Due to limitations in the sysfs LED class, if the status of the thinklight 727Due to limitations in the sysfs LED class, if the status of the ThinkLight
724cannot be read or if it is unknown, thinkpad-acpi will report it as "off". 728cannot be read or if it is unknown, thinkpad-acpi will report it as "off".
725It is impossible to know if the status returned through sysfs is valid. 729It is impossible to know if the status returned through sysfs is valid.
726 730
731
727Docking / undocking -- /proc/acpi/ibm/dock 732Docking / undocking -- /proc/acpi/ibm/dock
728------------------------------------------ 733------------------------------------------
729 734
@@ -784,6 +789,7 @@ the only docking stations currently supported are the X-series
784UltraBase docks and "dumb" port replicators like the Mini Dock (the 789UltraBase docks and "dumb" port replicators like the Mini Dock (the
785latter don't need any ACPI support, actually). 790latter don't need any ACPI support, actually).
786 791
792
787UltraBay eject -- /proc/acpi/ibm/bay 793UltraBay eject -- /proc/acpi/ibm/bay
788------------------------------------ 794------------------------------------
789 795
@@ -847,8 +853,9 @@ supported. Use "eject2" instead of "eject" for the second bay.
847Note: the UltraBay eject support on the 600e/x, A22p and A3x is 853Note: the UltraBay eject support on the 600e/x, A22p and A3x is
848EXPERIMENTAL and may not work as expected. USE WITH CAUTION! 854EXPERIMENTAL and may not work as expected. USE WITH CAUTION!
849 855
850CMOS control 856
851------------ 857CMOS/UCMS control
858-----------------
852 859
853procfs: /proc/acpi/ibm/cmos 860procfs: /proc/acpi/ibm/cmos
854sysfs device attribute: cmos_command 861sysfs device attribute: cmos_command
@@ -882,6 +889,7 @@ The cmos command interface is prone to firmware split-brain problems, as
882in newer ThinkPads it is just a compatibility layer. Do not use it, it is 889in newer ThinkPads it is just a compatibility layer. Do not use it, it is
883exported just as a debug tool. 890exported just as a debug tool.
884 891
892
885LED control 893LED control
886----------- 894-----------
887 895
@@ -893,6 +901,17 @@ some older ThinkPad models, it is possible to query the status of the
893LED indicators as well. Newer ThinkPads cannot query the real status 901LED indicators as well. Newer ThinkPads cannot query the real status
894of the LED indicators. 902of the LED indicators.
895 903
904Because misuse of the LEDs could induce an unaware user to perform
905dangerous actions (like undocking or ejecting a bay device while the
906buses are still active), or mask an important alarm (such as a nearly
907empty battery, or a broken battery), access to most LEDs is
908restricted.
909
910Unrestricted access to all LEDs requires that thinkpad-acpi be
911compiled with the CONFIG_THINKPAD_ACPI_UNSAFE_LEDS option enabled.
912Distributions must never enable this option. Individual users that
913are aware of the consequences are welcome to enabling it.
914
896procfs notes: 915procfs notes:
897 916
898The available commands are: 917The available commands are:
@@ -939,6 +958,7 @@ ThinkPad indicator LED should blink in hardware accelerated mode, use the
939"timer" trigger, and leave the delay_on and delay_off parameters set to 958"timer" trigger, and leave the delay_on and delay_off parameters set to
940zero (to request hardware acceleration autodetection). 959zero (to request hardware acceleration autodetection).
941 960
961
942ACPI sounds -- /proc/acpi/ibm/beep 962ACPI sounds -- /proc/acpi/ibm/beep
943---------------------------------- 963----------------------------------
944 964
@@ -968,6 +988,7 @@ X40:
968 16 - one medium-pitched beep repeating constantly, stop with 17 988 16 - one medium-pitched beep repeating constantly, stop with 17
969 17 - stop 16 989 17 - stop 16
970 990
991
971Temperature sensors 992Temperature sensors
972------------------- 993-------------------
973 994
@@ -1115,6 +1136,7 @@ registers contain the current battery capacity, etc. If you experiment
1115with this, do send me your results (including some complete dumps with 1136with this, do send me your results (including some complete dumps with
1116a description of the conditions when they were taken.) 1137a description of the conditions when they were taken.)
1117 1138
1139
1118LCD brightness control 1140LCD brightness control
1119---------------------- 1141----------------------
1120 1142
@@ -1124,10 +1146,9 @@ sysfs backlight device "thinkpad_screen"
1124This feature allows software control of the LCD brightness on ThinkPad 1146This feature allows software control of the LCD brightness on ThinkPad
1125models which don't have a hardware brightness slider. 1147models which don't have a hardware brightness slider.
1126 1148
1127It has some limitations: the LCD backlight cannot be actually turned on or 1149It has some limitations: the LCD backlight cannot be actually turned
1128off by this interface, and in many ThinkPad models, the "dim while on 1150on or off by this interface, it just controls the backlight brightness
1129battery" functionality will be enabled by the BIOS when this interface is 1151level.
1130used, and cannot be controlled.
1131 1152
1132On IBM (and some of the earlier Lenovo) ThinkPads, the backlight control 1153On IBM (and some of the earlier Lenovo) ThinkPads, the backlight control
1133has eight brightness levels, ranging from 0 to 7. Some of the levels 1154has eight brightness levels, ranging from 0 to 7. Some of the levels
@@ -1136,10 +1157,15 @@ display backlight brightness control methods have 16 levels, ranging
1136from 0 to 15. 1157from 0 to 15.
1137 1158
1138There are two interfaces to the firmware for direct brightness control, 1159There are two interfaces to the firmware for direct brightness control,
1139EC and CMOS. To select which one should be used, use the 1160EC and UCMS (or CMOS). To select which one should be used, use the
1140brightness_mode module parameter: brightness_mode=1 selects EC mode, 1161brightness_mode module parameter: brightness_mode=1 selects EC mode,
1141brightness_mode=2 selects CMOS mode, brightness_mode=3 selects both EC 1162brightness_mode=2 selects UCMS mode, brightness_mode=3 selects EC
1142and CMOS. The driver tries to auto-detect which interface to use. 1163mode with NVRAM backing (so that brightness changes are remembered
1164across shutdown/reboot).
1165
1166The driver tries to select which interface to use from a table of
1167defaults for each ThinkPad model. If it makes a wrong choice, please
1168report this as a bug, so that we can fix it.
1143 1169
1144When display backlight brightness controls are available through the 1170When display backlight brightness controls are available through the
1145standard ACPI interface, it is best to use it instead of this direct 1171standard ACPI interface, it is best to use it instead of this direct
@@ -1201,6 +1227,7 @@ WARNING:
1201 and maybe reduce the life of the backlight lamps by needlessly kicking 1227 and maybe reduce the life of the backlight lamps by needlessly kicking
1202 its level up and down at every change. 1228 its level up and down at every change.
1203 1229
1230
1204Volume control -- /proc/acpi/ibm/volume 1231Volume control -- /proc/acpi/ibm/volume
1205--------------------------------------- 1232---------------------------------------
1206 1233
@@ -1217,6 +1244,11 @@ distinct. The unmute the volume after the mute command, use either the
1217up or down command (the level command will not unmute the volume). 1244up or down command (the level command will not unmute the volume).
1218The current volume level and mute state is shown in the file. 1245The current volume level and mute state is shown in the file.
1219 1246
1247The ALSA mixer interface to this feature is still missing, but patches
1248to add it exist. That problem should be addressed in the not so
1249distant future.
1250
1251
1220Fan control and monitoring: fan speed, fan enable/disable 1252Fan control and monitoring: fan speed, fan enable/disable
1221--------------------------------------------------------- 1253---------------------------------------------------------
1222 1254
@@ -1383,8 +1415,11 @@ procfs: /proc/acpi/ibm/wan
1383sysfs device attribute: wwan_enable (deprecated) 1415sysfs device attribute: wwan_enable (deprecated)
1384sysfs rfkill class: switch "tpacpi_wwan_sw" 1416sysfs rfkill class: switch "tpacpi_wwan_sw"
1385 1417
1386This feature shows the presence and current state of a W-WAN (Sierra 1418This feature shows the presence and current state of the built-in
1387Wireless EV-DO) device. 1419Wireless WAN device.
1420
1421If the ThinkPad supports it, the WWAN state is stored in NVRAM,
1422so it is kept across reboots and power-off.
1388 1423
1389It was tested on a Lenovo ThinkPad X60. It should probably work on other 1424It was tested on a Lenovo ThinkPad X60. It should probably work on other
1390ThinkPad models which come with this module installed. 1425ThinkPad models which come with this module installed.
@@ -1413,6 +1448,7 @@ Sysfs notes:
1413 rfkill controller switch "tpacpi_wwan_sw": refer to 1448 rfkill controller switch "tpacpi_wwan_sw": refer to
1414 Documentation/rfkill.txt for details. 1449 Documentation/rfkill.txt for details.
1415 1450
1451
1416EXPERIMENTAL: UWB 1452EXPERIMENTAL: UWB
1417----------------- 1453-----------------
1418 1454
@@ -1431,6 +1467,7 @@ Sysfs notes:
1431 rfkill controller switch "tpacpi_uwb_sw": refer to 1467 rfkill controller switch "tpacpi_uwb_sw": refer to
1432 Documentation/rfkill.txt for details. 1468 Documentation/rfkill.txt for details.
1433 1469
1470
1434Multiple Commands, Module Parameters 1471Multiple Commands, Module Parameters
1435------------------------------------ 1472------------------------------------
1436 1473
@@ -1445,6 +1482,7 @@ for example:
1445 1482
1446 modprobe thinkpad_acpi hotkey=enable,0xffff video=auto_disable 1483 modprobe thinkpad_acpi hotkey=enable,0xffff video=auto_disable
1447 1484
1485
1448Enabling debugging output 1486Enabling debugging output
1449------------------------- 1487-------------------------
1450 1488
@@ -1457,8 +1495,15 @@ will enable all debugging output classes. It takes a bitmask, so
1457to enable more than one output class, just add their values. 1495to enable more than one output class, just add their values.
1458 1496
1459 Debug bitmask Description 1497 Debug bitmask Description
1498 0x8000 Disclose PID of userspace programs
1499 accessing some functions of the driver
1460 0x0001 Initialization and probing 1500 0x0001 Initialization and probing
1461 0x0002 Removal 1501 0x0002 Removal
1502 0x0004 RF Transmitter control (RFKILL)
1503 (bluetooth, WWAN, UWB...)
1504 0x0008 HKEY event interface, hotkeys
1505 0x0010 Fan control
1506 0x0020 Backlight brightness
1462 1507
1463There is also a kernel build option to enable more debugging 1508There is also a kernel build option to enable more debugging
1464information, which may be necessary to debug driver problems. 1509information, which may be necessary to debug driver problems.
@@ -1467,6 +1512,7 @@ The level of debugging information output by the driver can be changed
1467at runtime through sysfs, using the driver attribute debug_level. The 1512at runtime through sysfs, using the driver attribute debug_level. The
1468attribute takes the same bitmask as the debug module parameter above. 1513attribute takes the same bitmask as the debug module parameter above.
1469 1514
1515
1470Force loading of module 1516Force loading of module
1471----------------------- 1517-----------------------
1472 1518
@@ -1505,3 +1551,7 @@ Sysfs interface changelog:
1505 1551
15060x020200: Add poll()/select() support to the following attributes: 15520x020200: Add poll()/select() support to the following attributes:
1507 hotkey_radio_sw, wakeup_hotunplug_complete, wakeup_reason 1553 hotkey_radio_sw, wakeup_hotunplug_complete, wakeup_reason
1554
15550x020300: hotkey enable/disable support removed, attributes
1556 hotkey_bios_enabled and hotkey_enable deprecated and
1557 marked for removal.
diff --git a/Documentation/lguest/.gitignore b/Documentation/lguest/.gitignore
new file mode 100644
index 000000000000..115587fd5f65
--- /dev/null
+++ b/Documentation/lguest/.gitignore
@@ -0,0 +1 @@
lguest
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index f2dbbf3bdeab..d36fcc0f2715 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1630,6 +1630,13 @@ static bool service_io(struct device *dev)
1630 } 1630 }
1631 } 1631 }
1632 1632
1633 /* OK, so we noted that it was pretty poor to use an fdatasync as a
1634 * barrier. But Christoph Hellwig points out that we need a sync
1635 * *afterwards* as well: "Barriers specify no reordering to the front
1636 * or the back." And Jens Axboe confirmed it, so here we are: */
1637 if (out->type & VIRTIO_BLK_T_BARRIER)
1638 fdatasync(vblk->fd);
1639
1633 /* We can't trigger an IRQ, because we're not the Launcher. It does 1640 /* We can't trigger an IRQ, because we're not the Launcher. It does
1634 * that when we tell it we're done. */ 1641 * that when we tell it we're done. */
1635 add_used(dev->vq, head, wlen); 1642 add_used(dev->vq, head, wlen);
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
index 29510dc51510..28c747362f95 100644
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -3,11 +3,11 @@
3 /, /` - or, A Young Coder's Illustrated Hypervisor 3 /, /` - or, A Young Coder's Illustrated Hypervisor
4 \\"--\\ http://lguest.ozlabs.org 4 \\"--\\ http://lguest.ozlabs.org
5 5
6Lguest is designed to be a minimal hypervisor for the Linux kernel, for 6Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel,
7Linux developers and users to experiment with virtualization with the 7for Linux developers and users to experiment with virtualization with the
8minimum of complexity. Nonetheless, it should have sufficient 8minimum of complexity. Nonetheless, it should have sufficient features to
9features to make it useful for specific tasks, and, of course, you are 9make it useful for specific tasks, and, of course, you are encouraged to fork
10encouraged to fork and enhance it (see drivers/lguest/README). 10and enhance it (see drivers/lguest/README).
11 11
12Features: 12Features:
13 13
@@ -37,6 +37,7 @@ Running Lguest:
37 "Paravirtualized guest support" = Y 37 "Paravirtualized guest support" = Y
38 "Lguest guest support" = Y 38 "Lguest guest support" = Y
39 "High Memory Support" = off/4GB 39 "High Memory Support" = off/4GB
40 "PAE (Physical Address Extension) Support" = N
40 "Alignment value to which kernel should be aligned" = 0x100000 41 "Alignment value to which kernel should be aligned" = 0x100000
41 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and 42 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
42 CONFIG_PHYSICAL_ALIGN=0x100000) 43 CONFIG_PHYSICAL_ALIGN=0x100000)
diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt
index 488773018152..e20d913d5914 100644
--- a/Documentation/lockdep-design.txt
+++ b/Documentation/lockdep-design.txt
@@ -27,33 +27,37 @@ lock-class.
27State 27State
28----- 28-----
29 29
30The validator tracks lock-class usage history into 5 separate state bits: 30The validator tracks lock-class usage history into 4n + 1 separate state bits:
31 31
32- 'ever held in hardirq context' [ == hardirq-safe ] 32- 'ever held in STATE context'
33- 'ever held in softirq context' [ == softirq-safe ] 33- 'ever head as readlock in STATE context'
34- 'ever held with hardirqs enabled' [ == hardirq-unsafe ] 34- 'ever head with STATE enabled'
35- 'ever held with softirqs and hardirqs enabled' [ == softirq-unsafe ] 35- 'ever head as readlock with STATE enabled'
36
37Where STATE can be either one of (kernel/lockdep_states.h)
38 - hardirq
39 - softirq
40 - reclaim_fs
36 41
37- 'ever used' [ == !unused ] 42- 'ever used' [ == !unused ]
38 43
39When locking rules are violated, these 4 state bits are presented in the 44When locking rules are violated, these state bits are presented in the
40locking error messages, inside curlies. A contrived example: 45locking error messages, inside curlies. A contrived example:
41 46
42 modprobe/2287 is trying to acquire lock: 47 modprobe/2287 is trying to acquire lock:
43 (&sio_locks[i].lock){--..}, at: [<c02867fd>] mutex_lock+0x21/0x24 48 (&sio_locks[i].lock){-.-...}, at: [<c02867fd>] mutex_lock+0x21/0x24
44 49
45 but task is already holding lock: 50 but task is already holding lock:
46 (&sio_locks[i].lock){--..}, at: [<c02867fd>] mutex_lock+0x21/0x24 51 (&sio_locks[i].lock){-.-...}, at: [<c02867fd>] mutex_lock+0x21/0x24
47 52
48 53
49The bit position indicates hardirq, softirq, hardirq-read, 54The bit position indicates STATE, STATE-read, for each of the states listed
50softirq-read respectively, and the character displayed in each 55above, and the character displayed in each indicates:
51indicates:
52 56
53 '.' acquired while irqs disabled 57 '.' acquired while irqs disabled and not in irq context
54 '+' acquired in irq context 58 '-' acquired in irq context
55 '-' acquired with irqs enabled 59 '+' acquired with irqs enabled
56 '?' read acquired in irq context with irqs enabled. 60 '?' acquired in irq context with irqs enabled.
57 61
58Unused mutexes cannot be part of the cause of an error. 62Unused mutexes cannot be part of the cause of an error.
59 63
diff --git a/Documentation/md.txt b/Documentation/md.txt
index 1da9d1b1793f..4edd39ec7db9 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -164,15 +164,19 @@ All md devices contain:
164 raid_disks 164 raid_disks
165 a text file with a simple number indicating the number of devices 165 a text file with a simple number indicating the number of devices
166 in a fully functional array. If this is not yet known, the file 166 in a fully functional array. If this is not yet known, the file
167 will be empty. If an array is being resized (not currently 167 will be empty. If an array is being resized this will contain
168 possible) this will contain the larger of the old and new sizes. 168 the new number of devices.
169 Some raid level (RAID1) allow this value to be set while the 169 Some raid levels allow this value to be set while the array is
170 array is active. This will reconfigure the array. Otherwise 170 active. This will reconfigure the array. Otherwise it can only
171 it can only be set while assembling an array. 171 be set while assembling an array.
172 A change to this attribute will not be permitted if it would
173 reduce the size of the array. To reduce the number of drives
174 in an e.g. raid5, the array size must first be reduced by
175 setting the 'array_size' attribute.
172 176
173 chunk_size 177 chunk_size
174 This is the size if bytes for 'chunks' and is only relevant to 178 This is the size in bytes for 'chunks' and is only relevant to
175 raid levels that involve striping (1,4,5,6,10). The address space 179 raid levels that involve striping (0,4,5,6,10). The address space
176 of the array is conceptually divided into chunks and consecutive 180 of the array is conceptually divided into chunks and consecutive
177 chunks are striped onto neighbouring devices. 181 chunks are striped onto neighbouring devices.
178 The size should be at least PAGE_SIZE (4k) and should be a power 182 The size should be at least PAGE_SIZE (4k) and should be a power
@@ -183,6 +187,20 @@ All md devices contain:
183 simply a number that is interpretted differently by different 187 simply a number that is interpretted differently by different
184 levels. It can be written while assembling an array. 188 levels. It can be written while assembling an array.
185 189
190 array_size
191 This can be used to artificially constrain the available space in
192 the array to be less than is actually available on the combined
193 devices. Writing a number (in Kilobytes) which is less than
194 the available size will set the size. Any reconfiguration of the
195 array (e.g. adding devices) will not cause the size to change.
196 Writing the word 'default' will cause the effective size of the
197 array to be whatever size is actually available based on
198 'level', 'chunk_size' and 'component_size'.
199
200 This can be used to reduce the size of the array before reducing
201 the number of devices in a raid4/5/6, or to support external
202 metadata formats which mandate such clipping.
203
186 reshape_position 204 reshape_position
187 This is either "none" or a sector number within the devices of 205 This is either "none" or a sector number within the devices of
188 the array where "reshape" is up to. If this is set, the three 206 the array where "reshape" is up to. If this is set, the three
@@ -207,6 +225,11 @@ All md devices contain:
207 about the array. It can be 0.90 (traditional format), 1.0, 1.1, 225 about the array. It can be 0.90 (traditional format), 1.0, 1.1,
208 1.2 (newer format in varying locations) or "none" indicating that 226 1.2 (newer format in varying locations) or "none" indicating that
209 the kernel isn't managing metadata at all. 227 the kernel isn't managing metadata at all.
228 Alternately it can be "external:" followed by a string which
229 is set by user-space. This indicates that metadata is managed
230 by a user-space program. Any device failure or other event that
231 requires a metadata update will cause array activity to be
232 suspended until the event is acknowledged.
210 233
211 resync_start 234 resync_start
212 The point at which resync should start. If no resync is needed, 235 The point at which resync should start. If no resync is needed,
diff --git a/Documentation/misc-devices/isl29003 b/Documentation/misc-devices/isl29003
new file mode 100644
index 000000000000..c4ff5f38e010
--- /dev/null
+++ b/Documentation/misc-devices/isl29003
@@ -0,0 +1,62 @@
1Kernel driver isl29003
2=====================
3
4Supported chips:
5* Intersil ISL29003
6Prefix: 'isl29003'
7Addresses scanned: none
8Datasheet:
9http://www.intersil.com/data/fn/fn7464.pdf
10
11Author: Daniel Mack <daniel@caiaq.de>
12
13
14Description
15-----------
16The ISL29003 is an integrated light sensor with a 16-bit integrating type
17ADC, I2C user programmable lux range select for optimized counts/lux, and
18I2C multi-function control and monitoring capabilities. The internal ADC
19provides 16-bit resolution while rejecting 50Hz and 60Hz flicker caused by
20artificial light sources.
21
22The driver allows to set the lux range, the bit resolution, the operational
23mode (see below) and the power state of device and can read the current lux
24value, of course.
25
26
27Detection
28---------
29
30The ISL29003 does not have an ID register which could be used to identify
31it, so the detection routine will just try to read from the configured I2C
32addess and consider the device to be present as soon as it ACKs the
33transfer.
34
35
36Sysfs entries
37-------------
38
39range:
40 0: 0 lux to 1000 lux (default)
41 1: 0 lux to 4000 lux
42 2: 0 lux to 16,000 lux
43 3: 0 lux to 64,000 lux
44
45resolution:
46 0: 2^16 cycles (default)
47 1: 2^12 cycles
48 2: 2^8 cycles
49 3: 2^4 cycles
50
51mode:
52 0: diode1's current (unsigned 16bit) (default)
53 1: diode1's current (unsigned 16bit)
54 2: difference between diodes (l1 - l2, signed 15bit)
55
56power_state:
57 0: device is disabled (default)
58 1: device is enabled
59
60lux (read only):
61 returns the value from the last sensor reading
62
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 5ede7473b425..08762750f121 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -1242,7 +1242,7 @@ monitoring is enabled, and vice-versa.
1242To add ARP targets: 1242To add ARP targets:
1243# echo +192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target 1243# echo +192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target
1244# echo +192.168.0.101 > /sys/class/net/bond0/bonding/arp_ip_target 1244# echo +192.168.0.101 > /sys/class/net/bond0/bonding/arp_ip_target
1245 NOTE: up to 10 target addresses may be specified. 1245 NOTE: up to 16 target addresses may be specified.
1246 1246
1247To remove an ARP target: 1247To remove an ARP target:
1248# echo -192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target 1248# echo -192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target
diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt
index 7a3bb1abb830..b132e4a3cf0f 100644
--- a/Documentation/networking/dccp.txt
+++ b/Documentation/networking/dccp.txt
@@ -141,7 +141,8 @@ rx_ccid = 2
141 Default CCID for the receiver-sender half-connection; see tx_ccid. 141 Default CCID for the receiver-sender half-connection; see tx_ccid.
142 142
143seq_window = 100 143seq_window = 100
144 The initial sequence window (sec. 7.5.2). 144 The initial sequence window (sec. 7.5.2) of the sender. This influences
145 the local ackno validity and the remote seqno validity windows (7.5.1).
145 146
146tx_qlen = 5 147tx_qlen = 5
147 The size of the transmit buffer in packets. A value of 0 corresponds 148 The size of the transmit buffer in packets. A value of 0 corresponds
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index c7712787933c..ec5de02f543f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -2,7 +2,7 @@
2 2
3ip_forward - BOOLEAN 3ip_forward - BOOLEAN
4 0 - disabled (default) 4 0 - disabled (default)
5 not 0 - enabled 5 not 0 - enabled
6 6
7 Forward Packets between interfaces. 7 Forward Packets between interfaces.
8 8
@@ -36,49 +36,49 @@ rt_cache_rebuild_count - INTEGER
36IP Fragmentation: 36IP Fragmentation:
37 37
38ipfrag_high_thresh - INTEGER 38ipfrag_high_thresh - INTEGER
39 Maximum memory used to reassemble IP fragments. When 39 Maximum memory used to reassemble IP fragments. When
40 ipfrag_high_thresh bytes of memory is allocated for this purpose, 40 ipfrag_high_thresh bytes of memory is allocated for this purpose,
41 the fragment handler will toss packets until ipfrag_low_thresh 41 the fragment handler will toss packets until ipfrag_low_thresh
42 is reached. 42 is reached.
43 43
44ipfrag_low_thresh - INTEGER 44ipfrag_low_thresh - INTEGER
45 See ipfrag_high_thresh 45 See ipfrag_high_thresh
46 46
47ipfrag_time - INTEGER 47ipfrag_time - INTEGER
48 Time in seconds to keep an IP fragment in memory. 48 Time in seconds to keep an IP fragment in memory.
49 49
50ipfrag_secret_interval - INTEGER 50ipfrag_secret_interval - INTEGER
51 Regeneration interval (in seconds) of the hash secret (or lifetime 51 Regeneration interval (in seconds) of the hash secret (or lifetime
52 for the hash secret) for IP fragments. 52 for the hash secret) for IP fragments.
53 Default: 600 53 Default: 600
54 54
55ipfrag_max_dist - INTEGER 55ipfrag_max_dist - INTEGER
56 ipfrag_max_dist is a non-negative integer value which defines the 56 ipfrag_max_dist is a non-negative integer value which defines the
57 maximum "disorder" which is allowed among fragments which share a 57 maximum "disorder" which is allowed among fragments which share a
58 common IP source address. Note that reordering of packets is 58 common IP source address. Note that reordering of packets is
59 not unusual, but if a large number of fragments arrive from a source 59 not unusual, but if a large number of fragments arrive from a source
60 IP address while a particular fragment queue remains incomplete, it 60 IP address while a particular fragment queue remains incomplete, it
61 probably indicates that one or more fragments belonging to that queue 61 probably indicates that one or more fragments belonging to that queue
62 have been lost. When ipfrag_max_dist is positive, an additional check 62 have been lost. When ipfrag_max_dist is positive, an additional check
63 is done on fragments before they are added to a reassembly queue - if 63 is done on fragments before they are added to a reassembly queue - if
64 ipfrag_max_dist (or more) fragments have arrived from a particular IP 64 ipfrag_max_dist (or more) fragments have arrived from a particular IP
65 address between additions to any IP fragment queue using that source 65 address between additions to any IP fragment queue using that source
66 address, it's presumed that one or more fragments in the queue are 66 address, it's presumed that one or more fragments in the queue are
67 lost. The existing fragment queue will be dropped, and a new one 67 lost. The existing fragment queue will be dropped, and a new one
68 started. An ipfrag_max_dist value of zero disables this check. 68 started. An ipfrag_max_dist value of zero disables this check.
69 69
70 Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can 70 Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can
71 result in unnecessarily dropping fragment queues when normal 71 result in unnecessarily dropping fragment queues when normal
72 reordering of packets occurs, which could lead to poor application 72 reordering of packets occurs, which could lead to poor application
73 performance. Using a very large value, e.g. 50000, increases the 73 performance. Using a very large value, e.g. 50000, increases the
74 likelihood of incorrectly reassembling IP fragments that originate 74 likelihood of incorrectly reassembling IP fragments that originate
75 from different IP datagrams, which could result in data corruption. 75 from different IP datagrams, which could result in data corruption.
76 Default: 64 76 Default: 64
77 77
78INET peer storage: 78INET peer storage:
79 79
80inet_peer_threshold - INTEGER 80inet_peer_threshold - INTEGER
81 The approximate size of the storage. Starting from this threshold 81 The approximate size of the storage. Starting from this threshold
82 entries will be thrown aggressively. This threshold also determines 82 entries will be thrown aggressively. This threshold also determines
83 entries' time-to-live and time intervals between garbage collection 83 entries' time-to-live and time intervals between garbage collection
84 passes. More entries, less time-to-live, less GC interval. 84 passes. More entries, less time-to-live, less GC interval.
@@ -105,7 +105,7 @@ inet_peer_gc_maxtime - INTEGER
105 in effect under low (or absent) memory pressure on the pool. 105 in effect under low (or absent) memory pressure on the pool.
106 Measured in seconds. 106 Measured in seconds.
107 107
108TCP variables: 108TCP variables:
109 109
110somaxconn - INTEGER 110somaxconn - INTEGER
111 Limit of socket listen() backlog, known in userspace as SOMAXCONN. 111 Limit of socket listen() backlog, known in userspace as SOMAXCONN.
@@ -310,7 +310,7 @@ tcp_orphan_retries - INTEGER
310 310
311tcp_reordering - INTEGER 311tcp_reordering - INTEGER
312 Maximal reordering of packets in a TCP stream. 312 Maximal reordering of packets in a TCP stream.
313 Default: 3 313 Default: 3
314 314
315tcp_retrans_collapse - BOOLEAN 315tcp_retrans_collapse - BOOLEAN
316 Bug-to-bug compatibility with some broken printers. 316 Bug-to-bug compatibility with some broken printers.
@@ -521,7 +521,7 @@ IP Variables:
521 521
522ip_local_port_range - 2 INTEGERS 522ip_local_port_range - 2 INTEGERS
523 Defines the local port range that is used by TCP and UDP to 523 Defines the local port range that is used by TCP and UDP to
524 choose the local port. The first number is the first, the 524 choose the local port. The first number is the first, the
525 second the last local port number. Default value depends on 525 second the last local port number. Default value depends on
526 amount of memory available on the system: 526 amount of memory available on the system:
527 > 128Mb 32768-61000 527 > 128Mb 32768-61000
@@ -594,12 +594,12 @@ icmp_errors_use_inbound_ifaddr - BOOLEAN
594 594
595 If zero, icmp error messages are sent with the primary address of 595 If zero, icmp error messages are sent with the primary address of
596 the exiting interface. 596 the exiting interface.
597 597
598 If non-zero, the message will be sent with the primary address of 598 If non-zero, the message will be sent with the primary address of
599 the interface that received the packet that caused the icmp error. 599 the interface that received the packet that caused the icmp error.
600 This is the behaviour network many administrators will expect from 600 This is the behaviour network many administrators will expect from
601 a router. And it can make debugging complicated network layouts 601 a router. And it can make debugging complicated network layouts
602 much easier. 602 much easier.
603 603
604 Note that if no primary address exists for the interface selected, 604 Note that if no primary address exists for the interface selected,
605 then the primary address of the first non-loopback interface that 605 then the primary address of the first non-loopback interface that
@@ -611,7 +611,7 @@ igmp_max_memberships - INTEGER
611 Change the maximum number of multicast groups we can subscribe to. 611 Change the maximum number of multicast groups we can subscribe to.
612 Default: 20 612 Default: 20
613 613
614conf/interface/* changes special settings per interface (where "interface" is 614conf/interface/* changes special settings per interface (where "interface" is
615 the name of your network interface) 615 the name of your network interface)
616conf/all/* is special, changes the settings for all interfaces 616conf/all/* is special, changes the settings for all interfaces
617 617
@@ -625,11 +625,11 @@ log_martians - BOOLEAN
625accept_redirects - BOOLEAN 625accept_redirects - BOOLEAN
626 Accept ICMP redirect messages. 626 Accept ICMP redirect messages.
627 accept_redirects for the interface will be enabled if: 627 accept_redirects for the interface will be enabled if:
628 - both conf/{all,interface}/accept_redirects are TRUE in the case forwarding 628 - both conf/{all,interface}/accept_redirects are TRUE in the case
629 for the interface is enabled 629 forwarding for the interface is enabled
630 or 630 or
631 - at least one of conf/{all,interface}/accept_redirects is TRUE in the case 631 - at least one of conf/{all,interface}/accept_redirects is TRUE in the
632 forwarding for the interface is disabled 632 case forwarding for the interface is disabled
633 accept_redirects for the interface will be disabled otherwise 633 accept_redirects for the interface will be disabled otherwise
634 default TRUE (host) 634 default TRUE (host)
635 FALSE (router) 635 FALSE (router)
@@ -640,8 +640,8 @@ forwarding - BOOLEAN
640mc_forwarding - BOOLEAN 640mc_forwarding - BOOLEAN
641 Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE 641 Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE
642 and a multicast routing daemon is required. 642 and a multicast routing daemon is required.
643 conf/all/mc_forwarding must also be set to TRUE to enable multicast routing 643 conf/all/mc_forwarding must also be set to TRUE to enable multicast
644 for the interface 644 routing for the interface
645 645
646medium_id - INTEGER 646medium_id - INTEGER
647 Integer value used to differentiate the devices by the medium they 647 Integer value used to differentiate the devices by the medium they
@@ -649,7 +649,7 @@ medium_id - INTEGER
649 the broadcast packets are received only on one of them. 649 the broadcast packets are received only on one of them.
650 The default value 0 means that the device is the only interface 650 The default value 0 means that the device is the only interface
651 to its medium, value of -1 means that medium is not known. 651 to its medium, value of -1 means that medium is not known.
652 652
653 Currently, it is used to change the proxy_arp behavior: 653 Currently, it is used to change the proxy_arp behavior:
654 the proxy_arp feature is enabled for packets forwarded between 654 the proxy_arp feature is enabled for packets forwarded between
655 two devices attached to different media. 655 two devices attached to different media.
@@ -699,16 +699,22 @@ accept_source_route - BOOLEAN
699 default TRUE (router) 699 default TRUE (router)
700 FALSE (host) 700 FALSE (host)
701 701
702rp_filter - BOOLEAN 702rp_filter - INTEGER
703 1 - do source validation by reversed path, as specified in RFC1812
704 Recommended option for single homed hosts and stub network
705 routers. Could cause troubles for complicated (not loop free)
706 networks running a slow unreliable protocol (sort of RIP),
707 or using static routes.
708
709 0 - No source validation. 703 0 - No source validation.
710 704 1 - Strict mode as defined in RFC3704 Strict Reverse Path
711 conf/all/rp_filter must also be set to TRUE to do source validation 705 Each incoming packet is tested against the FIB and if the interface
706 is not the best reverse path the packet check will fail.
707 By default failed packets are discarded.
708 2 - Loose mode as defined in RFC3704 Loose Reverse Path
709 Each incoming packet's source address is also tested against the FIB
710 and if the source address is not reachable via any interface
711 the packet check will fail.
712
713 Current recommended practice in RFC3704 is to enable strict mode
714 to prevent IP spoofing from DDos attacks. If using asymmetric routing
715 or other complicated routing, then loose mode is recommended.
716
717 conf/all/rp_filter must also be set to non-zero to do source validation
712 on the interface 718 on the interface
713 719
714 Default value is 0. Note that some distributions enable it 720 Default value is 0. Note that some distributions enable it
@@ -782,6 +788,12 @@ arp_ignore - INTEGER
782 The max value from conf/{all,interface}/arp_ignore is used 788 The max value from conf/{all,interface}/arp_ignore is used
783 when ARP request is received on the {interface} 789 when ARP request is received on the {interface}
784 790
791arp_notify - BOOLEAN
792 Define mode for notification of address and device changes.
793 0 - (default): do nothing
794 1 - Generate gratuitous arp replies when device is brought up
795 or hardware address changes.
796
785arp_accept - BOOLEAN 797arp_accept - BOOLEAN
786 Define behavior when gratuitous arp replies are received: 798 Define behavior when gratuitous arp replies are received:
787 0 - drop gratuitous arp frames 799 0 - drop gratuitous arp frames
@@ -823,7 +835,7 @@ apply to IPv6 [XXX?].
823 835
824bindv6only - BOOLEAN 836bindv6only - BOOLEAN
825 Default value for IPV6_V6ONLY socket option, 837 Default value for IPV6_V6ONLY socket option,
826 which restricts use of the IPv6 socket to IPv6 communication 838 which restricts use of the IPv6 socket to IPv6 communication
827 only. 839 only.
828 TRUE: disable IPv4-mapped address feature 840 TRUE: disable IPv4-mapped address feature
829 FALSE: enable IPv4-mapped address feature 841 FALSE: enable IPv4-mapped address feature
@@ -833,19 +845,19 @@ bindv6only - BOOLEAN
833IPv6 Fragmentation: 845IPv6 Fragmentation:
834 846
835ip6frag_high_thresh - INTEGER 847ip6frag_high_thresh - INTEGER
836 Maximum memory used to reassemble IPv6 fragments. When 848 Maximum memory used to reassemble IPv6 fragments. When
837 ip6frag_high_thresh bytes of memory is allocated for this purpose, 849 ip6frag_high_thresh bytes of memory is allocated for this purpose,
838 the fragment handler will toss packets until ip6frag_low_thresh 850 the fragment handler will toss packets until ip6frag_low_thresh
839 is reached. 851 is reached.
840 852
841ip6frag_low_thresh - INTEGER 853ip6frag_low_thresh - INTEGER
842 See ip6frag_high_thresh 854 See ip6frag_high_thresh
843 855
844ip6frag_time - INTEGER 856ip6frag_time - INTEGER
845 Time in seconds to keep an IPv6 fragment in memory. 857 Time in seconds to keep an IPv6 fragment in memory.
846 858
847ip6frag_secret_interval - INTEGER 859ip6frag_secret_interval - INTEGER
848 Regeneration interval (in seconds) of the hash secret (or lifetime 860 Regeneration interval (in seconds) of the hash secret (or lifetime
849 for the hash secret) for IPv6 fragments. 861 for the hash secret) for IPv6 fragments.
850 Default: 600 862 Default: 600
851 863
@@ -854,17 +866,17 @@ conf/default/*:
854 866
855 867
856conf/all/*: 868conf/all/*:
857 Change all the interface-specific settings. 869 Change all the interface-specific settings.
858 870
859 [XXX: Other special features than forwarding?] 871 [XXX: Other special features than forwarding?]
860 872
861conf/all/forwarding - BOOLEAN 873conf/all/forwarding - BOOLEAN
862 Enable global IPv6 forwarding between all interfaces. 874 Enable global IPv6 forwarding between all interfaces.
863 875
864 IPv4 and IPv6 work differently here; e.g. netfilter must be used 876 IPv4 and IPv6 work differently here; e.g. netfilter must be used
865 to control which interfaces may forward packets and which not. 877 to control which interfaces may forward packets and which not.
866 878
867 This also sets all interfaces' Host/Router setting 879 This also sets all interfaces' Host/Router setting
868 'forwarding' to the specified value. See below for details. 880 'forwarding' to the specified value. See below for details.
869 881
870 This referred to as global forwarding. 882 This referred to as global forwarding.
@@ -875,12 +887,12 @@ proxy_ndp - BOOLEAN
875conf/interface/*: 887conf/interface/*:
876 Change special settings per interface. 888 Change special settings per interface.
877 889
878 The functional behaviour for certain settings is different 890 The functional behaviour for certain settings is different
879 depending on whether local forwarding is enabled or not. 891 depending on whether local forwarding is enabled or not.
880 892
881accept_ra - BOOLEAN 893accept_ra - BOOLEAN
882 Accept Router Advertisements; autoconfigure using them. 894 Accept Router Advertisements; autoconfigure using them.
883 895
884 Functional default: enabled if local forwarding is disabled. 896 Functional default: enabled if local forwarding is disabled.
885 disabled if local forwarding is enabled. 897 disabled if local forwarding is enabled.
886 898
@@ -926,7 +938,7 @@ accept_source_route - INTEGER
926 Default: 0 938 Default: 0
927 939
928autoconf - BOOLEAN 940autoconf - BOOLEAN
929 Autoconfigure addresses using Prefix Information in Router 941 Autoconfigure addresses using Prefix Information in Router
930 Advertisements. 942 Advertisements.
931 943
932 Functional default: enabled if accept_ra_pinfo is enabled. 944 Functional default: enabled if accept_ra_pinfo is enabled.
@@ -935,11 +947,11 @@ autoconf - BOOLEAN
935dad_transmits - INTEGER 947dad_transmits - INTEGER
936 The amount of Duplicate Address Detection probes to send. 948 The amount of Duplicate Address Detection probes to send.
937 Default: 1 949 Default: 1
938 950
939forwarding - BOOLEAN 951forwarding - BOOLEAN
940 Configure interface-specific Host/Router behaviour. 952 Configure interface-specific Host/Router behaviour.
941 953
942 Note: It is recommended to have the same setting on all 954 Note: It is recommended to have the same setting on all
943 interfaces; mixed router/host scenarios are rather uncommon. 955 interfaces; mixed router/host scenarios are rather uncommon.
944 956
945 FALSE: 957 FALSE:
@@ -948,13 +960,13 @@ forwarding - BOOLEAN
948 960
949 1. IsRouter flag is not set in Neighbour Advertisements. 961 1. IsRouter flag is not set in Neighbour Advertisements.
950 2. Router Solicitations are being sent when necessary. 962 2. Router Solicitations are being sent when necessary.
951 3. If accept_ra is TRUE (default), accept Router 963 3. If accept_ra is TRUE (default), accept Router
952 Advertisements (and do autoconfiguration). 964 Advertisements (and do autoconfiguration).
953 4. If accept_redirects is TRUE (default), accept Redirects. 965 4. If accept_redirects is TRUE (default), accept Redirects.
954 966
955 TRUE: 967 TRUE:
956 968
957 If local forwarding is enabled, Router behaviour is assumed. 969 If local forwarding is enabled, Router behaviour is assumed.
958 This means exactly the reverse from the above: 970 This means exactly the reverse from the above:
959 971
960 1. IsRouter flag is set in Neighbour Advertisements. 972 1. IsRouter flag is set in Neighbour Advertisements.
@@ -989,7 +1001,7 @@ router_solicitation_interval - INTEGER
989 Default: 4 1001 Default: 4
990 1002
991router_solicitations - INTEGER 1003router_solicitations - INTEGER
992 Number of Router Solicitations to send until assuming no 1004 Number of Router Solicitations to send until assuming no
993 routers are present. 1005 routers are present.
994 Default: 3 1006 Default: 3
995 1007
@@ -1013,11 +1025,11 @@ temp_prefered_lft - INTEGER
1013 1025
1014max_desync_factor - INTEGER 1026max_desync_factor - INTEGER
1015 Maximum value for DESYNC_FACTOR, which is a random value 1027 Maximum value for DESYNC_FACTOR, which is a random value
1016 that ensures that clients don't synchronize with each 1028 that ensures that clients don't synchronize with each
1017 other and generate new addresses at exactly the same time. 1029 other and generate new addresses at exactly the same time.
1018 value is in seconds. 1030 value is in seconds.
1019 Default: 600 1031 Default: 600
1020 1032
1021regen_max_retry - INTEGER 1033regen_max_retry - INTEGER
1022 Number of attempts before give up attempting to generate 1034 Number of attempts before give up attempting to generate
1023 valid temporary addresses. 1035 valid temporary addresses.
@@ -1025,13 +1037,15 @@ regen_max_retry - INTEGER
1025 1037
1026max_addresses - INTEGER 1038max_addresses - INTEGER
1027 Number of maximum addresses per interface. 0 disables limitation. 1039 Number of maximum addresses per interface. 0 disables limitation.
1028 It is recommended not set too large value (or 0) because it would 1040 It is recommended not set too large value (or 0) because it would
1029 be too easy way to crash kernel to allow to create too much of 1041 be too easy way to crash kernel to allow to create too much of
1030 autoconfigured addresses. 1042 autoconfigured addresses.
1031 Default: 16 1043 Default: 16
1032 1044
1033disable_ipv6 - BOOLEAN 1045disable_ipv6 - BOOLEAN
1034 Disable IPv6 operation. 1046 Disable IPv6 operation. If accept_dad is set to 2, this value
1047 will be dynamically set to TRUE if DAD fails for the link-local
1048 address.
1035 Default: FALSE (enable IPv6 operation) 1049 Default: FALSE (enable IPv6 operation)
1036 1050
1037accept_dad - INTEGER 1051accept_dad - INTEGER
diff --git a/Documentation/networking/ipv6.txt b/Documentation/networking/ipv6.txt
new file mode 100644
index 000000000000..268e5c103dd8
--- /dev/null
+++ b/Documentation/networking/ipv6.txt
@@ -0,0 +1,35 @@
1
2Options for the ipv6 module are supplied as parameters at load time.
3
4Module options may be given as command line arguments to the insmod
5or modprobe command, but are usually specified in either the
6/etc/modules.conf or /etc/modprobe.conf configuration file, or in a
7distro-specific configuration file.
8
9The available ipv6 module parameters are listed below. If a parameter
10is not specified the default value is used.
11
12The parameters are as follows:
13
14disable
15
16 Specifies whether to load the IPv6 module, but disable all
17 its functionality. This might be used when another module
18 has a dependency on the IPv6 module being loaded, but no
19 IPv6 addresses or operations are desired.
20
21 The possible values and their effects are:
22
23 0
24 IPv6 is enabled.
25
26 This is the default value.
27
28 1
29 IPv6 is disabled.
30
31 No IPv6 addresses will be added to interfaces, and
32 it will not be possible to open an IPv6 socket.
33
34 A reboot is required to enable IPv6.
35
diff --git a/Documentation/networking/ixgbe.txt b/Documentation/networking/ixgbe.txt
new file mode 100644
index 000000000000..eeb68685c788
--- /dev/null
+++ b/Documentation/networking/ixgbe.txt
@@ -0,0 +1,199 @@
1Linux Base Driver for 10 Gigabit PCI Express Intel(R) Network Connection
2========================================================================
3
4March 10, 2009
5
6
7Contents
8========
9
10- In This Release
11- Identifying Your Adapter
12- Building and Installation
13- Additional Configurations
14- Support
15
16
17
18In This Release
19===============
20
21This file describes the ixgbe Linux Base Driver for the 10 Gigabit PCI
22Express Intel(R) Network Connection. This driver includes support for
23Itanium(R)2-based systems.
24
25For questions related to hardware requirements, refer to the documentation
26supplied with your 10 Gigabit adapter. All hardware requirements listed apply
27to use with Linux.
28
29The following features are available in this kernel:
30 - Native VLANs
31 - Channel Bonding (teaming)
32 - SNMP
33 - Generic Receive Offload
34 - Data Center Bridging
35
36Channel Bonding documentation can be found in the Linux kernel source:
37/Documentation/networking/bonding.txt
38
39Ethtool, lspci, and ifconfig can be used to display device and driver
40specific information.
41
42
43Identifying Your Adapter
44========================
45
46This driver supports devices based on the 82598 controller and the 82599
47controller.
48
49For specific information on identifying which adapter you have, please visit:
50
51 http://support.intel.com/support/network/sb/CS-008441.htm
52
53
54Building and Installation
55=========================
56
57select m for "Intel(R) 10GbE PCI Express adapters support" located at:
58 Location:
59 -> Device Drivers
60 -> Network device support (NETDEVICES [=y])
61 -> Ethernet (10000 Mbit) (NETDEV_10000 [=y])
62
631. make modules & make modules_install
64
652. Load the module:
66
67# modprobe ixgbe
68
69 The insmod command can be used if the full
70 path to the driver module is specified. For example:
71
72 insmod /lib/modules/<KERNEL VERSION>/kernel/drivers/net/ixgbe/ixgbe.ko
73
74 With 2.6 based kernels also make sure that older ixgbe drivers are
75 removed from the kernel, before loading the new module:
76
77 rmmod ixgbe; modprobe ixgbe
78
793. Assign an IP address to the interface by entering the following, where
80 x is the interface number:
81
82 ifconfig ethx <IP_address>
83
844. Verify that the interface works. Enter the following, where <IP_address>
85 is the IP address for another machine on the same subnet as the interface
86 that is being tested:
87
88 ping <IP_address>
89
90
91Additional Configurations
92=========================
93
94 Viewing Link Messages
95 ---------------------
96 Link messages will not be displayed to the console if the distribution is
97 restricting system messages. In order to see network driver link messages on
98 your console, set dmesg to eight by entering the following:
99
100 dmesg -n 8
101
102 NOTE: This setting is not saved across reboots.
103
104
105 Jumbo Frames
106 ------------
107 The driver supports Jumbo Frames for all adapters. Jumbo Frames support is
108 enabled by changing the MTU to a value larger than the default of 1500.
109 The maximum value for the MTU is 16110. Use the ifconfig command to
110 increase the MTU size. For example:
111
112 ifconfig ethx mtu 9000 up
113
114 The maximum MTU setting for Jumbo Frames is 16110. This value coincides
115 with the maximum Jumbo Frames size of 16128.
116
117 Generic Receive Offload, aka GRO
118 --------------------------------
119 The driver supports the in-kernel software implementation of GRO. GRO has
120 shown that by coalescing Rx traffic into larger chunks of data, CPU
121 utilization can be significantly reduced when under large Rx load. GRO is an
122 evolution of the previously-used LRO interface. GRO is able to coalesce
123 other protocols besides TCP. It's also safe to use with configurations that
124 are problematic for LRO, namely bridging and iSCSI.
125
126 GRO is enabled by default in the driver. Future versions of ethtool will
127 support disabling and re-enabling GRO on the fly.
128
129
130 Data Center Bridging, aka DCB
131 -----------------------------
132
133 DCB is a configuration Quality of Service implementation in hardware.
134 It uses the VLAN priority tag (802.1p) to filter traffic. That means
135 that there are 8 different priorities that traffic can be filtered into.
136 It also enables priority flow control which can limit or eliminate the
137 number of dropped packets during network stress. Bandwidth can be
138 allocated to each of these priorities, which is enforced at the hardware
139 level.
140
141 To enable DCB support in ixgbe, you must enable the DCB netlink layer to
142 allow the userspace tools (see below) to communicate with the driver.
143 This can be found in the kernel configuration here:
144
145 -> Networking support
146 -> Networking options
147 -> Data Center Bridging support
148
149 Once this is selected, DCB support must be selected for ixgbe. This can
150 be found here:
151
152 -> Device Drivers
153 -> Network device support (NETDEVICES [=y])
154 -> Ethernet (10000 Mbit) (NETDEV_10000 [=y])
155 -> Intel(R) 10GbE PCI Express adapters support
156 -> Data Center Bridging (DCB) Support
157
158 After these options are selected, you must rebuild your kernel and your
159 modules.
160
161 In order to use DCB, userspace tools must be downloaded and installed.
162 The dcbd tools can be found at:
163
164 http://e1000.sf.net
165
166
167 Ethtool
168 -------
169 The driver utilizes the ethtool interface for driver configuration and
170 diagnostics, as well as displaying statistical information. Ethtool
171 version 3.0 or later is required for this functionality.
172
173 The latest release of ethtool can be found from
174 http://sourceforge.net/projects/gkernel.
175
176
177 NAPI
178 ----
179
180 NAPI (Rx polling mode) is supported in the ixgbe driver. NAPI is enabled
181 by default in the driver.
182
183 See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI.
184
185
186Support
187=======
188
189For general information, go to the Intel support website at:
190
191 http://support.intel.com
192
193or the Intel Wired Networking project hosted by Sourceforge at:
194
195 http://e1000.sourceforge.net
196
197If an issue is identified with the released source code on the supported
198kernel with a supported adapter, email the specific information related
199to the issue to e1000-devel@lists.sf.net
diff --git a/Documentation/networking/rds.txt b/Documentation/networking/rds.txt
new file mode 100644
index 000000000000..c67077cbeb80
--- /dev/null
+++ b/Documentation/networking/rds.txt
@@ -0,0 +1,356 @@
1
2Overview
3========
4
5This readme tries to provide some background on the hows and whys of RDS,
6and will hopefully help you find your way around the code.
7
8In addition, please see this email about RDS origins:
9http://oss.oracle.com/pipermail/rds-devel/2007-November/000228.html
10
11RDS Architecture
12================
13
14RDS provides reliable, ordered datagram delivery by using a single
15reliable connection between any two nodes in the cluster. This allows
16applications to use a single socket to talk to any other process in the
17cluster - so in a cluster with N processes you need N sockets, in contrast
18to N*N if you use a connection-oriented socket transport like TCP.
19
20RDS is not Infiniband-specific; it was designed to support different
21transports. The current implementation used to support RDS over TCP as well
22as IB. Work is in progress to support RDS over iWARP, and using DCE to
23guarantee no dropped packets on Ethernet, it may be possible to use RDS over
24UDP in the future.
25
26The high-level semantics of RDS from the application's point of view are
27
28 * Addressing
29 RDS uses IPv4 addresses and 16bit port numbers to identify
30 the end point of a connection. All socket operations that involve
31 passing addresses between kernel and user space generally
32 use a struct sockaddr_in.
33
34 The fact that IPv4 addresses are used does not mean the underlying
35 transport has to be IP-based. In fact, RDS over IB uses a
36 reliable IB connection; the IP address is used exclusively to
37 locate the remote node's GID (by ARPing for the given IP).
38
39 The port space is entirely independent of UDP, TCP or any other
40 protocol.
41
42 * Socket interface
43 RDS sockets work *mostly* as you would expect from a BSD
44 socket. The next section will cover the details. At any rate,
45 all I/O is performed through the standard BSD socket API.
46 Some additions like zerocopy support are implemented through
47 control messages, while other extensions use the getsockopt/
48 setsockopt calls.
49
50 Sockets must be bound before you can send or receive data.
51 This is needed because binding also selects a transport and
52 attaches it to the socket. Once bound, the transport assignment
53 does not change. RDS will tolerate IPs moving around (eg in
54 a active-active HA scenario), but only as long as the address
55 doesn't move to a different transport.
56
57 * sysctls
58 RDS supports a number of sysctls in /proc/sys/net/rds
59
60
61Socket Interface
62================
63
64 AF_RDS, PF_RDS, SOL_RDS
65 These constants haven't been assigned yet, because RDS isn't in
66 mainline yet. Currently, the kernel module assigns some constant
67 and publishes it to user space through two sysctl files
68 /proc/sys/net/rds/pf_rds
69 /proc/sys/net/rds/sol_rds
70
71 fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
72 This creates a new, unbound RDS socket.
73
74 setsockopt(SOL_SOCKET): send and receive buffer size
75 RDS honors the send and receive buffer size socket options.
76 You are not allowed to queue more than SO_SNDSIZE bytes to
77 a socket. A message is queued when sendmsg is called, and
78 it leaves the queue when the remote system acknowledges
79 its arrival.
80
81 The SO_RCVSIZE option controls the maximum receive queue length.
82 This is a soft limit rather than a hard limit - RDS will
83 continue to accept and queue incoming messages, even if that
84 takes the queue length over the limit. However, it will also
85 mark the port as "congested" and send a congestion update to
86 the source node. The source node is supposed to throttle any
87 processes sending to this congested port.
88
89 bind(fd, &sockaddr_in, ...)
90 This binds the socket to a local IP address and port, and a
91 transport.
92
93 sendmsg(fd, ...)
94 Sends a message to the indicated recipient. The kernel will
95 transparently establish the underlying reliable connection
96 if it isn't up yet.
97
98 An attempt to send a message that exceeds SO_SNDSIZE will
99 return with -EMSGSIZE
100
101 An attempt to send a message that would take the total number
102 of queued bytes over the SO_SNDSIZE threshold will return
103 EAGAIN.
104
105 An attempt to send a message to a destination that is marked
106 as "congested" will return ENOBUFS.
107
108 recvmsg(fd, ...)
109 Receives a message that was queued to this socket. The sockets
110 recv queue accounting is adjusted, and if the queue length
111 drops below SO_SNDSIZE, the port is marked uncongested, and
112 a congestion update is sent to all peers.
113
114 Applications can ask the RDS kernel module to receive
115 notifications via control messages (for instance, there is a
116 notification when a congestion update arrived, or when a RDMA
117 operation completes). These notifications are received through
118 the msg.msg_control buffer of struct msghdr. The format of the
119 messages is described in manpages.
120
121 poll(fd)
122 RDS supports the poll interface to allow the application
123 to implement async I/O.
124
125 POLLIN handling is pretty straightforward. When there's an
126 incoming message queued to the socket, or a pending notification,
127 we signal POLLIN.
128
129 POLLOUT is a little harder. Since you can essentially send
130 to any destination, RDS will always signal POLLOUT as long as
131 there's room on the send queue (ie the number of bytes queued
132 is less than the sendbuf size).
133
134 However, the kernel will refuse to accept messages to
135 a destination marked congested - in this case you will loop
136 forever if you rely on poll to tell you what to do.
137 This isn't a trivial problem, but applications can deal with
138 this - by using congestion notifications, and by checking for
139 ENOBUFS errors returned by sendmsg.
140
141 setsockopt(SOL_RDS, RDS_CANCEL_SENT_TO, &sockaddr_in)
142 This allows the application to discard all messages queued to a
143 specific destination on this particular socket.
144
145 This allows the application to cancel outstanding messages if
146 it detects a timeout. For instance, if it tried to send a message,
147 and the remote host is unreachable, RDS will keep trying forever.
148 The application may decide it's not worth it, and cancel the
149 operation. In this case, it would use RDS_CANCEL_SENT_TO to
150 nuke any pending messages.
151
152
153RDMA for RDS
154============
155
156 see rds-rdma(7) manpage (available in rds-tools)
157
158
159Congestion Notifications
160========================
161
162 see rds(7) manpage
163
164
165RDS Protocol
166============
167
168 Message header
169
170 The message header is a 'struct rds_header' (see rds.h):
171 Fields:
172 h_sequence:
173 per-packet sequence number
174 h_ack:
175 piggybacked acknowledgment of last packet received
176 h_len:
177 length of data, not including header
178 h_sport:
179 source port
180 h_dport:
181 destination port
182 h_flags:
183 CONG_BITMAP - this is a congestion update bitmap
184 ACK_REQUIRED - receiver must ack this packet
185 RETRANSMITTED - packet has previously been sent
186 h_credit:
187 indicate to other end of connection that
188 it has more credits available (i.e. there is
189 more send room)
190 h_padding[4]:
191 unused, for future use
192 h_csum:
193 header checksum
194 h_exthdr:
195 optional data can be passed here. This is currently used for
196 passing RDMA-related information.
197
198 ACK and retransmit handling
199
200 One might think that with reliable IB connections you wouldn't need
201 to ack messages that have been received. The problem is that IB
202 hardware generates an ack message before it has DMAed the message
203 into memory. This creates a potential message loss if the HCA is
204 disabled for any reason between when it sends the ack and before
205 the message is DMAed and processed. This is only a potential issue
206 if another HCA is available for fail-over.
207
208 Sending an ack immediately would allow the sender to free the sent
209 message from their send queue quickly, but could cause excessive
210 traffic to be used for acks. RDS piggybacks acks on sent data
211 packets. Ack-only packets are reduced by only allowing one to be
212 in flight at a time, and by the sender only asking for acks when
213 its send buffers start to fill up. All retransmissions are also
214 acked.
215
216 Flow Control
217
218 RDS's IB transport uses a credit-based mechanism to verify that
219 there is space in the peer's receive buffers for more data. This
220 eliminates the need for hardware retries on the connection.
221
222 Congestion
223
224 Messages waiting in the receive queue on the receiving socket
225 are accounted against the sockets SO_RCVBUF option value. Only
226 the payload bytes in the message are accounted for. If the
227 number of bytes queued equals or exceeds rcvbuf then the socket
228 is congested. All sends attempted to this socket's address
229 should return block or return -EWOULDBLOCK.
230
231 Applications are expected to be reasonably tuned such that this
232 situation very rarely occurs. An application encountering this
233 "back-pressure" is considered a bug.
234
235 This is implemented by having each node maintain bitmaps which
236 indicate which ports on bound addresses are congested. As the
237 bitmap changes it is sent through all the connections which
238 terminate in the local address of the bitmap which changed.
239
240 The bitmaps are allocated as connections are brought up. This
241 avoids allocation in the interrupt handling path which queues
242 sages on sockets. The dense bitmaps let transports send the
243 entire bitmap on any bitmap change reasonably efficiently. This
244 is much easier to implement than some finer-grained
245 communication of per-port congestion. The sender does a very
246 inexpensive bit test to test if the port it's about to send to
247 is congested or not.
248
249
250RDS Transport Layer
251==================
252
253 As mentioned above, RDS is not IB-specific. Its code is divided
254 into a general RDS layer and a transport layer.
255
256 The general layer handles the socket API, congestion handling,
257 loopback, stats, usermem pinning, and the connection state machine.
258
259 The transport layer handles the details of the transport. The IB
260 transport, for example, handles all the queue pairs, work requests,
261 CM event handlers, and other Infiniband details.
262
263
264RDS Kernel Structures
265=====================
266
267 struct rds_message
268 aka possibly "rds_outgoing", the generic RDS layer copies data to
269 be sent and sets header fields as needed, based on the socket API.
270 This is then queued for the individual connection and sent by the
271 connection's transport.
272 struct rds_incoming
273 a generic struct referring to incoming data that can be handed from
274 the transport to the general code and queued by the general code
275 while the socket is awoken. It is then passed back to the transport
276 code to handle the actual copy-to-user.
277 struct rds_socket
278 per-socket information
279 struct rds_connection
280 per-connection information
281 struct rds_transport
282 pointers to transport-specific functions
283 struct rds_statistics
284 non-transport-specific statistics
285 struct rds_cong_map
286 wraps the raw congestion bitmap, contains rbnode, waitq, etc.
287
288Connection management
289=====================
290
291 Connections may be in UP, DOWN, CONNECTING, DISCONNECTING, and
292 ERROR states.
293
294 The first time an attempt is made by an RDS socket to send data to
295 a node, a connection is allocated and connected. That connection is
296 then maintained forever -- if there are transport errors, the
297 connection will be dropped and re-established.
298
299 Dropping a connection while packets are queued will cause queued or
300 partially-sent datagrams to be retransmitted when the connection is
301 re-established.
302
303
304The send path
305=============
306
307 rds_sendmsg()
308 struct rds_message built from incoming data
309 CMSGs parsed (e.g. RDMA ops)
310 transport connection alloced and connected if not already
311 rds_message placed on send queue
312 send worker awoken
313 rds_send_worker()
314 calls rds_send_xmit() until queue is empty
315 rds_send_xmit()
316 transmits congestion map if one is pending
317 may set ACK_REQUIRED
318 calls transport to send either non-RDMA or RDMA message
319 (RDMA ops never retransmitted)
320 rds_ib_xmit()
321 allocs work requests from send ring
322 adds any new send credits available to peer (h_credits)
323 maps the rds_message's sg list
324 piggybacks ack
325 populates work requests
326 post send to connection's queue pair
327
328The recv path
329=============
330
331 rds_ib_recv_cq_comp_handler()
332 looks at write completions
333 unmaps recv buffer from device
334 no errors, call rds_ib_process_recv()
335 refill recv ring
336 rds_ib_process_recv()
337 validate header checksum
338 copy header to rds_ib_incoming struct if start of a new datagram
339 add to ibinc's fraglist
340 if competed datagram:
341 update cong map if datagram was cong update
342 call rds_recv_incoming() otherwise
343 note if ack is required
344 rds_recv_incoming()
345 drop duplicate packets
346 respond to pings
347 find the sock associated with this datagram
348 add to sock queue
349 wake up sock
350 do some congestion calculations
351 rds_recvmsg
352 copy data into user iovec
353 handle CMSGs
354 return to application
355
356
diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt
new file mode 100644
index 000000000000..0e58b4539176
--- /dev/null
+++ b/Documentation/networking/timestamping.txt
@@ -0,0 +1,180 @@
1The existing interfaces for getting network packages time stamped are:
2
3* SO_TIMESTAMP
4 Generate time stamp for each incoming packet using the (not necessarily
5 monotonous!) system time. Result is returned via recv_msg() in a
6 control message as timeval (usec resolution).
7
8* SO_TIMESTAMPNS
9 Same time stamping mechanism as SO_TIMESTAMP, but returns result as
10 timespec (nsec resolution).
11
12* IP_MULTICAST_LOOP + SO_TIMESTAMP[NS]
13 Only for multicasts: approximate send time stamp by receiving the looped
14 packet and using its receive time stamp.
15
16The following interface complements the existing ones: receive time
17stamps can be generated and returned for arbitrary packets and much
18closer to the point where the packet is really sent. Time stamps can
19be generated in software (as before) or in hardware (if the hardware
20has such a feature).
21
22SO_TIMESTAMPING:
23
24Instructs the socket layer which kind of information is wanted. The
25parameter is an integer with some of the following bits set. Setting
26other bits is an error and doesn't change the current state.
27
28SOF_TIMESTAMPING_TX_HARDWARE: try to obtain send time stamp in hardware
29SOF_TIMESTAMPING_TX_SOFTWARE: if SOF_TIMESTAMPING_TX_HARDWARE is off or
30 fails, then do it in software
31SOF_TIMESTAMPING_RX_HARDWARE: return the original, unmodified time stamp
32 as generated by the hardware
33SOF_TIMESTAMPING_RX_SOFTWARE: if SOF_TIMESTAMPING_RX_HARDWARE is off or
34 fails, then do it in software
35SOF_TIMESTAMPING_RAW_HARDWARE: return original raw hardware time stamp
36SOF_TIMESTAMPING_SYS_HARDWARE: return hardware time stamp transformed to
37 the system time base
38SOF_TIMESTAMPING_SOFTWARE: return system time stamp generated in
39 software
40
41SOF_TIMESTAMPING_TX/RX determine how time stamps are generated.
42SOF_TIMESTAMPING_RAW/SYS determine how they are reported in the
43following control message:
44 struct scm_timestamping {
45 struct timespec systime;
46 struct timespec hwtimetrans;
47 struct timespec hwtimeraw;
48 };
49
50recvmsg() can be used to get this control message for regular incoming
51packets. For send time stamps the outgoing packet is looped back to
52the socket's error queue with the send time stamp(s) attached. It can
53be received with recvmsg(flags=MSG_ERRQUEUE). The call returns the
54original outgoing packet data including all headers preprended down to
55and including the link layer, the scm_timestamping control message and
56a sock_extended_err control message with ee_errno==ENOMSG and
57ee_origin==SO_EE_ORIGIN_TIMESTAMPING. A socket with such a pending
58bounced packet is ready for reading as far as select() is concerned.
59If the outgoing packet has to be fragmented, then only the first
60fragment is time stamped and returned to the sending socket.
61
62All three values correspond to the same event in time, but were
63generated in different ways. Each of these values may be empty (= all
64zero), in which case no such value was available. If the application
65is not interested in some of these values, they can be left blank to
66avoid the potential overhead of calculating them.
67
68systime is the value of the system time at that moment. This
69corresponds to the value also returned via SO_TIMESTAMP[NS]. If the
70time stamp was generated by hardware, then this field is
71empty. Otherwise it is filled in if SOF_TIMESTAMPING_SOFTWARE is
72set.
73
74hwtimeraw is the original hardware time stamp. Filled in if
75SOF_TIMESTAMPING_RAW_HARDWARE is set. No assumptions about its
76relation to system time should be made.
77
78hwtimetrans is the hardware time stamp transformed so that it
79corresponds as good as possible to system time. This correlation is
80not perfect; as a consequence, sorting packets received via different
81NICs by their hwtimetrans may differ from the order in which they were
82received. hwtimetrans may be non-monotonic even for the same NIC.
83Filled in if SOF_TIMESTAMPING_SYS_HARDWARE is set. Requires support
84by the network device and will be empty without that support.
85
86
87SIOCSHWTSTAMP:
88
89Hardware time stamping must also be initialized for each device driver
90that is expected to do hardware time stamping. The parameter is:
91
92struct hwtstamp_config {
93 int flags; /* no flags defined right now, must be zero */
94 int tx_type; /* HWTSTAMP_TX_* */
95 int rx_filter; /* HWTSTAMP_FILTER_* */
96};
97
98Desired behavior is passed into the kernel and to a specific device by
99calling ioctl(SIOCSHWTSTAMP) with a pointer to a struct ifreq whose
100ifr_data points to a struct hwtstamp_config. The tx_type and
101rx_filter are hints to the driver what it is expected to do. If
102the requested fine-grained filtering for incoming packets is not
103supported, the driver may time stamp more than just the requested types
104of packets.
105
106A driver which supports hardware time stamping shall update the struct
107with the actual, possibly more permissive configuration. If the
108requested packets cannot be time stamped, then nothing should be
109changed and ERANGE shall be returned (in contrast to EINVAL, which
110indicates that SIOCSHWTSTAMP is not supported at all).
111
112Only a processes with admin rights may change the configuration. User
113space is responsible to ensure that multiple processes don't interfere
114with each other and that the settings are reset.
115
116/* possible values for hwtstamp_config->tx_type */
117enum {
118 /*
119 * no outgoing packet will need hardware time stamping;
120 * should a packet arrive which asks for it, no hardware
121 * time stamping will be done
122 */
123 HWTSTAMP_TX_OFF,
124
125 /*
126 * enables hardware time stamping for outgoing packets;
127 * the sender of the packet decides which are to be
128 * time stamped by setting SOF_TIMESTAMPING_TX_SOFTWARE
129 * before sending the packet
130 */
131 HWTSTAMP_TX_ON,
132};
133
134/* possible values for hwtstamp_config->rx_filter */
135enum {
136 /* time stamp no incoming packet at all */
137 HWTSTAMP_FILTER_NONE,
138
139 /* time stamp any incoming packet */
140 HWTSTAMP_FILTER_ALL,
141
142 /* return value: time stamp all packets requested plus some others */
143 HWTSTAMP_FILTER_SOME,
144
145 /* PTP v1, UDP, any kind of event packet */
146 HWTSTAMP_FILTER_PTP_V1_L4_EVENT,
147
148 ...
149};
150
151
152DEVICE IMPLEMENTATION
153
154A driver which supports hardware time stamping must support the
155SIOCSHWTSTAMP ioctl. Time stamps for received packets must be stored
156in the skb with skb_hwtstamp_set().
157
158Time stamps for outgoing packets are to be generated as follows:
159- In hard_start_xmit(), check if skb_hwtstamp_check_tx_hardware()
160 returns non-zero. If yes, then the driver is expected
161 to do hardware time stamping.
162- If this is possible for the skb and requested, then declare
163 that the driver is doing the time stamping by calling
164 skb_hwtstamp_tx_in_progress(). A driver not supporting
165 hardware time stamping doesn't do that. A driver must never
166 touch sk_buff::tstamp! It is used to store how time stamping
167 for an outgoing packets is to be done.
168- As soon as the driver has sent the packet and/or obtained a
169 hardware time stamp for it, it passes the time stamp back by
170 calling skb_hwtstamp_tx() with the original skb, the raw
171 hardware time stamp and a handle to the device (necessary
172 to convert the hardware time stamp to system time). If obtaining
173 the hardware time stamp somehow fails, then the driver should
174 not fall back to software time stamping. The rationale is that
175 this would occur at a later time in the processing pipeline
176 than other software time stamping and therefore could lead
177 to unexpected deltas between time stamps.
178- If the driver did not call skb_hwtstamp_tx_in_progress(), then
179 dev_hard_start_xmit() checks whether software time stamping
180 is wanted as fallback and potentially generates the time stamp.
diff --git a/Documentation/networking/timestamping/.gitignore b/Documentation/networking/timestamping/.gitignore
new file mode 100644
index 000000000000..71e81eb2e22f
--- /dev/null
+++ b/Documentation/networking/timestamping/.gitignore
@@ -0,0 +1 @@
timestamping
diff --git a/Documentation/networking/timestamping/Makefile b/Documentation/networking/timestamping/Makefile
new file mode 100644
index 000000000000..2a1489fdc036
--- /dev/null
+++ b/Documentation/networking/timestamping/Makefile
@@ -0,0 +1,6 @@
1CPPFLAGS = -I../../../include
2
3timestamping: timestamping.c
4
5clean:
6 rm -f timestamping
diff --git a/Documentation/networking/timestamping/timestamping.c b/Documentation/networking/timestamping/timestamping.c
new file mode 100644
index 000000000000..43d143104210
--- /dev/null
+++ b/Documentation/networking/timestamping/timestamping.c
@@ -0,0 +1,533 @@
1/*
2 * This program demonstrates how the various time stamping features in
3 * the Linux kernel work. It emulates the behavior of a PTP
4 * implementation in stand-alone master mode by sending PTPv1 Sync
5 * multicasts once every second. It looks for similar packets, but
6 * beyond that doesn't actually implement PTP.
7 *
8 * Outgoing packets are time stamped with SO_TIMESTAMPING with or
9 * without hardware support.
10 *
11 * Incoming packets are time stamped with SO_TIMESTAMPING with or
12 * without hardware support, SIOCGSTAMP[NS] (per-socket time stamp) and
13 * SO_TIMESTAMP[NS].
14 *
15 * Copyright (C) 2009 Intel Corporation.
16 * Author: Patrick Ohly <patrick.ohly@intel.com>
17 *
18 * This program is free software; you can redistribute it and/or modify it
19 * under the terms and conditions of the GNU General Public License,
20 * version 2, as published by the Free Software Foundation.
21 *
22 * This program is distributed in the hope it will be useful, but WITHOUT
23 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
24 * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for
25 * more details.
26 *
27 * You should have received a copy of the GNU General Public License along with
28 * this program; if not, write to the Free Software Foundation, Inc.,
29 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
30 */
31
32#include <stdio.h>
33#include <stdlib.h>
34#include <errno.h>
35#include <string.h>
36
37#include <sys/time.h>
38#include <sys/socket.h>
39#include <sys/select.h>
40#include <sys/ioctl.h>
41#include <arpa/inet.h>
42#include <net/if.h>
43
44#include "asm/types.h"
45#include "linux/net_tstamp.h"
46#include "linux/errqueue.h"
47
48#ifndef SO_TIMESTAMPING
49# define SO_TIMESTAMPING 37
50# define SCM_TIMESTAMPING SO_TIMESTAMPING
51#endif
52
53#ifndef SO_TIMESTAMPNS
54# define SO_TIMESTAMPNS 35
55#endif
56
57#ifndef SIOCGSTAMPNS
58# define SIOCGSTAMPNS 0x8907
59#endif
60
61#ifndef SIOCSHWTSTAMP
62# define SIOCSHWTSTAMP 0x89b0
63#endif
64
65static void usage(const char *error)
66{
67 if (error)
68 printf("invalid option: %s\n", error);
69 printf("timestamping interface option*\n\n"
70 "Options:\n"
71 " IP_MULTICAST_LOOP - looping outgoing multicasts\n"
72 " SO_TIMESTAMP - normal software time stamping, ms resolution\n"
73 " SO_TIMESTAMPNS - more accurate software time stamping\n"
74 " SOF_TIMESTAMPING_TX_HARDWARE - hardware time stamping of outgoing packets\n"
75 " SOF_TIMESTAMPING_TX_SOFTWARE - software fallback for outgoing packets\n"
76 " SOF_TIMESTAMPING_RX_HARDWARE - hardware time stamping of incoming packets\n"
77 " SOF_TIMESTAMPING_RX_SOFTWARE - software fallback for incoming packets\n"
78 " SOF_TIMESTAMPING_SOFTWARE - request reporting of software time stamps\n"
79 " SOF_TIMESTAMPING_SYS_HARDWARE - request reporting of transformed HW time stamps\n"
80 " SOF_TIMESTAMPING_RAW_HARDWARE - request reporting of raw HW time stamps\n"
81 " SIOCGSTAMP - check last socket time stamp\n"
82 " SIOCGSTAMPNS - more accurate socket time stamp\n");
83 exit(1);
84}
85
86static void bail(const char *error)
87{
88 printf("%s: %s\n", error, strerror(errno));
89 exit(1);
90}
91
92static const unsigned char sync[] = {
93 0x00, 0x01, 0x00, 0x01,
94 0x5f, 0x44, 0x46, 0x4c,
95 0x54, 0x00, 0x00, 0x00,
96 0x00, 0x00, 0x00, 0x00,
97 0x00, 0x00, 0x00, 0x00,
98 0x01, 0x01,
99
100 /* fake uuid */
101 0x00, 0x01,
102 0x02, 0x03, 0x04, 0x05,
103
104 0x00, 0x01, 0x00, 0x37,
105 0x00, 0x00, 0x00, 0x08,
106 0x00, 0x00, 0x00, 0x00,
107 0x49, 0x05, 0xcd, 0x01,
108 0x29, 0xb1, 0x8d, 0xb0,
109 0x00, 0x00, 0x00, 0x00,
110 0x00, 0x01,
111
112 /* fake uuid */
113 0x00, 0x01,
114 0x02, 0x03, 0x04, 0x05,
115
116 0x00, 0x00, 0x00, 0x37,
117 0x00, 0x00, 0x00, 0x04,
118 0x44, 0x46, 0x4c, 0x54,
119 0x00, 0x00, 0xf0, 0x60,
120 0x00, 0x01, 0x00, 0x00,
121 0x00, 0x00, 0x00, 0x01,
122 0x00, 0x00, 0xf0, 0x60,
123 0x00, 0x00, 0x00, 0x00,
124 0x00, 0x00, 0x00, 0x04,
125 0x44, 0x46, 0x4c, 0x54,
126 0x00, 0x01,
127
128 /* fake uuid */
129 0x00, 0x01,
130 0x02, 0x03, 0x04, 0x05,
131
132 0x00, 0x00, 0x00, 0x00,
133 0x00, 0x00, 0x00, 0x00,
134 0x00, 0x00, 0x00, 0x00,
135 0x00, 0x00, 0x00, 0x00
136};
137
138static void sendpacket(int sock, struct sockaddr *addr, socklen_t addr_len)
139{
140 struct timeval now;
141 int res;
142
143 res = sendto(sock, sync, sizeof(sync), 0,
144 addr, addr_len);
145 gettimeofday(&now, 0);
146 if (res < 0)
147 printf("%s: %s\n", "send", strerror(errno));
148 else
149 printf("%ld.%06ld: sent %d bytes\n",
150 (long)now.tv_sec, (long)now.tv_usec,
151 res);
152}
153
154static void printpacket(struct msghdr *msg, int res,
155 char *data,
156 int sock, int recvmsg_flags,
157 int siocgstamp, int siocgstampns)
158{
159 struct sockaddr_in *from_addr = (struct sockaddr_in *)msg->msg_name;
160 struct cmsghdr *cmsg;
161 struct timeval tv;
162 struct timespec ts;
163 struct timeval now;
164
165 gettimeofday(&now, 0);
166
167 printf("%ld.%06ld: received %s data, %d bytes from %s, %d bytes control messages\n",
168 (long)now.tv_sec, (long)now.tv_usec,
169 (recvmsg_flags & MSG_ERRQUEUE) ? "error" : "regular",
170 res,
171 inet_ntoa(from_addr->sin_addr),
172 msg->msg_controllen);
173 for (cmsg = CMSG_FIRSTHDR(msg);
174 cmsg;
175 cmsg = CMSG_NXTHDR(msg, cmsg)) {
176 printf(" cmsg len %d: ", cmsg->cmsg_len);
177 switch (cmsg->cmsg_level) {
178 case SOL_SOCKET:
179 printf("SOL_SOCKET ");
180 switch (cmsg->cmsg_type) {
181 case SO_TIMESTAMP: {
182 struct timeval *stamp =
183 (struct timeval *)CMSG_DATA(cmsg);
184 printf("SO_TIMESTAMP %ld.%06ld",
185 (long)stamp->tv_sec,
186 (long)stamp->tv_usec);
187 break;
188 }
189 case SO_TIMESTAMPNS: {
190 struct timespec *stamp =
191 (struct timespec *)CMSG_DATA(cmsg);
192 printf("SO_TIMESTAMPNS %ld.%09ld",
193 (long)stamp->tv_sec,
194 (long)stamp->tv_nsec);
195 break;
196 }
197 case SO_TIMESTAMPING: {
198 struct timespec *stamp =
199 (struct timespec *)CMSG_DATA(cmsg);
200 printf("SO_TIMESTAMPING ");
201 printf("SW %ld.%09ld ",
202 (long)stamp->tv_sec,
203 (long)stamp->tv_nsec);
204 stamp++;
205 printf("HW transformed %ld.%09ld ",
206 (long)stamp->tv_sec,
207 (long)stamp->tv_nsec);
208 stamp++;
209 printf("HW raw %ld.%09ld",
210 (long)stamp->tv_sec,
211 (long)stamp->tv_nsec);
212 break;
213 }
214 default:
215 printf("type %d", cmsg->cmsg_type);
216 break;
217 }
218 break;
219 case IPPROTO_IP:
220 printf("IPPROTO_IP ");
221 switch (cmsg->cmsg_type) {
222 case IP_RECVERR: {
223 struct sock_extended_err *err =
224 (struct sock_extended_err *)CMSG_DATA(cmsg);
225 printf("IP_RECVERR ee_errno '%s' ee_origin %d => %s",
226 strerror(err->ee_errno),
227 err->ee_origin,
228#ifdef SO_EE_ORIGIN_TIMESTAMPING
229 err->ee_origin == SO_EE_ORIGIN_TIMESTAMPING ?
230 "bounced packet" : "unexpected origin"
231#else
232 "probably SO_EE_ORIGIN_TIMESTAMPING"
233#endif
234 );
235 if (res < sizeof(sync))
236 printf(" => truncated data?!");
237 else if (!memcmp(sync, data + res - sizeof(sync),
238 sizeof(sync)))
239 printf(" => GOT OUR DATA BACK (HURRAY!)");
240 break;
241 }
242 case IP_PKTINFO: {
243 struct in_pktinfo *pktinfo =
244 (struct in_pktinfo *)CMSG_DATA(cmsg);
245 printf("IP_PKTINFO interface index %u",
246 pktinfo->ipi_ifindex);
247 break;
248 }
249 default:
250 printf("type %d", cmsg->cmsg_type);
251 break;
252 }
253 break;
254 default:
255 printf("level %d type %d",
256 cmsg->cmsg_level,
257 cmsg->cmsg_type);
258 break;
259 }
260 printf("\n");
261 }
262
263 if (siocgstamp) {
264 if (ioctl(sock, SIOCGSTAMP, &tv))
265 printf(" %s: %s\n", "SIOCGSTAMP", strerror(errno));
266 else
267 printf("SIOCGSTAMP %ld.%06ld\n",
268 (long)tv.tv_sec,
269 (long)tv.tv_usec);
270 }
271 if (siocgstampns) {
272 if (ioctl(sock, SIOCGSTAMPNS, &ts))
273 printf(" %s: %s\n", "SIOCGSTAMPNS", strerror(errno));
274 else
275 printf("SIOCGSTAMPNS %ld.%09ld\n",
276 (long)ts.tv_sec,
277 (long)ts.tv_nsec);
278 }
279}
280
281static void recvpacket(int sock, int recvmsg_flags,
282 int siocgstamp, int siocgstampns)
283{
284 char data[256];
285 struct msghdr msg;
286 struct iovec entry;
287 struct sockaddr_in from_addr;
288 struct {
289 struct cmsghdr cm;
290 char control[512];
291 } control;
292 int res;
293
294 memset(&msg, 0, sizeof(msg));
295 msg.msg_iov = &entry;
296 msg.msg_iovlen = 1;
297 entry.iov_base = data;
298 entry.iov_len = sizeof(data);
299 msg.msg_name = (caddr_t)&from_addr;
300 msg.msg_namelen = sizeof(from_addr);
301 msg.msg_control = &control;
302 msg.msg_controllen = sizeof(control);
303
304 res = recvmsg(sock, &msg, recvmsg_flags|MSG_DONTWAIT);
305 if (res < 0) {
306 printf("%s %s: %s\n",
307 "recvmsg",
308 (recvmsg_flags & MSG_ERRQUEUE) ? "error" : "regular",
309 strerror(errno));
310 } else {
311 printpacket(&msg, res, data,
312 sock, recvmsg_flags,
313 siocgstamp, siocgstampns);
314 }
315}
316
317int main(int argc, char **argv)
318{
319 int so_timestamping_flags = 0;
320 int so_timestamp = 0;
321 int so_timestampns = 0;
322 int siocgstamp = 0;
323 int siocgstampns = 0;
324 int ip_multicast_loop = 0;
325 char *interface;
326 int i;
327 int enabled = 1;
328 int sock;
329 struct ifreq device;
330 struct ifreq hwtstamp;
331 struct hwtstamp_config hwconfig, hwconfig_requested;
332 struct sockaddr_in addr;
333 struct ip_mreq imr;
334 struct in_addr iaddr;
335 int val;
336 socklen_t len;
337 struct timeval next;
338
339 if (argc < 2)
340 usage(0);
341 interface = argv[1];
342
343 for (i = 2; i < argc; i++) {
344 if (!strcasecmp(argv[i], "SO_TIMESTAMP"))
345 so_timestamp = 1;
346 else if (!strcasecmp(argv[i], "SO_TIMESTAMPNS"))
347 so_timestampns = 1;
348 else if (!strcasecmp(argv[i], "SIOCGSTAMP"))
349 siocgstamp = 1;
350 else if (!strcasecmp(argv[i], "SIOCGSTAMPNS"))
351 siocgstampns = 1;
352 else if (!strcasecmp(argv[i], "IP_MULTICAST_LOOP"))
353 ip_multicast_loop = 1;
354 else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_HARDWARE"))
355 so_timestamping_flags |= SOF_TIMESTAMPING_TX_HARDWARE;
356 else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_SOFTWARE"))
357 so_timestamping_flags |= SOF_TIMESTAMPING_TX_SOFTWARE;
358 else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_HARDWARE"))
359 so_timestamping_flags |= SOF_TIMESTAMPING_RX_HARDWARE;
360 else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_SOFTWARE"))
361 so_timestamping_flags |= SOF_TIMESTAMPING_RX_SOFTWARE;
362 else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_SOFTWARE"))
363 so_timestamping_flags |= SOF_TIMESTAMPING_SOFTWARE;
364 else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_SYS_HARDWARE"))
365 so_timestamping_flags |= SOF_TIMESTAMPING_SYS_HARDWARE;
366 else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RAW_HARDWARE"))
367 so_timestamping_flags |= SOF_TIMESTAMPING_RAW_HARDWARE;
368 else
369 usage(argv[i]);
370 }
371
372 sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
373 if (socket < 0)
374 bail("socket");
375
376 memset(&device, 0, sizeof(device));
377 strncpy(device.ifr_name, interface, sizeof(device.ifr_name));
378 if (ioctl(sock, SIOCGIFADDR, &device) < 0)
379 bail("getting interface IP address");
380
381 memset(&hwtstamp, 0, sizeof(hwtstamp));
382 strncpy(hwtstamp.ifr_name, interface, sizeof(hwtstamp.ifr_name));
383 hwtstamp.ifr_data = (void *)&hwconfig;
384 memset(&hwconfig, 0, sizeof(&hwconfig));
385 hwconfig.tx_type =
386 (so_timestamping_flags & SOF_TIMESTAMPING_TX_HARDWARE) ?
387 HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF;
388 hwconfig.rx_filter =
389 (so_timestamping_flags & SOF_TIMESTAMPING_RX_HARDWARE) ?
390 HWTSTAMP_FILTER_PTP_V1_L4_SYNC : HWTSTAMP_FILTER_NONE;
391 hwconfig_requested = hwconfig;
392 if (ioctl(sock, SIOCSHWTSTAMP, &hwtstamp) < 0) {
393 if ((errno == EINVAL || errno == ENOTSUP) &&
394 hwconfig_requested.tx_type == HWTSTAMP_TX_OFF &&
395 hwconfig_requested.rx_filter == HWTSTAMP_FILTER_NONE)
396 printf("SIOCSHWTSTAMP: disabling hardware time stamping not possible\n");
397 else
398 bail("SIOCSHWTSTAMP");
399 }
400 printf("SIOCSHWTSTAMP: tx_type %d requested, got %d; rx_filter %d requested, got %d\n",
401 hwconfig_requested.tx_type, hwconfig.tx_type,
402 hwconfig_requested.rx_filter, hwconfig.rx_filter);
403
404 /* bind to PTP port */
405 addr.sin_family = AF_INET;
406 addr.sin_addr.s_addr = htonl(INADDR_ANY);
407 addr.sin_port = htons(319 /* PTP event port */);
408 if (bind(sock,
409 (struct sockaddr *)&addr,
410 sizeof(struct sockaddr_in)) < 0)
411 bail("bind");
412
413 /* set multicast group for outgoing packets */
414 inet_aton("224.0.1.130", &iaddr); /* alternate PTP domain 1 */
415 addr.sin_addr = iaddr;
416 imr.imr_multiaddr.s_addr = iaddr.s_addr;
417 imr.imr_interface.s_addr =
418 ((struct sockaddr_in *)&device.ifr_addr)->sin_addr.s_addr;
419 if (setsockopt(sock, IPPROTO_IP, IP_MULTICAST_IF,
420 &imr.imr_interface.s_addr, sizeof(struct in_addr)) < 0)
421 bail("set multicast");
422
423 /* join multicast group, loop our own packet */
424 if (setsockopt(sock, IPPROTO_IP, IP_ADD_MEMBERSHIP,
425 &imr, sizeof(struct ip_mreq)) < 0)
426 bail("join multicast group");
427
428 if (setsockopt(sock, IPPROTO_IP, IP_MULTICAST_LOOP,
429 &ip_multicast_loop, sizeof(enabled)) < 0) {
430 bail("loop multicast");
431 }
432
433 /* set socket options for time stamping */
434 if (so_timestamp &&
435 setsockopt(sock, SOL_SOCKET, SO_TIMESTAMP,
436 &enabled, sizeof(enabled)) < 0)
437 bail("setsockopt SO_TIMESTAMP");
438
439 if (so_timestampns &&
440 setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPNS,
441 &enabled, sizeof(enabled)) < 0)
442 bail("setsockopt SO_TIMESTAMPNS");
443
444 if (so_timestamping_flags &&
445 setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING,
446 &so_timestamping_flags,
447 sizeof(so_timestamping_flags)) < 0)
448 bail("setsockopt SO_TIMESTAMPING");
449
450 /* request IP_PKTINFO for debugging purposes */
451 if (setsockopt(sock, SOL_IP, IP_PKTINFO,
452 &enabled, sizeof(enabled)) < 0)
453 printf("%s: %s\n", "setsockopt IP_PKTINFO", strerror(errno));
454
455 /* verify socket options */
456 len = sizeof(val);
457 if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMP, &val, &len) < 0)
458 printf("%s: %s\n", "getsockopt SO_TIMESTAMP", strerror(errno));
459 else
460 printf("SO_TIMESTAMP %d\n", val);
461
462 if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPNS, &val, &len) < 0)
463 printf("%s: %s\n", "getsockopt SO_TIMESTAMPNS",
464 strerror(errno));
465 else
466 printf("SO_TIMESTAMPNS %d\n", val);
467
468 if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, &val, &len) < 0) {
469 printf("%s: %s\n", "getsockopt SO_TIMESTAMPING",
470 strerror(errno));
471 } else {
472 printf("SO_TIMESTAMPING %d\n", val);
473 if (val != so_timestamping_flags)
474 printf(" not the expected value %d\n",
475 so_timestamping_flags);
476 }
477
478 /* send packets forever every five seconds */
479 gettimeofday(&next, 0);
480 next.tv_sec = (next.tv_sec + 1) / 5 * 5;
481 next.tv_usec = 0;
482 while (1) {
483 struct timeval now;
484 struct timeval delta;
485 long delta_us;
486 int res;
487 fd_set readfs, errorfs;
488
489 gettimeofday(&now, 0);
490 delta_us = (long)(next.tv_sec - now.tv_sec) * 1000000 +
491 (long)(next.tv_usec - now.tv_usec);
492 if (delta_us > 0) {
493 /* continue waiting for timeout or data */
494 delta.tv_sec = delta_us / 1000000;
495 delta.tv_usec = delta_us % 1000000;
496
497 FD_ZERO(&readfs);
498 FD_ZERO(&errorfs);
499 FD_SET(sock, &readfs);
500 FD_SET(sock, &errorfs);
501 printf("%ld.%06ld: select %ldus\n",
502 (long)now.tv_sec, (long)now.tv_usec,
503 delta_us);
504 res = select(sock + 1, &readfs, 0, &errorfs, &delta);
505 gettimeofday(&now, 0);
506 printf("%ld.%06ld: select returned: %d, %s\n",
507 (long)now.tv_sec, (long)now.tv_usec,
508 res,
509 res < 0 ? strerror(errno) : "success");
510 if (res > 0) {
511 if (FD_ISSET(sock, &readfs))
512 printf("ready for reading\n");
513 if (FD_ISSET(sock, &errorfs))
514 printf("has error\n");
515 recvpacket(sock, 0,
516 siocgstamp,
517 siocgstampns);
518 recvpacket(sock, MSG_ERRQUEUE,
519 siocgstamp,
520 siocgstampns);
521 }
522 } else {
523 /* write one packet */
524 sendpacket(sock,
525 (struct sockaddr *)&addr,
526 sizeof(addr));
527 next.tv_sec += 5;
528 continue;
529 }
530 }
531
532 return 0;
533}
diff --git a/Documentation/networking/vxge.txt b/Documentation/networking/vxge.txt
new file mode 100644
index 000000000000..d2e2997e6fa0
--- /dev/null
+++ b/Documentation/networking/vxge.txt
@@ -0,0 +1,100 @@
1Neterion's (Formerly S2io) X3100 Series 10GbE PCIe Server Adapter Linux driver
2==============================================================================
3
4Contents
5--------
6
71) Introduction
82) Features supported
93) Configurable driver parameters
104) Troubleshooting
11
121) Introduction:
13----------------
14This Linux driver supports all Neterion's X3100 series 10 GbE PCIe I/O
15Virtualized Server adapters.
16The X3100 series supports four modes of operation, configurable via
17firmware -
18 Single function mode
19 Multi function mode
20 SRIOV mode
21 MRIOV mode
22The functions share a 10GbE link and the pci-e bus, but hardly anything else
23inside the ASIC. Features like independent hw reset, statistics, bandwidth/
24priority allocation and guarantees, GRO, TSO, interrupt moderation etc are
25supported independently on each function.
26
27(See below for a complete list of features supported for both IPv4 and IPv6)
28
292) Features supported:
30----------------------
31
32i) Single function mode (up to 17 queues)
33
34ii) Multi function mode (up to 17 functions)
35
36iii) PCI-SIG's I/O Virtualization
37 - Single Root mode: v1.0 (up to 17 functions)
38 - Multi-Root mode: v1.0 (up to 17 functions)
39
40iv) Jumbo frames
41 X3100 Series supports MTU up to 9600 bytes, modifiable using
42 ifconfig command.
43
44v) Offloads supported: (Enabled by default)
45 Checksum offload (TCP/UDP/IP) on transmit and receive paths
46 TCP Segmentation Offload (TSO) on transmit path
47 Generic Receive Offload (GRO) on receive path
48
49vi) MSI-X: (Enabled by default)
50 Resulting in noticeable performance improvement (up to 7% on certain
51 platforms).
52
53vii) NAPI: (Enabled by default)
54 For better Rx interrupt moderation.
55
56viii)RTH (Receive Traffic Hash): (Enabled by default)
57 Receive side steering for better scaling.
58
59ix) Statistics
60 Comprehensive MAC-level and software statistics displayed using
61 "ethtool -S" option.
62
63x) Multiple hardware queues: (Enabled by default)
64 Up to 17 hardware based transmit and receive data channels, with
65 multiple steering options (transmit multiqueue enabled by default).
66
673) Configurable driver parameters:
68----------------------------------
69
70i) max_config_dev
71 Specifies maximum device functions to be enabled.
72 Valid range: 1-8
73
74ii) max_config_port
75 Specifies number of ports to be enabled.
76 Valid range: 1,2
77 Default: 1
78
79iii)max_config_vpath
80 Specifies maximum VPATH(s) configured for each device function.
81 Valid range: 1-17
82
83iv) vlan_tag_strip
84 Enables/disables vlan tag stripping from all received tagged frames that
85 are not replicated at the internal L2 switch.
86 Valid range: 0,1 (disabled, enabled respectively)
87 Default: 1
88
89v) addr_learn_en
90 Enable learning the mac address of the guest OS interface in
91 virtualization environment.
92 Valid range: 0,1 (disabled, enabled respectively)
93 Default: 0
94
954) Troubleshooting:
96-------------------
97
98To resolve an issue with the source code or X3100 series adapter, please collect
99the statistics, register dumps using ethool, relevant logs and email them to
100support@neterion.com.
diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt
index 0ab0230cbcb0..d16b7a1c3793 100644
--- a/Documentation/powerpc/booting-without-of.txt
+++ b/Documentation/powerpc/booting-without-of.txt
@@ -43,12 +43,11 @@ Table of Contents
43 2) Representing devices without a current OF specification 43 2) Representing devices without a current OF specification
44 a) PHY nodes 44 a) PHY nodes
45 b) Interrupt controllers 45 b) Interrupt controllers
46 c) CFI or JEDEC memory-mapped NOR flash 46 c) 4xx/Axon EMAC ethernet nodes
47 d) 4xx/Axon EMAC ethernet nodes 47 d) Xilinx IP cores
48 e) Xilinx IP cores 48 e) USB EHCI controllers
49 f) USB EHCI controllers 49 f) MDIO on GPIOs
50 g) MDIO on GPIOs 50 g) SPI busses
51 h) SPI busses
52 51
53 VII - Marvell Discovery mv64[345]6x System Controller chips 52 VII - Marvell Discovery mv64[345]6x System Controller chips
54 1) The /system-controller node 53 1) The /system-controller node
@@ -999,7 +998,7 @@ compatibility.
999 translation of SOC addresses for memory mapped SOC registers. 998 translation of SOC addresses for memory mapped SOC registers.
1000 - bus-frequency: Contains the bus frequency for the SOC node. 999 - bus-frequency: Contains the bus frequency for the SOC node.
1001 Typically, the value of this field is filled in by the boot 1000 Typically, the value of this field is filled in by the boot
1002 loader. 1001 loader.
1003 1002
1004 1003
1005 Recommended properties: 1004 Recommended properties:
@@ -1287,71 +1286,7 @@ platforms are moved over to use the flattened-device-tree model.
1287 device_type = "open-pic"; 1286 device_type = "open-pic";
1288 }; 1287 };
1289 1288
1290 c) CFI or JEDEC memory-mapped NOR flash 1289 c) 4xx/Axon EMAC ethernet nodes
1291
1292 Flash chips (Memory Technology Devices) are often used for solid state
1293 file systems on embedded devices.
1294
1295 - compatible : should contain the specific model of flash chip(s)
1296 used, if known, followed by either "cfi-flash" or "jedec-flash"
1297 - reg : Address range of the flash chip
1298 - bank-width : Width (in bytes) of the flash bank. Equal to the
1299 device width times the number of interleaved chips.
1300 - device-width : (optional) Width of a single flash chip. If
1301 omitted, assumed to be equal to 'bank-width'.
1302 - #address-cells, #size-cells : Must be present if the flash has
1303 sub-nodes representing partitions (see below). In this case
1304 both #address-cells and #size-cells must be equal to 1.
1305
1306 For JEDEC compatible devices, the following additional properties
1307 are defined:
1308
1309 - vendor-id : Contains the flash chip's vendor id (1 byte).
1310 - device-id : Contains the flash chip's device id (1 byte).
1311
1312 In addition to the information on the flash bank itself, the
1313 device tree may optionally contain additional information
1314 describing partitions of the flash address space. This can be
1315 used on platforms which have strong conventions about which
1316 portions of the flash are used for what purposes, but which don't
1317 use an on-flash partition table such as RedBoot.
1318
1319 Each partition is represented as a sub-node of the flash device.
1320 Each node's name represents the name of the corresponding
1321 partition of the flash device.
1322
1323 Flash partitions
1324 - reg : The partition's offset and size within the flash bank.
1325 - label : (optional) The label / name for this flash partition.
1326 If omitted, the label is taken from the node name (excluding
1327 the unit address).
1328 - read-only : (optional) This parameter, if present, is a hint to
1329 Linux that this flash partition should only be mounted
1330 read-only. This is usually used for flash partitions
1331 containing early-boot firmware images or data which should not
1332 be clobbered.
1333
1334 Example:
1335
1336 flash@ff000000 {
1337 compatible = "amd,am29lv128ml", "cfi-flash";
1338 reg = <ff000000 01000000>;
1339 bank-width = <4>;
1340 device-width = <1>;
1341 #address-cells = <1>;
1342 #size-cells = <1>;
1343 fs@0 {
1344 label = "fs";
1345 reg = <0 f80000>;
1346 };
1347 firmware@f80000 {
1348 label ="firmware";
1349 reg = <f80000 80000>;
1350 read-only;
1351 };
1352 };
1353
1354 d) 4xx/Axon EMAC ethernet nodes
1355 1290
1356 The EMAC ethernet controller in IBM and AMCC 4xx chips, and also 1291 The EMAC ethernet controller in IBM and AMCC 4xx chips, and also
1357 the Axon bridge. To operate this needs to interact with a ths 1292 the Axon bridge. To operate this needs to interact with a ths
@@ -1499,7 +1434,7 @@ platforms are moved over to use the flattened-device-tree model.
1499 available. 1434 available.
1500 For Axon: 0x0000012a 1435 For Axon: 0x0000012a
1501 1436
1502 e) Xilinx IP cores 1437 d) Xilinx IP cores
1503 1438
1504 The Xilinx EDK toolchain ships with a set of IP cores (devices) for use 1439 The Xilinx EDK toolchain ships with a set of IP cores (devices) for use
1505 in Xilinx Spartan and Virtex FPGAs. The devices cover the whole range 1440 in Xilinx Spartan and Virtex FPGAs. The devices cover the whole range
@@ -1761,7 +1696,7 @@ platforms are moved over to use the flattened-device-tree model.
1761 listed above, nodes for these devices should include a phy-handle 1696 listed above, nodes for these devices should include a phy-handle
1762 property, and may include other common network device properties 1697 property, and may include other common network device properties
1763 like local-mac-address. 1698 like local-mac-address.
1764 1699
1765 iv) Xilinx Uartlite 1700 iv) Xilinx Uartlite
1766 1701
1767 Xilinx uartlite devices are simple fixed speed serial ports. 1702 Xilinx uartlite devices are simple fixed speed serial ports.
@@ -1793,7 +1728,7 @@ platforms are moved over to use the flattened-device-tree model.
1793 - reg-offset : A value of 3 is required 1728 - reg-offset : A value of 3 is required
1794 - reg-shift : A value of 2 is required 1729 - reg-shift : A value of 2 is required
1795 1730
1796 f) USB EHCI controllers 1731 e) USB EHCI controllers
1797 1732
1798 Required properties: 1733 Required properties:
1799 - compatible : should be "usb-ehci". 1734 - compatible : should be "usb-ehci".
@@ -1819,7 +1754,7 @@ platforms are moved over to use the flattened-device-tree model.
1819 big-endian; 1754 big-endian;
1820 }; 1755 };
1821 1756
1822 g) MDIO on GPIOs 1757 f) MDIO on GPIOs
1823 1758
1824 Currently defined compatibles: 1759 Currently defined compatibles:
1825 - virtual,gpio-mdio 1760 - virtual,gpio-mdio
@@ -1839,7 +1774,7 @@ platforms are moved over to use the flattened-device-tree model.
1839 &qe_pio_c 6>; 1774 &qe_pio_c 6>;
1840 }; 1775 };
1841 1776
1842 h) SPI (Serial Peripheral Interface) busses 1777 g) SPI (Serial Peripheral Interface) busses
1843 1778
1844 SPI busses can be described with a node for the SPI master device 1779 SPI busses can be described with a node for the SPI master device
1845 and a set of child nodes for each SPI slave on the bus. For this 1780 and a set of child nodes for each SPI slave on the bus. For this
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt
index 6c238f59b2a9..249db3a15d15 100644
--- a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt
@@ -1,6 +1,6 @@
1* Uploaded QE firmware 1* Uploaded QE firmware
2 2
3 If a new firwmare has been uploaded to the QE (usually by the 3 If a new firmware has been uploaded to the QE (usually by the
4 boot loader), then a 'firmware' child node should be added to the QE 4 boot loader), then a 'firmware' child node should be added to the QE
5 node. This node provides information on the uploaded firmware that 5 node. This node provides information on the uploaded firmware that
6 device drivers may need. 6 device drivers may need.
diff --git a/Documentation/powerpc/dts-bindings/fsl/dma.txt b/Documentation/powerpc/dts-bindings/fsl/dma.txt
index cc453110fc46..0732cdd05ba1 100644
--- a/Documentation/powerpc/dts-bindings/fsl/dma.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/dma.txt
@@ -35,30 +35,30 @@ Example:
35 #address-cells = <1>; 35 #address-cells = <1>;
36 #size-cells = <1>; 36 #size-cells = <1>;
37 compatible = "fsl,mpc8349-dma", "fsl,elo-dma"; 37 compatible = "fsl,mpc8349-dma", "fsl,elo-dma";
38 reg = <82a8 4>; 38 reg = <0x82a8 4>;
39 ranges = <0 8100 1a4>; 39 ranges = <0 0x8100 0x1a4>;
40 interrupt-parent = <&ipic>; 40 interrupt-parent = <&ipic>;
41 interrupts = <47 8>; 41 interrupts = <71 8>;
42 cell-index = <0>; 42 cell-index = <0>;
43 dma-channel@0 { 43 dma-channel@0 {
44 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; 44 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
45 cell-index = <0>; 45 cell-index = <0>;
46 reg = <0 80>; 46 reg = <0 0x80>;
47 }; 47 };
48 dma-channel@80 { 48 dma-channel@80 {
49 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; 49 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
50 cell-index = <1>; 50 cell-index = <1>;
51 reg = <80 80>; 51 reg = <0x80 0x80>;
52 }; 52 };
53 dma-channel@100 { 53 dma-channel@100 {
54 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; 54 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
55 cell-index = <2>; 55 cell-index = <2>;
56 reg = <100 80>; 56 reg = <0x100 0x80>;
57 }; 57 };
58 dma-channel@180 { 58 dma-channel@180 {
59 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; 59 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
60 cell-index = <3>; 60 cell-index = <3>;
61 reg = <180 80>; 61 reg = <0x180 0x80>;
62 }; 62 };
63 }; 63 };
64 64
@@ -93,36 +93,36 @@ Example:
93 #address-cells = <1>; 93 #address-cells = <1>;
94 #size-cells = <1>; 94 #size-cells = <1>;
95 compatible = "fsl,mpc8540-dma", "fsl,eloplus-dma"; 95 compatible = "fsl,mpc8540-dma", "fsl,eloplus-dma";
96 reg = <21300 4>; 96 reg = <0x21300 4>;
97 ranges = <0 21100 200>; 97 ranges = <0 0x21100 0x200>;
98 cell-index = <0>; 98 cell-index = <0>;
99 dma-channel@0 { 99 dma-channel@0 {
100 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel"; 100 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel";
101 reg = <0 80>; 101 reg = <0 0x80>;
102 cell-index = <0>; 102 cell-index = <0>;
103 interrupt-parent = <&mpic>; 103 interrupt-parent = <&mpic>;
104 interrupts = <14 2>; 104 interrupts = <20 2>;
105 }; 105 };
106 dma-channel@80 { 106 dma-channel@80 {
107 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel"; 107 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel";
108 reg = <80 80>; 108 reg = <0x80 0x80>;
109 cell-index = <1>; 109 cell-index = <1>;
110 interrupt-parent = <&mpic>; 110 interrupt-parent = <&mpic>;
111 interrupts = <15 2>; 111 interrupts = <21 2>;
112 }; 112 };
113 dma-channel@100 { 113 dma-channel@100 {
114 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel"; 114 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel";
115 reg = <100 80>; 115 reg = <0x100 0x80>;
116 cell-index = <2>; 116 cell-index = <2>;
117 interrupt-parent = <&mpic>; 117 interrupt-parent = <&mpic>;
118 interrupts = <16 2>; 118 interrupts = <22 2>;
119 }; 119 };
120 dma-channel@180 { 120 dma-channel@180 {
121 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel"; 121 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel";
122 reg = <180 80>; 122 reg = <0x180 0x80>;
123 cell-index = <3>; 123 cell-index = <3>;
124 interrupt-parent = <&mpic>; 124 interrupt-parent = <&mpic>;
125 interrupts = <17 2>; 125 interrupts = <23 2>;
126 }; 126 };
127 }; 127 };
128 128
diff --git a/Documentation/powerpc/dts-bindings/fsl/esdhc.txt b/Documentation/powerpc/dts-bindings/fsl/esdhc.txt
new file mode 100644
index 000000000000..600846557763
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/esdhc.txt
@@ -0,0 +1,24 @@
1* Freescale Enhanced Secure Digital Host Controller (eSDHC)
2
3The Enhanced Secure Digital Host Controller provides an interface
4for MMC, SD, and SDIO types of memory cards.
5
6Required properties:
7 - compatible : should be
8 "fsl,<chip>-esdhc", "fsl,mpc8379-esdhc" for MPC83xx processors.
9 "fsl,<chip>-esdhc", "fsl,mpc8536-esdhc" for MPC85xx processors.
10 - reg : should contain eSDHC registers location and length.
11 - interrupts : should contain eSDHC interrupt.
12 - interrupt-parent : interrupt source phandle.
13 - clock-frequency : specifies eSDHC base clock frequency.
14
15Example:
16
17sdhci@2e000 {
18 compatible = "fsl,mpc8378-esdhc", "fsl,mpc8379-esdhc";
19 reg = <0x2e000 0x1000>;
20 interrupts = <42 0x8>;
21 interrupt-parent = <&ipic>;
22 /* Filled in by U-Boot */
23 clock-frequency = <0>;
24};
diff --git a/Documentation/powerpc/dts-bindings/fsl/i2c.txt b/Documentation/powerpc/dts-bindings/fsl/i2c.txt
index d0ab33e21fe6..b6d2e21474f9 100644
--- a/Documentation/powerpc/dts-bindings/fsl/i2c.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/i2c.txt
@@ -7,8 +7,10 @@ Required properties :
7 7
8Recommended properties : 8Recommended properties :
9 9
10 - compatible : Should be "fsl-i2c" for parts compatible with 10 - compatible : compatibility list with 2 entries, the first should
11 Freescale I2C specifications. 11 be "fsl,CHIP-i2c" where CHIP is the name of a compatible processor,
12 e.g. mpc8313, mpc8543, mpc8544, mpc5200 or mpc5200b. The second one
13 should be "fsl-i2c".
12 - interrupts : <a b> where a is the interrupt number and b is a 14 - interrupts : <a b> where a is the interrupt number and b is a
13 field that represents an encoding of the sense and level 15 field that represents an encoding of the sense and level
14 information for the interrupt. This should be encoded based on 16 information for the interrupt. This should be encoded based on
@@ -16,17 +18,31 @@ Recommended properties :
16 controller you have. 18 controller you have.
17 - interrupt-parent : the phandle for the interrupt controller that 19 - interrupt-parent : the phandle for the interrupt controller that
18 services interrupts for this device. 20 services interrupts for this device.
19 - dfsrr : boolean; if defined, indicates that this I2C device has 21 - fsl,preserve-clocking : boolean; if defined, the clock settings
20 a digital filter sampling rate register 22 from the bootloader are preserved (not touched).
21 - fsl5200-clocking : boolean; if defined, indicated that this device 23 - clock-frequency : desired I2C bus clock frequency in Hz.
22 uses the FSL 5200 clocking mechanism. 24
23 25Examples :
24Example : 26
25 i2c@3000 { 27 i2c@3d00 {
26 interrupt-parent = <40000>; 28 #address-cells = <1>;
27 interrupts = <1b 3>; 29 #size-cells = <0>;
28 reg = <3000 18>; 30 compatible = "fsl,mpc5200b-i2c","fsl,mpc5200-i2c","fsl-i2c";
29 device_type = "i2c"; 31 cell-index = <0>;
30 compatible = "fsl-i2c"; 32 reg = <0x3d00 0x40>;
31 dfsrr; 33 interrupts = <2 15 0>;
34 interrupt-parent = <&mpc5200_pic>;
35 fsl,preserve-clocking;
32 }; 36 };
37
38 i2c@3100 {
39 #address-cells = <1>;
40 #size-cells = <0>;
41 cell-index = <1>;
42 compatible = "fsl,mpc8544-i2c", "fsl-i2c";
43 reg = <0x3100 0x100>;
44 interrupts = <43 2>;
45 interrupt-parent = <&mpic>;
46 clock-frequency = <400000>;
47 };
48
diff --git a/Documentation/powerpc/dts-bindings/fsl/ssi.txt b/Documentation/powerpc/dts-bindings/fsl/ssi.txt
index a2d963998a65..5ff76c9c57d2 100644
--- a/Documentation/powerpc/dts-bindings/fsl/ssi.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/ssi.txt
@@ -4,44 +4,56 @@ The SSI is a serial device that communicates with audio codecs. It can
4be programmed in AC97, I2S, left-justified, or right-justified modes. 4be programmed in AC97, I2S, left-justified, or right-justified modes.
5 5
6Required properties: 6Required properties:
7- compatible : compatible list, containing "fsl,ssi" 7- compatible: Compatible list, contains "fsl,ssi".
8- cell-index : the SSI, <0> = SSI1, <1> = SSI2, and so on 8- cell-index: The SSI, <0> = SSI1, <1> = SSI2, and so on.
9- reg : offset and length of the register set for the device 9- reg: Offset and length of the register set for the device.
10- interrupts : <a b> where a is the interrupt number and b is a 10- interrupts: <a b> where a is the interrupt number and b is a
11 field that represents an encoding of the sense and 11 field that represents an encoding of the sense and
12 level information for the interrupt. This should be 12 level information for the interrupt. This should be
13 encoded based on the information in section 2) 13 encoded based on the information in section 2)
14 depending on the type of interrupt controller you 14 depending on the type of interrupt controller you
15 have. 15 have.
16- interrupt-parent : the phandle for the interrupt controller that 16- interrupt-parent: The phandle for the interrupt controller that
17 services interrupts for this device. 17 services interrupts for this device.
18- fsl,mode : the operating mode for the SSI interface 18- fsl,mode: The operating mode for the SSI interface.
19 "i2s-slave" - I2S mode, SSI is clock slave 19 "i2s-slave" - I2S mode, SSI is clock slave
20 "i2s-master" - I2S mode, SSI is clock master 20 "i2s-master" - I2S mode, SSI is clock master
21 "lj-slave" - left-justified mode, SSI is clock slave 21 "lj-slave" - left-justified mode, SSI is clock slave
22 "lj-master" - l.j. mode, SSI is clock master 22 "lj-master" - l.j. mode, SSI is clock master
23 "rj-slave" - right-justified mode, SSI is clock slave 23 "rj-slave" - right-justified mode, SSI is clock slave
24 "rj-master" - r.j., SSI is clock master 24 "rj-master" - r.j., SSI is clock master
25 "ac97-slave" - AC97 mode, SSI is clock slave 25 "ac97-slave" - AC97 mode, SSI is clock slave
26 "ac97-master" - AC97 mode, SSI is clock master 26 "ac97-master" - AC97 mode, SSI is clock master
27- fsl,playback-dma: phandle to a node for the DMA channel to use for 27- fsl,playback-dma: Phandle to a node for the DMA channel to use for
28 playback of audio. This is typically dictated by SOC 28 playback of audio. This is typically dictated by SOC
29 design. See the notes below. 29 design. See the notes below.
30- fsl,capture-dma: phandle to a node for the DMA channel to use for 30- fsl,capture-dma: Phandle to a node for the DMA channel to use for
31 capture (recording) of audio. This is typically dictated 31 capture (recording) of audio. This is typically dictated
32 by SOC design. See the notes below. 32 by SOC design. See the notes below.
33- fsl,fifo-depth: The number of elements in the transmit and receive FIFOs.
34 This number is the maximum allowed value for SFCSR[TFWM0].
35- fsl,ssi-asynchronous:
36 If specified, the SSI is to be programmed in asynchronous
37 mode. In this mode, pins SRCK, STCK, SRFS, and STFS must
38 all be connected to valid signals. In synchronous mode,
39 SRCK and SRFS are ignored. Asynchronous mode allows
40 playback and capture to use different sample sizes and
41 sample rates. Some drivers may require that SRCK and STCK
42 be connected together, and SRFS and STFS be connected
43 together. This would still allow different sample sizes,
44 but not different sample rates.
33 45
34Optional properties: 46Optional properties:
35- codec-handle : phandle to a 'codec' node that defines an audio 47- codec-handle: Phandle to a 'codec' node that defines an audio
36 codec connected to this SSI. This node is typically 48 codec connected to this SSI. This node is typically
37 a child of an I2C or other control node. 49 a child of an I2C or other control node.
38 50
39Child 'codec' node required properties: 51Child 'codec' node required properties:
40- compatible : compatible list, contains the name of the codec 52- compatible: Compatible list, contains the name of the codec
41 53
42Child 'codec' node optional properties: 54Child 'codec' node optional properties:
43- clock-frequency : The frequency of the input clock, which typically 55- clock-frequency: The frequency of the input clock, which typically comes
44 comes from an on-board dedicated oscillator. 56 from an on-board dedicated oscillator.
45 57
46Notes on fsl,playback-dma and fsl,capture-dma: 58Notes on fsl,playback-dma and fsl,capture-dma:
47 59
diff --git a/Documentation/powerpc/dts-bindings/fsl/tsec.txt b/Documentation/powerpc/dts-bindings/fsl/tsec.txt
index 7fa4b27574b5..edb7ae19e868 100644
--- a/Documentation/powerpc/dts-bindings/fsl/tsec.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/tsec.txt
@@ -56,6 +56,12 @@ Properties:
56 hardware. 56 hardware.
57 - fsl,magic-packet : If present, indicates that the hardware supports 57 - fsl,magic-packet : If present, indicates that the hardware supports
58 waking up via magic packet. 58 waking up via magic packet.
59 - bd-stash : If present, indicates that the hardware supports stashing
60 buffer descriptors in the L2.
61 - rx-stash-len : Denotes the number of bytes of a received buffer to stash
62 in the L2.
63 - rx-stash-idx : Denotes the index of the first byte from the received
64 buffer to stash in the L2.
59 65
60Example: 66Example:
61 ethernet@24000 { 67 ethernet@24000 {
diff --git a/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt b/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt
index 84a04d5eb8e6..a48b2cadc7f0 100644
--- a/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt
@@ -5,9 +5,21 @@ Required properties:
5- reg : should specify localbus chip select and size used for the chip. 5- reg : should specify localbus chip select and size used for the chip.
6- fsl,upm-addr-offset : UPM pattern offset for the address latch. 6- fsl,upm-addr-offset : UPM pattern offset for the address latch.
7- fsl,upm-cmd-offset : UPM pattern offset for the command latch. 7- fsl,upm-cmd-offset : UPM pattern offset for the command latch.
8- gpios : may specify optional GPIO connected to the Ready-Not-Busy pin.
9 8
10Example: 9Optional properties:
10- fsl,upm-wait-flags : add chip-dependent short delays after running the
11 UPM pattern (0x1), after writing a data byte (0x2) or after
12 writing out a buffer (0x4).
13- fsl,upm-addr-line-cs-offsets : address offsets for multi-chip support.
14 The corresponding address lines are used to select the chip.
15- gpios : may specify optional GPIOs connected to the Ready-Not-Busy pins
16 (R/B#). For multi-chip devices, "n" GPIO definitions are required
17 according to the number of chips.
18- chip-delay : chip dependent delay for transfering data from array to
19 read registers (tR). Required if property "gpios" is not used
20 (R/B# pins not connected).
21
22Examples:
11 23
12upm@1,0 { 24upm@1,0 {
13 compatible = "fsl,upm-nand"; 25 compatible = "fsl,upm-nand";
@@ -26,3 +38,26 @@ upm@1,0 {
26 }; 38 };
27 }; 39 };
28}; 40};
41
42upm@3,0 {
43 #address-cells = <0>;
44 #size-cells = <0>;
45 compatible = "tqc,tqm8548-upm-nand", "fsl,upm-nand";
46 reg = <3 0x0 0x800>;
47 fsl,upm-addr-offset = <0x10>;
48 fsl,upm-cmd-offset = <0x08>;
49 /* Multi-chip NAND device */
50 fsl,upm-addr-line-cs-offsets = <0x0 0x200>;
51 fsl,upm-wait-flags = <0x5>;
52 chip-delay = <25>; // in micro-seconds
53
54 nand@0 {
55 #address-cells = <1>;
56 #size-cells = <1>;
57
58 partition@0 {
59 label = "fs";
60 reg = <0x00000000 0x10000000>;
61 };
62 };
63};
diff --git a/Documentation/powerpc/dts-bindings/gpio/led.txt b/Documentation/powerpc/dts-bindings/gpio/led.txt
index ff51f4c0fa9d..4fe14deedc0a 100644
--- a/Documentation/powerpc/dts-bindings/gpio/led.txt
+++ b/Documentation/powerpc/dts-bindings/gpio/led.txt
@@ -1,15 +1,43 @@
1LED connected to GPIO 1LEDs connected to GPIO lines
2 2
3Required properties: 3Required properties:
4- compatible : should be "gpio-led". 4- compatible : should be "gpio-leds".
5- label : (optional) the label for this LED. If omitted, the label is 5
6Each LED is represented as a sub-node of the gpio-leds device. Each
7node's name represents the name of the corresponding LED.
8
9LED sub-node properties:
10- gpios : Should specify the LED's GPIO, see "Specifying GPIO information
11 for devices" in Documentation/powerpc/booting-without-of.txt. Active
12 low LEDs should be indicated using flags in the GPIO specifier.
13- label : (optional) The label for this LED. If omitted, the label is
6 taken from the node name (excluding the unit address). 14 taken from the node name (excluding the unit address).
7- gpios : should specify LED GPIO. 15- linux,default-trigger : (optional) This parameter, if present, is a
16 string defining the trigger assigned to the LED. Current triggers are:
17 "backlight" - LED will act as a back-light, controlled by the framebuffer
18 system
19 "default-on" - LED will turn on
20 "heartbeat" - LED "double" flashes at a load average based rate
21 "ide-disk" - LED indicates disk activity
22 "timer" - LED flashes at a fixed, configurable rate
8 23
9Example: 24Examples:
10 25
11led@0 { 26leds {
12 compatible = "gpio-led"; 27 compatible = "gpio-leds";
13 label = "hdd"; 28 hdd {
14 gpios = <&mcu_pio 0 1>; 29 label = "IDE Activity";
30 gpios = <&mcu_pio 0 1>; /* Active low */
31 linux,default-trigger = "ide-disk";
32 };
15}; 33};
34
35run-control {
36 compatible = "gpio-leds";
37 red {
38 gpios = <&mpc8572 6 0>;
39 };
40 green {
41 gpios = <&mpc8572 7 0>;
42 };
43}
diff --git a/Documentation/powerpc/dts-bindings/mmc-spi-slot.txt b/Documentation/powerpc/dts-bindings/mmc-spi-slot.txt
new file mode 100644
index 000000000000..c39ac2891951
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/mmc-spi-slot.txt
@@ -0,0 +1,23 @@
1MMC/SD/SDIO slot directly connected to a SPI bus
2
3Required properties:
4- compatible : should be "mmc-spi-slot".
5- reg : should specify SPI address (chip-select number).
6- spi-max-frequency : maximum frequency for this device (Hz).
7- voltage-ranges : two cells are required, first cell specifies minimum
8 slot voltage (mV), second cell specifies maximum slot voltage (mV).
9 Several ranges could be specified.
10- gpios : (optional) may specify GPIOs in this order: Card-Detect GPIO,
11 Write-Protect GPIO.
12
13Example:
14
15 mmc-slot@0 {
16 compatible = "fsl,mpc8323rdb-mmc-slot",
17 "mmc-spi-slot";
18 reg = <0>;
19 gpios = <&qe_pio_d 14 1
20 &qe_pio_d 15 0>;
21 voltage-ranges = <3300 3300>;
22 spi-max-frequency = <50000000>;
23 };
diff --git a/Documentation/powerpc/dts-bindings/mtd-physmap.txt b/Documentation/powerpc/dts-bindings/mtd-physmap.txt
new file mode 100644
index 000000000000..667c9bde8699
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/mtd-physmap.txt
@@ -0,0 +1,80 @@
1CFI or JEDEC memory-mapped NOR flash
2
3Flash chips (Memory Technology Devices) are often used for solid state
4file systems on embedded devices.
5
6 - compatible : should contain the specific model of flash chip(s)
7 used, if known, followed by either "cfi-flash" or "jedec-flash"
8 - reg : Address range(s) of the flash chip(s)
9 It's possible to (optionally) define multiple "reg" tuples so that
10 non-identical NOR chips can be described in one flash node.
11 - bank-width : Width (in bytes) of the flash bank. Equal to the
12 device width times the number of interleaved chips.
13 - device-width : (optional) Width of a single flash chip. If
14 omitted, assumed to be equal to 'bank-width'.
15 - #address-cells, #size-cells : Must be present if the flash has
16 sub-nodes representing partitions (see below). In this case
17 both #address-cells and #size-cells must be equal to 1.
18
19For JEDEC compatible devices, the following additional properties
20are defined:
21
22 - vendor-id : Contains the flash chip's vendor id (1 byte).
23 - device-id : Contains the flash chip's device id (1 byte).
24
25In addition to the information on the flash bank itself, the
26device tree may optionally contain additional information
27describing partitions of the flash address space. This can be
28used on platforms which have strong conventions about which
29portions of the flash are used for what purposes, but which don't
30use an on-flash partition table such as RedBoot.
31
32Each partition is represented as a sub-node of the flash device.
33Each node's name represents the name of the corresponding
34partition of the flash device.
35
36Flash partitions
37 - reg : The partition's offset and size within the flash bank.
38 - label : (optional) The label / name for this flash partition.
39 If omitted, the label is taken from the node name (excluding
40 the unit address).
41 - read-only : (optional) This parameter, if present, is a hint to
42 Linux that this flash partition should only be mounted
43 read-only. This is usually used for flash partitions
44 containing early-boot firmware images or data which should not
45 be clobbered.
46
47Example:
48
49 flash@ff000000 {
50 compatible = "amd,am29lv128ml", "cfi-flash";
51 reg = <ff000000 01000000>;
52 bank-width = <4>;
53 device-width = <1>;
54 #address-cells = <1>;
55 #size-cells = <1>;
56 fs@0 {
57 label = "fs";
58 reg = <0 f80000>;
59 };
60 firmware@f80000 {
61 label ="firmware";
62 reg = <f80000 80000>;
63 read-only;
64 };
65 };
66
67Here an example with multiple "reg" tuples:
68
69 flash@f0000000,0 {
70 #address-cells = <1>;
71 #size-cells = <1>;
72 compatible = "intel,PC48F4400P0VB", "cfi-flash";
73 reg = <0 0x00000000 0x02000000
74 0 0x02000000 0x02000000>;
75 bank-width = <2>;
76 partition@0 {
77 label = "test-part1";
78 reg = <0 0x04000000>;
79 };
80 };
diff --git a/Documentation/scheduler/00-INDEX b/Documentation/scheduler/00-INDEX
index aabcc3a089ba..3c00c9c3219e 100644
--- a/Documentation/scheduler/00-INDEX
+++ b/Documentation/scheduler/00-INDEX
@@ -2,8 +2,6 @@
2 - this file. 2 - this file.
3sched-arch.txt 3sched-arch.txt
4 - CPU Scheduler implementation hints for architecture specific code. 4 - CPU Scheduler implementation hints for architecture specific code.
5sched-coding.txt
6 - reference for various scheduler-related methods in the O(1) scheduler.
7sched-design-CFS.txt 5sched-design-CFS.txt
8 - goals, design and implementation of the Complete Fair Scheduler. 6 - goals, design and implementation of the Complete Fair Scheduler.
9sched-domains.txt 7sched-domains.txt
diff --git a/Documentation/scheduler/sched-coding.txt b/Documentation/scheduler/sched-coding.txt
deleted file mode 100644
index cbd8db752acf..000000000000
--- a/Documentation/scheduler/sched-coding.txt
+++ /dev/null
@@ -1,126 +0,0 @@
1 Reference for various scheduler-related methods in the O(1) scheduler
2 Robert Love <rml@tech9.net>, MontaVista Software
3
4
5Note most of these methods are local to kernel/sched.c - this is by design.
6The scheduler is meant to be self-contained and abstracted away. This document
7is primarily for understanding the scheduler, not interfacing to it. Some of
8the discussed interfaces, however, are general process/scheduling methods.
9They are typically defined in include/linux/sched.h.
10
11
12Main Scheduling Methods
13-----------------------
14
15void load_balance(runqueue_t *this_rq, int idle)
16 Attempts to pull tasks from one cpu to another to balance cpu usage,
17 if needed. This method is called explicitly if the runqueues are
18 imbalanced or periodically by the timer tick. Prior to calling,
19 the current runqueue must be locked and interrupts disabled.
20
21void schedule()
22 The main scheduling function. Upon return, the highest priority
23 process will be active.
24
25
26Locking
27-------
28
29Each runqueue has its own lock, rq->lock. When multiple runqueues need
30to be locked, lock acquires must be ordered by ascending &runqueue value.
31
32A specific runqueue is locked via
33
34 task_rq_lock(task_t pid, unsigned long *flags)
35
36which disables preemption, disables interrupts, and locks the runqueue pid is
37running on. Likewise,
38
39 task_rq_unlock(task_t pid, unsigned long *flags)
40
41unlocks the runqueue pid is running on, restores interrupts to their previous
42state, and reenables preemption.
43
44The routines
45
46 double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
47
48and
49
50 double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
51
52safely lock and unlock, respectively, the two specified runqueues. They do
53not, however, disable and restore interrupts. Users are required to do so
54manually before and after calls.
55
56
57Values
58------
59
60MAX_PRIO
61 The maximum priority of the system, stored in the task as task->prio.
62 Lower priorities are higher. Normal (non-RT) priorities range from
63 MAX_RT_PRIO to (MAX_PRIO - 1).
64MAX_RT_PRIO
65 The maximum real-time priority of the system. Valid RT priorities
66 range from 0 to (MAX_RT_PRIO - 1).
67MAX_USER_RT_PRIO
68 The maximum real-time priority that is exported to user-space. Should
69 always be equal to or less than MAX_RT_PRIO. Setting it less allows
70 kernel threads to have higher priorities than any user-space task.
71MIN_TIMESLICE
72MAX_TIMESLICE
73 Respectively, the minimum and maximum timeslices (quanta) of a process.
74
75Data
76----
77
78struct runqueue
79 The main per-CPU runqueue data structure.
80struct task_struct
81 The main per-process data structure.
82
83
84General Methods
85---------------
86
87cpu_rq(cpu)
88 Returns the runqueue of the specified cpu.
89this_rq()
90 Returns the runqueue of the current cpu.
91task_rq(pid)
92 Returns the runqueue which holds the specified pid.
93cpu_curr(cpu)
94 Returns the task currently running on the given cpu.
95rt_task(pid)
96 Returns true if pid is real-time, false if not.
97
98
99Process Control Methods
100-----------------------
101
102void set_user_nice(task_t *p, long nice)
103 Sets the "nice" value of task p to the given value.
104int setscheduler(pid_t pid, int policy, struct sched_param *param)
105 Sets the scheduling policy and parameters for the given pid.
106int set_cpus_allowed(task_t *p, unsigned long new_mask)
107 Sets a given task's CPU affinity and migrates it to a proper cpu.
108 Callers must have a valid reference to the task and assure the
109 task not exit prematurely. No locks can be held during the call.
110set_task_state(tsk, state_value)
111 Sets the given task's state to the given value.
112set_current_state(state_value)
113 Sets the current task's state to the given value.
114void set_tsk_need_resched(struct task_struct *tsk)
115 Sets need_resched in the given task.
116void clear_tsk_need_resched(struct task_struct *tsk)
117 Clears need_resched in the given task.
118void set_need_resched()
119 Sets need_resched in the current task.
120void clear_need_resched()
121 Clears need_resched in the current task.
122int need_resched()
123 Returns true if need_resched is set in the current task, false
124 otherwise.
125yield()
126 Place the current process at the end of the runqueue and call schedule.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 3ef339f491e0..5ba4d3fc625a 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -126,7 +126,7 @@ This uses the /cgroup virtual file system and "/cgroup/<cgroup>/cpu.rt_runtime_u
126to control the CPU time reserved for each control group instead. 126to control the CPU time reserved for each control group instead.
127 127
128For more information on working with control groups, you should read 128For more information on working with control groups, you should read
129Documentation/cgroups.txt as well. 129Documentation/cgroups/cgroups.txt as well.
130 130
131Group settings are checked against the following limits in order to keep the configuration 131Group settings are checked against the following limits in order to keep the configuration
132schedulable: 132schedulable:
diff --git a/Documentation/scsi/aacraid.txt b/Documentation/scsi/aacraid.txt
index ddace3afc83b..30f643f611b2 100644
--- a/Documentation/scsi/aacraid.txt
+++ b/Documentation/scsi/aacraid.txt
@@ -60,17 +60,9 @@ Supported Cards/Chipsets
60 9005:0285:9005:02d5 Adaptec ASR-2405 (Voodoo40 Lite) 60 9005:0285:9005:02d5 Adaptec ASR-2405 (Voodoo40 Lite)
61 9005:0285:9005:02d6 Adaptec ASR-2445 (Voodoo44 Lite) 61 9005:0285:9005:02d6 Adaptec ASR-2445 (Voodoo44 Lite)
62 9005:0285:9005:02d7 Adaptec ASR-2805 (Voodoo80 Lite) 62 9005:0285:9005:02d7 Adaptec ASR-2805 (Voodoo80 Lite)
63 9005:0285:9005:02d8 Adaptec 5405G (Voodoo40 PM) 63 9005:0285:9005:02d8 Adaptec 5405Z (Voodoo40 BLBU)
64 9005:0285:9005:02d9 Adaptec 5445G (Voodoo44 PM) 64 9005:0285:9005:02d9 Adaptec 5445Z (Voodoo44 BLBU)
65 9005:0285:9005:02da Adaptec 5805G (Voodoo80 PM) 65 9005:0285:9005:02da Adaptec 5805Z (Voodoo80 BLBU)
66 9005:0285:9005:02db Adaptec 5085G (Voodoo08 PM)
67 9005:0285:9005:02dc Adaptec 51245G (Voodoo124 PM)
68 9005:0285:9005:02dd Adaptec 51645G (Voodoo164 PM)
69 9005:0285:9005:02de Adaptec 52445G (Voodoo244 PM)
70 9005:0285:9005:02df Adaptec ASR-2045G (Voodoo04 Lite PM)
71 9005:0285:9005:02e0 Adaptec ASR-2405G (Voodoo40 Lite PM)
72 9005:0285:9005:02e1 Adaptec ASR-2445G (Voodoo44 Lite PM)
73 9005:0285:9005:02e2 Adaptec ASR-2805G (Voodoo80 Lite PM)
74 1011:0046:9005:0364 Adaptec 5400S (Mustang) 66 1011:0046:9005:0364 Adaptec 5400S (Mustang)
75 1011:0046:9005:0365 Adaptec 5400S (Mustang) 67 1011:0046:9005:0365 Adaptec 5400S (Mustang)
76 9005:0287:9005:0800 Adaptec Themisto (Jupiter) 68 9005:0287:9005:0800 Adaptec Themisto (Jupiter)
@@ -140,6 +132,7 @@ Deanna Bonds (non-DASD support, PAE fibs and 64 bit,
140 where fibs that go to the hardware are consistently called hw_fibs and 132 where fibs that go to the hardware are consistently called hw_fibs and
141 not just fibs like the name of the driver tracking structure) 133 not just fibs like the name of the driver tracking structure)
142Mark Salyzyn <Mark_Salyzyn@adaptec.com> Fixed panic issues and added some new product ids for upcoming hbas. Performance tuning, card failover and bug mitigations. 134Mark Salyzyn <Mark_Salyzyn@adaptec.com> Fixed panic issues and added some new product ids for upcoming hbas. Performance tuning, card failover and bug mitigations.
135Achim Leubner <Achim_Leubner@adaptec.com>
143 136
144Original Driver 137Original Driver
145------------------------- 138-------------------------
diff --git a/Documentation/scsi/osd.txt b/Documentation/scsi/osd.txt
new file mode 100644
index 000000000000..da162f7fd5f5
--- /dev/null
+++ b/Documentation/scsi/osd.txt
@@ -0,0 +1,198 @@
1The OSD Standard
2================
3OSD (Object-Based Storage Device) is a T10 SCSI command set that is designed
4to provide efficient operation of input/output logical units that manage the
5allocation, placement, and accessing of variable-size data-storage containers,
6called objects. Objects are intended to contain operating system and application
7constructs. Each object has associated attributes attached to it, which are
8integral part of the object and provide metadata about the object. The standard
9defines some common obligatory attributes, but user attributes can be added as
10needed.
11
12See: http://www.t10.org/ftp/t10/drafts/osd2/ for the latest draft for OSD 2
13or search the web for "OSD SCSI"
14
15OSD in the Linux Kernel
16=======================
17osd-initiator:
18 The main component of OSD in Kernel is the osd-initiator library. Its main
19user is intended to be the pNFS-over-objects layout driver, which uses objects
20as its back-end data storage. Other clients are the other osd parts listed below.
21
22osd-uld:
23 This is a SCSI ULD that registers for OSD type devices and provides a testing
24platform, both for the in-kernel initiator as well as connected targets. It
25currently has no useful user-mode API, though it could have if need be.
26
27exofs:
28 Is an OSD based Linux file system. It uses the osd-initiator and osd-uld,
29to export a usable file system for users.
30See Documentation/filesystems/exofs.txt for more details
31
32osd target:
33 There are no current plans for an OSD target implementation in kernel. For all
34needs, a user-mode target that is based on the scsi tgt target framework is
35available from Ohio Supercomputer Center (OSC) at:
36http://www.open-osd.org/bin/view/Main/OscOsdProject
37There are several other target implementations. See http://open-osd.org for more
38links.
39
40Files and Folders
41=================
42This is the complete list of files included in this work:
43include/scsi/
44 osd_initiator.h Main API for the initiator library
45 osd_types.h Common OSD types
46 osd_sec.h Security Manager API
47 osd_protocol.h Wire definitions of the OSD standard protocol
48 osd_attributes.h Wire definitions of OSD attributes
49
50drivers/scsi/osd/
51 osd_initiator.c OSD-Initiator library implementation
52 osd_uld.c The OSD scsi ULD
53 osd_ktest.{h,c} In-kernel test suite (called by osd_uld)
54 osd_debug.h Some printk macros
55 Makefile For both in-tree and out-of-tree compilation
56 Kconfig Enables inclusion of the different pieces
57 osd_test.c User-mode application to call the kernel tests
58
59The OSD-Initiator Library
60=========================
61osd_initiator is a low level implementation of an osd initiator encoder.
62But even though, it should be intuitive and easy to use. Perhaps over time an
63higher lever will form that automates some of the more common recipes.
64
65init/fini:
66- osd_dev_init() associates a scsi_device with an osd_dev structure
67 and initializes some global pools. This should be done once per scsi_device
68 (OSD LUN). The osd_dev structure is needed for calling osd_start_request().
69
70- osd_dev_fini() cleans up before a osd_dev/scsi_device destruction.
71
72OSD commands encoding, execution, and decoding of results:
73
74struct osd_request's is used to iteratively encode an OSD command and carry
75its state throughout execution. Each request goes through these stages:
76
77a. osd_start_request() allocates the request.
78
79b. Any of the osd_req_* methods is used to encode a request of the specified
80 type.
81
82c. osd_req_add_{get,set}_attr_* may be called to add get/set attributes to the
83 CDB. "List" or "Page" mode can be used exclusively. The attribute-list API
84 can be called multiple times on the same request. However, only one
85 attribute-page can be read, as mandated by the OSD standard.
86
87d. osd_finalize_request() computes offsets into the data-in and data-out buffers
88 and signs the request using the provided capability key and integrity-
89 check parameters.
90
91e. osd_execute_request() may be called to execute the request via the block
92 layer and wait for its completion. The request can be executed
93 asynchronously by calling the block layer API directly.
94
95f. After execution, osd_req_decode_sense() can be called to decode the request's
96 sense information.
97
98g. osd_req_decode_get_attr() may be called to retrieve osd_add_get_attr_list()
99 values.
100
101h. osd_end_request() must be called to deallocate the request and any resource
102 associated with it. Note that osd_end_request cleans up the request at any
103 stage and it must always be called after a successful osd_start_request().
104
105osd_request's structure:
106
107The OSD standard defines a complex structure of IO segments pointed to by
108members in the CDB. Up to 3 segments can be deployed in the IN-Buffer and up to
1094 in the OUT-Buffer. The ASCII illustration below depicts a secure-read with
110associated get+set of attributes-lists. Other combinations very on the same
111basic theme. From no-segments-used up to all-segments-used.
112
113|________OSD-CDB__________|
114| |
115|read_len (offset=0) -|---------\
116| | |
117|get_attrs_list_length | |
118|get_attrs_list_offset -|----\ |
119| | | |
120|retrieved_attrs_alloc_len| | |
121|retrieved_attrs_offset -|----|----|-\
122| | | | |
123|set_attrs_list_length | | | |
124|set_attrs_list_offset -|-\ | | |
125| | | | | |
126|in_data_integ_offset -|-|--|----|-|-\
127|out_data_integ_offset -|-|--|--\ | | |
128\_________________________/ | | | | | |
129 | | | | | |
130|_______OUT-BUFFER________| | | | | | |
131| Set attr list |</ | | | | |
132| | | | | | |
133|-------------------------| | | | | |
134| Get attr descriptors |<---/ | | | |
135| | | | | |
136|-------------------------| | | | |
137| Out-data integrity |<------/ | | |
138| | | | |
139\_________________________/ | | |
140 | | |
141|________IN-BUFFER________| | | |
142| In-Data read |<--------/ | |
143| | | |
144|-------------------------| | |
145| Get attr list |<----------/ |
146| | |
147|-------------------------| |
148| In-data integrity |<------------/
149| |
150\_________________________/
151
152A block device request can carry bidirectional payload by means of associating
153a bidi_read request with a main write-request. Each in/out request is described
154by a chain of BIOs associated with each request.
155The CDB is of a SCSI VARLEN CDB format, as described by OSD standard.
156The OSD standard also mandates alignment restrictions at start of each segment.
157
158In the code, in struct osd_request, there are two _osd_io_info structures to
159describe the IN/OUT buffers above, two BIOs for the data payload and up to five
160_osd_req_data_segment structures to hold the different segments allocation and
161information.
162
163Important: We have chosen to disregard the assumption that a BIO-chain (and
164the resulting sg-list) describes a linear memory buffer. Meaning only first and
165last scatter chain can be incomplete and all the middle chains are of PAGE_SIZE.
166For us, a scatter-gather-list, as its name implies and as used by the Networking
167layer, is to describe a vector of buffers that will be transferred to/from the
168wire. It works very well with current iSCSI transport. iSCSI is currently the
169only deployed OSD transport. In the future we anticipate SAS and FC attached OSD
170devices as well.
171
172The OSD Testing ULD
173===================
174TODO: More user-mode control on tests.
175
176Authors, Mailing list
177=====================
178Please communicate with us on any deployment of osd, whether using this code
179or not.
180
181Any problems, questions, bug reports, lonely OSD nights, please email:
182 OSD Dev List <osd-dev@open-osd.org>
183
184More up-to-date information can be found on:
185http://open-osd.org
186
187Boaz Harrosh <bharrosh@panasas.com>
188Benny Halevy <bhalevy@panasas.com>
189
190References
191==========
192Weber, R., "SCSI Object-Based Storage Device Commands",
193T10/1355-D ANSI/INCITS 400-2004,
194http://www.t10.org/ftp/t10/drafts/osd/osd-r10.pdf
195
196Weber, R., "SCSI Object-Based Storage Device Commands -2 (OSD-2)"
197T10/1729-D, Working Draft, rev. 3
198http://www.t10.org/ftp/t10/drafts/osd2/osd2r03.pdf
diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
new file mode 100644
index 000000000000..ebc50f808ea4
--- /dev/null
+++ b/Documentation/slow-work.txt
@@ -0,0 +1,174 @@
1 ====================================
2 SLOW WORK ITEM EXECUTION THREAD POOL
3 ====================================
4
5By: David Howells <dhowells@redhat.com>
6
7The slow work item execution thread pool is a pool of threads for performing
8things that take a relatively long time, such as making mkdir calls.
9Typically, when processing something, these items will spend a lot of time
10blocking a thread on I/O, thus making that thread unavailable for doing other
11work.
12
13The standard workqueue model is unsuitable for this class of work item as that
14limits the owner to a single thread or a single thread per CPU. For some
15tasks, however, more threads - or fewer - are required.
16
17There is just one pool per system. It contains no threads unless something
18wants to use it - and that something must register its interest first. When
19the pool is active, the number of threads it contains is dynamic, varying
20between a maximum and minimum setting, depending on the load.
21
22
23====================
24CLASSES OF WORK ITEM
25====================
26
27This pool support two classes of work items:
28
29 (*) Slow work items.
30
31 (*) Very slow work items.
32
33The former are expected to finish much quicker than the latter.
34
35An operation of the very slow class may do a batch combination of several
36lookups, mkdirs, and a create for instance.
37
38An operation of the ordinarily slow class may, for example, write stuff or
39expand files, provided the time taken to do so isn't too long.
40
41Operations of both types may sleep during execution, thus tying up the thread
42loaned to it.
43
44
45THREAD-TO-CLASS ALLOCATION
46--------------------------
47
48Not all the threads in the pool are available to work on very slow work items.
49The number will be between one and one fewer than the number of active threads.
50This is configurable (see the "Pool Configuration" section).
51
52All the threads are available to work on ordinarily slow work items, but a
53percentage of the threads will prefer to work on very slow work items.
54
55The configuration ensures that at least one thread will be available to work on
56very slow work items, and at least one thread will be available that won't work
57on very slow work items at all.
58
59
60=====================
61USING SLOW WORK ITEMS
62=====================
63
64Firstly, a module or subsystem wanting to make use of slow work items must
65register its interest:
66
67 int ret = slow_work_register_user();
68
69This will return 0 if successful, or a -ve error upon failure.
70
71
72Slow work items may then be set up by:
73
74 (1) Declaring a slow_work struct type variable:
75
76 #include <linux/slow-work.h>
77
78 struct slow_work myitem;
79
80 (2) Declaring the operations to be used for this item:
81
82 struct slow_work_ops myitem_ops = {
83 .get_ref = myitem_get_ref,
84 .put_ref = myitem_put_ref,
85 .execute = myitem_execute,
86 };
87
88 [*] For a description of the ops, see section "Item Operations".
89
90 (3) Initialising the item:
91
92 slow_work_init(&myitem, &myitem_ops);
93
94 or:
95
96 vslow_work_init(&myitem, &myitem_ops);
97
98 depending on its class.
99
100A suitably set up work item can then be enqueued for processing:
101
102 int ret = slow_work_enqueue(&myitem);
103
104This will return a -ve error if the thread pool is unable to gain a reference
105on the item, 0 otherwise.
106
107
108The items are reference counted, so there ought to be no need for a flush
109operation. When all a module's slow work items have been processed, and the
110module has no further interest in the facility, it should unregister its
111interest:
112
113 slow_work_unregister_user();
114
115
116===============
117ITEM OPERATIONS
118===============
119
120Each work item requires a table of operations of type struct slow_work_ops.
121All members are required:
122
123 (*) Get a reference on an item:
124
125 int (*get_ref)(struct slow_work *work);
126
127 This allows the thread pool to attempt to pin an item by getting a
128 reference on it. This function should return 0 if the reference was
129 granted, or a -ve error otherwise. If an error is returned,
130 slow_work_enqueue() will fail.
131
132 The reference is held whilst the item is queued and whilst it is being
133 executed. The item may then be requeued with the same reference held, or
134 the reference will be released.
135
136 (*) Release a reference on an item:
137
138 void (*put_ref)(struct slow_work *work);
139
140 This allows the thread pool to unpin an item by releasing the reference on
141 it. The thread pool will not touch the item again once this has been
142 called.
143
144 (*) Execute an item:
145
146 void (*execute)(struct slow_work *work);
147
148 This should perform the work required of the item. It may sleep, it may
149 perform disk I/O and it may wait for locks.
150
151
152==================
153POOL CONFIGURATION
154==================
155
156The slow-work thread pool has a number of configurables:
157
158 (*) /proc/sys/kernel/slow-work/min-threads
159
160 The minimum number of threads that should be in the pool whilst it is in
161 use. This may be anywhere between 2 and max-threads.
162
163 (*) /proc/sys/kernel/slow-work/max-threads
164
165 The maximum number of threads that should in the pool. This may be
166 anywhere between min-threads and 255 or NR_CPUS * 2, whichever is greater.
167
168 (*) /proc/sys/kernel/slow-work/vslow-percentage
169
170 The percentage of active threads in the pool that may be used to execute
171 very slow work items. This may be between 1 and 99. The resultant number
172 is bounded to between 1 and one fewer than the number of active threads.
173 This ensures there is always at least one thread that can process very
174 slow work items, and always at least one thread that won't.
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
index 841a9365d5fd..012858d2b119 100644
--- a/Documentation/sound/alsa/ALSA-Configuration.txt
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -346,6 +346,9 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
346 sbirq - IRQ # for CMI8330 chip (SB16) 346 sbirq - IRQ # for CMI8330 chip (SB16)
347 sbdma8 - 8bit DMA # for CMI8330 chip (SB16) 347 sbdma8 - 8bit DMA # for CMI8330 chip (SB16)
348 sbdma16 - 16bit DMA # for CMI8330 chip (SB16) 348 sbdma16 - 16bit DMA # for CMI8330 chip (SB16)
349 fmport - (optional) OPL3 I/O port
350 mpuport - (optional) MPU401 I/O port
351 mpuirq - (optional) MPU401 irq #
349 352
350 This module supports multiple cards and autoprobe. 353 This module supports multiple cards and autoprobe.
351 354
@@ -388,34 +391,11 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
388 391
389 The power-management is supported. 392 The power-management is supported.
390 393
391 Module snd-cs4232
392 -----------------
393
394 Module for sound cards based on CS4232/CS4232A ISA chips.
395
396 isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
397
398 with isapnp=0, the following options are available:
399
400 port - port # for CS4232 chip (PnP setup - 0x534)
401 cport - control port # for CS4232 chip (PnP setup - 0x120,0x210,0xf00)
402 mpu_port - port # for MPU-401 UART (PnP setup - 0x300), -1 = disable
403 fm_port - FM port # for CS4232 chip (PnP setup - 0x388), -1 = disable
404 irq - IRQ # for CS4232 chip (5,7,9,11,12,15)
405 mpu_irq - IRQ # for MPU-401 UART (9,11,12,15)
406 dma1 - first DMA # for CS4232 chip (0,1,3)
407 dma2 - second DMA # for Yamaha CS4232 chip (0,1,3), -1 = disable
408
409 This module supports multiple cards. This module does not support autoprobe
410 (if ISA PnP is not used) thus main port must be specified!!! Other ports are
411 optional.
412
413 The power-management is supported.
414
415 Module snd-cs4236 394 Module snd-cs4236
416 ----------------- 395 -----------------
417 396
418 Module for sound cards based on CS4235/CS4236/CS4236B/CS4237B/ 397 Module for sound cards based on CS4232/CS4232A,
398 CS4235/CS4236/CS4236B/CS4237B/
419 CS4238B/CS4239 ISA chips. 399 CS4238B/CS4239 ISA chips.
420 400
421 isapnp - ISA PnP detection - 0 = disable, 1 = enable (default) 401 isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
@@ -437,6 +417,9 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
437 417
438 The power-management is supported. 418 The power-management is supported.
439 419
420 This module is aliased as snd-cs4232 since it provides the old
421 snd-cs4232 functionality, too.
422
440 Module snd-cs4281 423 Module snd-cs4281
441 ----------------- 424 -----------------
442 425
@@ -606,6 +589,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
606 Module for ESS AudioDrive ES-1688 and ES-688 sound cards. 589 Module for ESS AudioDrive ES-1688 and ES-688 sound cards.
607 590
608 port - port # for ES-1688 chip (0x220,0x240,0x260) 591 port - port # for ES-1688 chip (0x220,0x240,0x260)
592 fm_port - port # for OPL3 (option; share the same port as default)
609 mpu_port - port # for MPU-401 port (0x300,0x310,0x320,0x330), -1 = disable (default) 593 mpu_port - port # for MPU-401 port (0x300,0x310,0x320,0x330), -1 = disable (default)
610 irq - IRQ # for ES-1688 chip (5,7,9,10) 594 irq - IRQ # for ES-1688 chip (5,7,9,10)
611 mpu_irq - IRQ # for MPU-401 port (5,7,9,10) 595 mpu_irq - IRQ # for MPU-401 port (5,7,9,10)
@@ -757,6 +741,9 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
757 model - force the model name 741 model - force the model name
758 position_fix - Fix DMA pointer (0 = auto, 1 = use LPIB, 2 = POSBUF) 742 position_fix - Fix DMA pointer (0 = auto, 1 = use LPIB, 2 = POSBUF)
759 probe_mask - Bitmask to probe codecs (default = -1, meaning all slots) 743 probe_mask - Bitmask to probe codecs (default = -1, meaning all slots)
744 When the bit 8 (0x100) is set, the lower 8 bits are used
745 as the "fixed" codec slots; i.e. the driver probes the
746 slots regardless what hardware reports back
760 probe_only - Only probing and no codec initialization (default=off); 747 probe_only - Only probing and no codec initialization (default=off);
761 Useful to check the initial codec status for debugging 748 Useful to check the initial codec status for debugging
762 bdl_pos_adj - Specifies the DMA IRQ timing delay in samples. 749 bdl_pos_adj - Specifies the DMA IRQ timing delay in samples.
@@ -1185,6 +1172,54 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
1185 1172
1186 This module supports multiple devices and PnP. 1173 This module supports multiple devices and PnP.
1187 1174
1175 Module snd-msnd-classic
1176 -----------------------
1177
1178 Module for Turtle Beach MultiSound Classic, Tahiti or Monterey
1179 soundcards.
1180
1181 io - Port # for msnd-classic card
1182 irq - IRQ # for msnd-classic card
1183 mem - Memory address (0xb0000, 0xc8000, 0xd0000, 0xd8000,
1184 0xe0000 or 0xe8000)
1185 write_ndelay - enable write ndelay (default = 1)
1186 calibrate_signal - calibrate signal (default = 0)
1187 isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
1188 digital - Digital daughterboard present (default = 0)
1189 cfg - Config port (0x250, 0x260 or 0x270) default = PnP
1190 reset - Reset all devices
1191 mpu_io - MPU401 I/O port
1192 mpu_irq - MPU401 irq#
1193 ide_io0 - IDE port #0
1194 ide_io1 - IDE port #1
1195 ide_irq - IDE irq#
1196 joystick_io - Joystick I/O port
1197
1198 The driver requires firmware files "turtlebeach/msndinit.bin" and
1199 "turtlebeach/msndperm.bin" in the proper firmware directory.
1200
1201 See Documentation/sound/oss/MultiSound for important information
1202 about this driver. Note that it has been discontinued, but the
1203 Voyetra Turtle Beach knowledge base entry for it is still available
1204 at
1205 http://www.turtlebeach.com/site/kb_ftp/790.asp
1206
1207 Module snd-msnd-pinnacle
1208 ------------------------
1209
1210 Module for Turtle Beach MultiSound Pinnacle/Fiji soundcards.
1211
1212 io - Port # for pinnacle/fiji card
1213 irq - IRQ # for pinnalce/fiji card
1214 mem - Memory address (0xb0000, 0xc8000, 0xd0000, 0xd8000,
1215 0xe0000 or 0xe8000)
1216 write_ndelay - enable write ndelay (default = 1)
1217 calibrate_signal - calibrate signal (default = 0)
1218 isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
1219
1220 The driver requires firmware files "turtlebeach/pndspini.bin" and
1221 "turtlebeach/pndsperm.bin" in the proper firmware directory.
1222
1188 Module snd-mtpav 1223 Module snd-mtpav
1189 ---------------- 1224 ----------------
1190 1225
@@ -1824,7 +1859,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
1824 ------------------- 1859 -------------------
1825 1860
1826 Module for sound cards based on the Asus AV100/AV200 chips, 1861 Module for sound cards based on the Asus AV100/AV200 chips,
1827 i.e., Xonar D1, DX, D2, D2X and HDAV1.3 (Deluxe). 1862 i.e., Xonar D1, DX, D2, D2X, HDAV1.3 (Deluxe), and Essence STX.
1828 1863
1829 This module supports autoprobe and multiple cards. 1864 This module supports autoprobe and multiple cards.
1830 1865
diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt
index 0f5d26bea80f..8eec05bc079e 100644
--- a/Documentation/sound/alsa/HD-Audio-Models.txt
+++ b/Documentation/sound/alsa/HD-Audio-Models.txt
@@ -56,6 +56,7 @@ ALC262
56 sony-assamd Sony ASSAMD 56 sony-assamd Sony ASSAMD
57 toshiba-s06 Toshiba S06 57 toshiba-s06 Toshiba S06
58 toshiba-rx1 Toshiba RX1 58 toshiba-rx1 Toshiba RX1
59 tyan Tyan Thunder n6650W (S2915-E)
59 ultra Samsung Q1 Ultra Vista model 60 ultra Samsung Q1 Ultra Vista model
60 lenovo-3000 Lenovo 3000 y410 61 lenovo-3000 Lenovo 3000 y410
61 nec NEC Versa S9100 62 nec NEC Versa S9100
@@ -261,6 +262,8 @@ Conexant 5051
261============= 262=============
262 laptop Basic Laptop config (default) 263 laptop Basic Laptop config (default)
263 hp HP Spartan laptop 264 hp HP Spartan laptop
265 hp-dv6736 HP dv6736
266 lenovo-x200 Lenovo X200 laptop
264 267
265STAC9200 268STAC9200
266======== 269========
@@ -278,6 +281,7 @@ STAC9200
278 gateway-m4 Gateway laptops with EAPD control 281 gateway-m4 Gateway laptops with EAPD control
279 gateway-m4-2 Gateway laptops with EAPD control 282 gateway-m4-2 Gateway laptops with EAPD control
280 panasonic Panasonic CF-74 283 panasonic Panasonic CF-74
284 auto BIOS setup (default)
281 285
282STAC9205/9254 286STAC9205/9254
283============= 287=============
@@ -285,6 +289,8 @@ STAC9205/9254
285 dell-m42 Dell (unknown) 289 dell-m42 Dell (unknown)
286 dell-m43 Dell Precision 290 dell-m43 Dell Precision
287 dell-m44 Dell Inspiron 291 dell-m44 Dell Inspiron
292 eapd Keep EAPD on (e.g. Gateway T1616)
293 auto BIOS setup (default)
288 294
289STAC9220/9221 295STAC9220/9221
290============= 296=============
@@ -308,6 +314,7 @@ STAC9220/9221
308 dell-d82 Dell (unknown) 314 dell-d82 Dell (unknown)
309 dell-m81 Dell (unknown) 315 dell-m81 Dell (unknown)
310 dell-m82 Dell XPS M1210 316 dell-m82 Dell XPS M1210
317 auto BIOS setup (default)
311 318
312STAC9202/9250/9251 319STAC9202/9250/9251
313================== 320==================
@@ -319,6 +326,7 @@ STAC9202/9250/9251
319 m3 Some Gateway MX series laptops 326 m3 Some Gateway MX series laptops
320 m5 Some Gateway MX series laptops (MP6954) 327 m5 Some Gateway MX series laptops (MP6954)
321 m6 Some Gateway NX series laptops 328 m6 Some Gateway NX series laptops
329 auto BIOS setup (default)
322 330
323STAC9227/9228/9229/927x 331STAC9227/9228/9229/927x
324======================= 332=======================
@@ -328,6 +336,7 @@ STAC9227/9228/9229/927x
328 5stack D965 5stack + SPDIF 336 5stack D965 5stack + SPDIF
329 dell-3stack Dell Dimension E520 337 dell-3stack Dell Dimension E520
330 dell-bios Fixes with Dell BIOS setup 338 dell-bios Fixes with Dell BIOS setup
339 auto BIOS setup (default)
331 340
332STAC92HD71B* 341STAC92HD71B*
333============ 342============
@@ -335,7 +344,10 @@ STAC92HD71B*
335 dell-m4-1 Dell desktops 344 dell-m4-1 Dell desktops
336 dell-m4-2 Dell desktops 345 dell-m4-2 Dell desktops
337 dell-m4-3 Dell desktops 346 dell-m4-3 Dell desktops
338 hp-m4 HP dv laptops 347 hp-m4 HP mini 1000
348 hp-dv5 HP dv series
349 hp-hdx HP HDX series
350 auto BIOS setup (default)
339 351
340STAC92HD73* 352STAC92HD73*
341=========== 353===========
@@ -345,13 +357,16 @@ STAC92HD73*
345 dell-m6-dmic Dell desktops/laptops with digital mics 357 dell-m6-dmic Dell desktops/laptops with digital mics
346 dell-m6 Dell desktops/laptops with both type of mics 358 dell-m6 Dell desktops/laptops with both type of mics
347 dell-eq Dell desktops/laptops 359 dell-eq Dell desktops/laptops
360 auto BIOS setup (default)
348 361
349STAC92HD83* 362STAC92HD83*
350=========== 363===========
351 ref Reference board 364 ref Reference board
352 mic-ref Reference board with power managment for ports 365 mic-ref Reference board with power managment for ports
366 dell-s14 Dell laptop
367 auto BIOS setup (default)
353 368
354STAC9872 369STAC9872
355======== 370========
356 vaio Setup for VAIO FE550G/SZ110 371 vaio VAIO laptop without SPDIF
357 vaio-ar Setup for VAIO AR 372 auto BIOS setup (default)
diff --git a/Documentation/sound/alsa/HD-Audio.txt b/Documentation/sound/alsa/HD-Audio.txt
index 8d68fff71839..88b7433d2f11 100644
--- a/Documentation/sound/alsa/HD-Audio.txt
+++ b/Documentation/sound/alsa/HD-Audio.txt
@@ -109,6 +109,13 @@ slot, pass `probe_mask=1`. For the first and the third slots, pass
109Since 2.6.29 kernel, the driver has a more robust probing method, so 109Since 2.6.29 kernel, the driver has a more robust probing method, so
110this error might happen rarely, though. 110this error might happen rarely, though.
111 111
112On a machine with a broken BIOS, sometimes you need to force the
113driver to probe the codec slots the hardware doesn't report for use.
114In such a case, turn the bit 8 (0x100) of `probe_mask` option on.
115Then the rest 8 bits are passed as the codec slots to probe
116unconditionally. For example, `probe_mask=0x103` will force to probe
117the codec slots 0 and 1 no matter what the hardware reports.
118
112 119
113Interrupt Handling 120Interrupt Handling
114~~~~~~~~~~~~~~~~~~ 121~~~~~~~~~~~~~~~~~~
@@ -162,7 +169,7 @@ PCI SSID look-up.
162What `model` option values are available depends on the codec chip. 169What `model` option values are available depends on the codec chip.
163Check your codec chip from the codec proc file (see "Codec Proc-File" 170Check your codec chip from the codec proc file (see "Codec Proc-File"
164section below). It will show the vendor/product name of your codec 171section below). It will show the vendor/product name of your codec
165chip. Then, see Documentation/sound/alsa/HD-Audio-Modelstxt file, 172chip. Then, see Documentation/sound/alsa/HD-Audio-Models.txt file,
166the section of HD-audio driver. You can find a list of codecs 173the section of HD-audio driver. You can find a list of codecs
167and `model` options belonging to each codec. For example, for Realtek 174and `model` options belonging to each codec. For example, for Realtek
168ALC262 codec chip, pass `model=ultra` for devices that are compatible 175ALC262 codec chip, pass `model=ultra` for devices that are compatible
@@ -170,7 +177,7 @@ with Samsung Q1 Ultra.
170 177
171Thus, the first thing you can do for any brand-new, unsupported and 178Thus, the first thing you can do for any brand-new, unsupported and
172non-working HD-audio hardware is to check HD-audio codec and several 179non-working HD-audio hardware is to check HD-audio codec and several
173different `model` option values. If you have a luck, some of them 180different `model` option values. If you have any luck, some of them
174might suit with your device well. 181might suit with your device well.
175 182
176Some codecs such as ALC880 have a special model option `model=test`. 183Some codecs such as ALC880 have a special model option `model=test`.
@@ -358,10 +365,26 @@ modelname::
358 to this file. 365 to this file.
359init_verbs:: 366init_verbs::
360 The extra verbs to execute at initialization. You can add a verb by 367 The extra verbs to execute at initialization. You can add a verb by
361 writing to this file. Pass tree numbers, nid, verb and parameter. 368 writing to this file. Pass three numbers: nid, verb and parameter
369 (separated with a space).
362hints:: 370hints::
363 Shows hint strings for codec parsers for any use. Right now it's 371 Shows / stores hint strings for codec parsers for any use.
364 not used. 372 Its format is `key = value`. For example, passing `hp_detect = yes`
373 to IDT/STAC codec parser will result in the disablement of the
374 headphone detection.
375init_pin_configs::
376 Shows the initial pin default config values set by BIOS.
377driver_pin_configs::
378 Shows the pin default values set by the codec parser explicitly.
379 This doesn't show all pin values but only the changed values by
380 the parser. That is, if the parser doesn't change the pin default
381 config values by itself, this will contain nothing.
382user_pin_configs::
383 Shows the pin default config values to override the BIOS setup.
384 Writing this (with two numbers, NID and value) appends the new
385 value. The given will be used instead of the initial BIOS value at
386 the next reconfiguration time. Note that this config will override
387 even the driver pin configs, too.
365reconfig:: 388reconfig::
366 Triggers the codec re-configuration. When any value is written to 389 Triggers the codec re-configuration. When any value is written to
367 this file, the driver re-initialize and parses the codec tree 390 this file, the driver re-initialize and parses the codec tree
@@ -371,6 +394,14 @@ clear::
371 Resets the codec, removes the mixer elements and PCM stuff of the 394 Resets the codec, removes the mixer elements and PCM stuff of the
372 specified codec, and clear all init verbs and hints. 395 specified codec, and clear all init verbs and hints.
373 396
397For example, when you want to change the pin default configuration
398value of the pin widget 0x14 to 0x9993013f, and let the driver
399re-configure based on that state, run like below:
400------------------------------------------------------------------------
401 # echo 0x14 0x9993013f > /sys/class/sound/hwC0D0/user_pin_configs
402 # echo 1 > /sys/class/sound/hwC0D0/reconfig
403------------------------------------------------------------------------
404
374 405
375Power-Saving 406Power-Saving
376~~~~~~~~~~~~ 407~~~~~~~~~~~~
@@ -461,6 +492,16 @@ run with `--no-upload` option, and attach the generated file.
461There are some other useful options. See `--help` option output for 492There are some other useful options. See `--help` option output for
462details. 493details.
463 494
495When a probe error occurs or when the driver obviously assigns a
496mismatched model, it'd be helpful to load the driver with
497`probe_only=1` option (at best after the cold reboot) and run
498alsa-info at this state. With this option, the driver won't configure
499the mixer and PCM but just tries to probe the codec slot. After
500probing, the proc file is available, so you can get the raw codec
501information before modified by the driver. Of course, the driver
502isn't usable with `probe_only=1`. But you can continue the
503configuration via hwdep sysfs file if hda-reconfig option is enabled.
504
464 505
465hda-verb 506hda-verb
466~~~~~~~~ 507~~~~~~~~
diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt
index 46f9684d0b29..9e6763264a2e 100644
--- a/Documentation/sound/alsa/soc/dapm.txt
+++ b/Documentation/sound/alsa/soc/dapm.txt
@@ -116,6 +116,9 @@ SOC_DAPM_SINGLE("HiFi Playback Switch", WM8731_APANA, 4, 1, 0),
116SND_SOC_DAPM_MIXER("Output Mixer", WM8731_PWR, 4, 1, wm8731_output_mixer_controls, 116SND_SOC_DAPM_MIXER("Output Mixer", WM8731_PWR, 4, 1, wm8731_output_mixer_controls,
117 ARRAY_SIZE(wm8731_output_mixer_controls)), 117 ARRAY_SIZE(wm8731_output_mixer_controls)),
118 118
119If you dont want the mixer elements prefixed with the name of the mixer widget,
120you can use SND_SOC_DAPM_MIXER_NAMED_CTL instead. the parameters are the same
121as for SND_SOC_DAPM_MIXER.
119 122
1202.3 Platform/Machine domain Widgets 1232.3 Platform/Machine domain Widgets
121----------------------------------- 124-----------------------------------
diff --git a/Documentation/sound/alsa/soc/jack.txt b/Documentation/sound/alsa/soc/jack.txt
new file mode 100644
index 000000000000..fcf82a417293
--- /dev/null
+++ b/Documentation/sound/alsa/soc/jack.txt
@@ -0,0 +1,71 @@
1ASoC jack detection
2===================
3
4ALSA has a standard API for representing physical jacks to user space,
5the kernel side of which can be seen in include/sound/jack.h. ASoC
6provides a version of this API adding two additional features:
7
8 - It allows more than one jack detection method to work together on one
9 user visible jack. In embedded systems it is common for multiple
10 to be present on a single jack but handled by separate bits of
11 hardware.
12
13 - Integration with DAPM, allowing DAPM endpoints to be updated
14 automatically based on the detected jack status (eg, turning off the
15 headphone outputs if no headphones are present).
16
17This is done by splitting the jacks up into three things working
18together: the jack itself represented by a struct snd_soc_jack, sets of
19snd_soc_jack_pins representing DAPM endpoints to update and blocks of
20code providing jack reporting mechanisms.
21
22For example, a system may have a stereo headset jack with two reporting
23mechanisms, one for the headphone and one for the microphone. Some
24systems won't be able to use their speaker output while a headphone is
25connected and so will want to make sure to update both speaker and
26headphone when the headphone jack status changes.
27
28The jack - struct snd_soc_jack
29==============================
30
31This represents a physical jack on the system and is what is visible to
32user space. The jack itself is completely passive, it is set up by the
33machine driver and updated by jack detection methods.
34
35Jacks are created by the machine driver calling snd_soc_jack_new().
36
37snd_soc_jack_pin
38================
39
40These represent a DAPM pin to update depending on some of the status
41bits supported by the jack. Each snd_soc_jack has zero or more of these
42which are updated automatically. They are created by the machine driver
43and associated with the jack using snd_soc_jack_add_pins(). The status
44of the endpoint may configured to be the opposite of the jack status if
45required (eg, enabling a built in microphone if a microphone is not
46connected via a jack).
47
48Jack detection methods
49======================
50
51Actual jack detection is done by code which is able to monitor some
52input to the system and update a jack by calling snd_soc_jack_report(),
53specifying a subset of bits to update. The jack detection code should
54be set up by the machine driver, taking configuration for the jack to
55update and the set of things to report when the jack is connected.
56
57Often this is done based on the status of a GPIO - a handler for this is
58provided by the snd_soc_jack_add_gpio() function. Other methods are
59also available, for example integrated into CODECs. One example of
60CODEC integrated jack detection can be see in the WM8350 driver.
61
62Each jack may have multiple reporting mechanisms, though it will need at
63least one to be useful.
64
65Machine drivers
66===============
67
68These are all hooked together by the machine driver depending on the
69system hardware. The machine driver will set up the snd_soc_jack and
70the list of pins to update then set up one or more jack detection
71mechanisms to update that jack based on their current status.
diff --git a/Documentation/sound/oss/CS4232 b/Documentation/sound/oss/CS4232
deleted file mode 100644
index 7d6af7a5c1c2..000000000000
--- a/Documentation/sound/oss/CS4232
+++ /dev/null
@@ -1,23 +0,0 @@
1To configure the Crystal CS423x sound chip and activate its DSP functions,
2modules may be loaded in this order:
3
4 modprobe sound
5 insmod ad1848
6 insmod uart401
7 insmod cs4232 io=* irq=* dma=* dma2=*
8
9This is the meaning of the parameters:
10
11 io--I/O address of the Windows Sound System (normally 0x534)
12 irq--IRQ of this device
13 dma and dma2--DMA channels (DMA2 may be 0)
14
15On some cards, the board attempts to do non-PnP setup, and fails. If you
16have problems, use Linux' PnP facilities.
17
18To get MIDI facilities add
19
20 insmod opl3 io=*
21
22where "io" is the I/O address of the OPL3 synthesizer. This will be shown
23in /proc/sys/pnp and is normally 0x388.
diff --git a/Documentation/sound/oss/Introduction b/Documentation/sound/oss/Introduction
index f04ba6bb7395..75d967ff9266 100644
--- a/Documentation/sound/oss/Introduction
+++ b/Documentation/sound/oss/Introduction
@@ -80,7 +80,7 @@ Notes:
80 additional features. 80 additional features.
81 81
822. The commercial OSS driver may be obtained from the site: 822. The commercial OSS driver may be obtained from the site:
83 http://www/opensound.com. This may be used for cards that 83 http://www.opensound.com. This may be used for cards that
84 are unsupported by the kernel driver, or may be used 84 are unsupported by the kernel driver, or may be used
85 by other operating systems. 85 by other operating systems.
86 86
diff --git a/Documentation/sparse.txt b/Documentation/sparse.txt
index 42f43fa59f24..34c76a55bc04 100644
--- a/Documentation/sparse.txt
+++ b/Documentation/sparse.txt
@@ -42,6 +42,14 @@ sure that bitwise types don't get mixed up (little-endian vs big-endian
42vs cpu-endian vs whatever), and there the constant "0" really _is_ 42vs cpu-endian vs whatever), and there the constant "0" really _is_
43special. 43special.
44 44
45__bitwise__ - to be used for relatively compact stuff (gfp_t, etc.) that
46is mostly warning-free and is supposed to stay that way. Warnings will
47be generated without __CHECK_ENDIAN__.
48
49__bitwise - noisy stuff; in particular, __le*/__be* are that. We really
50don't want to drown in noise unless we'd explicitly asked for it.
51
52
45Getting sparse 53Getting sparse
46~~~~~~~~~~~~~~ 54~~~~~~~~~~~~~~
47 55
diff --git a/Documentation/spi/spi-summary b/Documentation/spi/spi-summary
index 0f5122eb282b..4a02d2508bc8 100644
--- a/Documentation/spi/spi-summary
+++ b/Documentation/spi/spi-summary
@@ -511,10 +511,16 @@ SPI MASTER METHODS
511 This sets up the device clock rate, SPI mode, and word sizes. 511 This sets up the device clock rate, SPI mode, and word sizes.
512 Drivers may change the defaults provided by board_info, and then 512 Drivers may change the defaults provided by board_info, and then
513 call spi_setup(spi) to invoke this routine. It may sleep. 513 call spi_setup(spi) to invoke this routine. It may sleep.
514
514 Unless each SPI slave has its own configuration registers, don't 515 Unless each SPI slave has its own configuration registers, don't
515 change them right away ... otherwise drivers could corrupt I/O 516 change them right away ... otherwise drivers could corrupt I/O
516 that's in progress for other SPI devices. 517 that's in progress for other SPI devices.
517 518
519 ** BUG ALERT: for some reason the first version of
520 ** many spi_master drivers seems to get this wrong.
521 ** When you code setup(), ASSUME that the controller
522 ** is actively processing transfers for another device.
523
518 master->transfer(struct spi_device *spi, struct spi_message *message) 524 master->transfer(struct spi_device *spi, struct spi_message *message)
519 This must not sleep. Its responsibility is arrange that the 525 This must not sleep. Its responsibility is arrange that the
520 transfer happens and its complete() callback is issued. The two 526 transfer happens and its complete() callback is issued. The two
diff --git a/Documentation/sysctl/00-INDEX b/Documentation/sysctl/00-INDEX
index a20a9066dc4c..1286f455992f 100644
--- a/Documentation/sysctl/00-INDEX
+++ b/Documentation/sysctl/00-INDEX
@@ -10,6 +10,8 @@ fs.txt
10 - documentation for /proc/sys/fs/*. 10 - documentation for /proc/sys/fs/*.
11kernel.txt 11kernel.txt
12 - documentation for /proc/sys/kernel/*. 12 - documentation for /proc/sys/kernel/*.
13net.txt
14 - documentation for /proc/sys/net/*.
13sunrpc.txt 15sunrpc.txt
14 - documentation for /proc/sys/sunrpc/*. 16 - documentation for /proc/sys/sunrpc/*.
15vm.txt 17vm.txt
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index f99254327ae5..1458448436cc 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -1,5 +1,6 @@
1Documentation for /proc/sys/fs/* kernel version 2.2.10 1Documentation for /proc/sys/fs/* kernel version 2.2.10
2 (c) 1998, 1999, Rik van Riel <riel@nl.linux.org> 2 (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
3 (c) 2009, Shen Feng<shen@cn.fujitsu.com>
3 4
4For general info and legal blurb, please look in README. 5For general info and legal blurb, please look in README.
5 6
@@ -14,7 +15,12 @@ kernel. Since some of the files _can_ be used to screw up your
14system, it is advisable to read both documentation and source 15system, it is advisable to read both documentation and source
15before actually making adjustments. 16before actually making adjustments.
16 17
181. /proc/sys/fs
19----------------------------------------------------------
20
17Currently, these files are in /proc/sys/fs: 21Currently, these files are in /proc/sys/fs:
22- aio-max-nr
23- aio-nr
18- dentry-state 24- dentry-state
19- dquot-max 25- dquot-max
20- dquot-nr 26- dquot-nr
@@ -30,8 +36,15 @@ Currently, these files are in /proc/sys/fs:
30- super-max 36- super-max
31- super-nr 37- super-nr
32 38
33Documentation for the files in /proc/sys/fs/binfmt_misc is 39==============================================================
34in Documentation/binfmt_misc.txt. 40
41aio-nr & aio-max-nr:
42
43aio-nr is the running total of the number of events specified on the
44io_setup system call for all currently active aio contexts. If aio-nr
45reaches aio-max-nr then io_setup will fail with EAGAIN. Note that
46raising aio-max-nr does not result in the pre-allocation or re-sizing
47of any kernel data structures.
35 48
36============================================================== 49==============================================================
37 50
@@ -178,3 +191,60 @@ requests. aio-max-nr allows you to change the maximum value
178aio-nr can grow to. 191aio-nr can grow to.
179 192
180============================================================== 193==============================================================
194
195
1962. /proc/sys/fs/binfmt_misc
197----------------------------------------------------------
198
199Documentation for the files in /proc/sys/fs/binfmt_misc is
200in Documentation/binfmt_misc.txt.
201
202
2033. /proc/sys/fs/mqueue - POSIX message queues filesystem
204----------------------------------------------------------
205
206The "mqueue" filesystem provides the necessary kernel features to enable the
207creation of a user space library that implements the POSIX message queues
208API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System
209Interfaces specification.)
210
211The "mqueue" filesystem contains values for determining/setting the amount of
212resources used by the file system.
213
214/proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the
215maximum number of message queues allowed on the system.
216
217/proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the
218maximum number of messages in a queue value. In fact it is the limiting value
219for another (user) limit which is set in mq_open invocation. This attribute of
220a queue must be less or equal then msg_max.
221
222/proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the
223maximum message size value (it is every message queue's attribute set during
224its creation).
225
226
2274. /proc/sys/fs/epoll - Configuration options for the epoll interface
228--------------------------------------------------------
229
230This directory contains configuration options for the epoll(7) interface.
231
232max_user_instances
233------------------
234
235This is the maximum number of epoll file descriptors that a single user can
236have open at a given time. The default value is 128, and should be enough
237for normal users.
238
239max_user_watches
240----------------
241
242Every epoll file descriptor can store a number of files to be monitored
243for event readiness. Each one of these monitored files constitutes a "watch".
244This configuration option sets the maximum number of "watches" that are
245allowed for each user.
246Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
247on a 64bit one.
248The current default value for max_user_watches is the 1/32 of the available
249low memory, divided for the "watch" cost in bytes.
250
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index a4ccdd1981cf..f11ca7979fa6 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -1,5 +1,6 @@
1Documentation for /proc/sys/kernel/* kernel version 2.2.10 1Documentation for /proc/sys/kernel/* kernel version 2.2.10
2 (c) 1998, 1999, Rik van Riel <riel@nl.linux.org> 2 (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
3 (c) 2009, Shen Feng<shen@cn.fujitsu.com>
3 4
4For general info and legal blurb, please look in README. 5For general info and legal blurb, please look in README.
5 6
@@ -18,6 +19,7 @@ Currently, these files might (depending on your configuration)
18show up in /proc/sys/kernel: 19show up in /proc/sys/kernel:
19- acpi_video_flags 20- acpi_video_flags
20- acct 21- acct
22- auto_msgmni
21- core_pattern 23- core_pattern
22- core_uses_pid 24- core_uses_pid
23- ctrl-alt-del 25- ctrl-alt-del
@@ -33,6 +35,7 @@ show up in /proc/sys/kernel:
33- msgmax 35- msgmax
34- msgmnb 36- msgmnb
35- msgmni 37- msgmni
38- nmi_watchdog
36- osrelease 39- osrelease
37- ostype 40- ostype
38- overflowgid 41- overflowgid
@@ -40,6 +43,7 @@ show up in /proc/sys/kernel:
40- panic 43- panic
41- pid_max 44- pid_max
42- powersave-nap [ PPC only ] 45- powersave-nap [ PPC only ]
46- panic_on_unrecovered_nmi
43- printk 47- printk
44- randomize_va_space 48- randomize_va_space
45- real-root-dev ==> Documentation/initrd.txt 49- real-root-dev ==> Documentation/initrd.txt
@@ -55,6 +59,7 @@ show up in /proc/sys/kernel:
55- sysrq ==> Documentation/sysrq.txt 59- sysrq ==> Documentation/sysrq.txt
56- tainted 60- tainted
57- threads-max 61- threads-max
62- unknown_nmi_panic
58- version 63- version
59 64
60============================================================== 65==============================================================
@@ -381,3 +386,51 @@ can be ORed together:
381 512 - A kernel warning has occurred. 386 512 - A kernel warning has occurred.
3821024 - A module from drivers/staging was loaded. 3871024 - A module from drivers/staging was loaded.
383 388
389==============================================================
390
391auto_msgmni:
392
393Enables/Disables automatic recomputing of msgmni upon memory add/remove or
394upon ipc namespace creation/removal (see the msgmni description above).
395Echoing "1" into this file enables msgmni automatic recomputing.
396Echoing "0" turns it off.
397auto_msgmni default value is 1.
398
399==============================================================
400
401nmi_watchdog:
402
403Enables/Disables the NMI watchdog on x86 systems. When the value is non-zero
404the NMI watchdog is enabled and will continuously test all online cpus to
405determine whether or not they are still functioning properly. Currently,
406passing "nmi_watchdog=" parameter at boot time is required for this function
407to work.
408
409If LAPIC NMI watchdog method is in use (nmi_watchdog=2 kernel parameter), the
410NMI watchdog shares registers with oprofile. By disabling the NMI watchdog,
411oprofile may have more registers to utilize.
412
413==============================================================
414
415unknown_nmi_panic:
416
417The value in this file affects behavior of handling NMI. When the value is
418non-zero, unknown NMI is trapped and then panic occurs. At that time, kernel
419debugging information is displayed on console.
420
421NMI switch that most IA32 servers have fires unknown NMI up, for example.
422If a system hangs up, try pressing the NMI switch.
423
424==============================================================
425
426panic_on_unrecovered_nmi:
427
428The default Linux behaviour on an NMI of either memory or unknown is to continue
429operation. For many environments such as scientific computing it is preferable
430that the box is taken out and the error dealt with than an uncorrected
431parity/ECC error get propogated.
432
433A small number of systems do generate NMI's for bizarre random reasons such as
434power management so the default is off. That sysctl works like the existing
435panic controls already in that directory.
436
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
new file mode 100644
index 000000000000..df38ef046f8d
--- /dev/null
+++ b/Documentation/sysctl/net.txt
@@ -0,0 +1,175 @@
1Documentation for /proc/sys/net/* kernel version 2.4.0-test11-pre4
2 (c) 1999 Terrehon Bowden <terrehon@pacbell.net>
3 Bodo Bauer <bb@ricochet.net>
4 (c) 2000 Jorge Nerin <comandante@zaralinux.com>
5 (c) 2009 Shen Feng <shen@cn.fujitsu.com>
6
7For general info and legal blurb, please look in README.
8
9==============================================================
10
11This file contains the documentation for the sysctl files in
12/proc/sys/net and is valid for Linux kernel version 2.4.0-test11-pre4.
13
14The interface to the networking parts of the kernel is located in
15/proc/sys/net. The following table shows all possible subdirectories.You may
16see only some of them, depending on your kernel's configuration.
17
18
19Table : Subdirectories in /proc/sys/net
20..............................................................................
21 Directory Content Directory Content
22 core General parameter appletalk Appletalk protocol
23 unix Unix domain sockets netrom NET/ROM
24 802 E802 protocol ax25 AX25
25 ethernet Ethernet protocol rose X.25 PLP layer
26 ipv4 IP version 4 x25 X.25 protocol
27 ipx IPX token-ring IBM token ring
28 bridge Bridging decnet DEC net
29 ipv6 IP version 6
30..............................................................................
31
321. /proc/sys/net/core - Network core options
33-------------------------------------------------------
34
35rmem_default
36------------
37
38The default setting of the socket receive buffer in bytes.
39
40rmem_max
41--------
42
43The maximum receive socket buffer size in bytes.
44
45wmem_default
46------------
47
48The default setting (in bytes) of the socket send buffer.
49
50wmem_max
51--------
52
53The maximum send socket buffer size in bytes.
54
55message_burst and message_cost
56------------------------------
57
58These parameters are used to limit the warning messages written to the kernel
59log from the networking code. They enforce a rate limit to make a
60denial-of-service attack impossible. A higher message_cost factor, results in
61fewer messages that will be written. Message_burst controls when messages will
62be dropped. The default settings limit warning messages to one every five
63seconds.
64
65warnings
66--------
67
68This controls console messages from the networking stack that can occur because
69of problems on the network like duplicate address or bad checksums. Normally,
70this should be enabled, but if the problem persists the messages can be
71disabled.
72
73netdev_budget
74-------------
75
76Maximum number of packets taken from all interfaces in one polling cycle (NAPI
77poll). In one polling cycle interfaces which are registered to polling are
78probed in a round-robin manner. The limit of packets in one such probe can be
79set per-device via sysfs class/net/<device>/weight .
80
81netdev_max_backlog
82------------------
83
84Maximum number of packets, queued on the INPUT side, when the interface
85receives packets faster than kernel can process them.
86
87optmem_max
88----------
89
90Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence
91of struct cmsghdr structures with appended data.
92
932. /proc/sys/net/unix - Parameters for Unix domain sockets
94-------------------------------------------------------
95
96There is only one file in this directory.
97unix_dgram_qlen limits the max number of datagrams queued in Unix domain
98socket's buffer. It will not take effect unless PF_UNIX flag is specified.
99
100
1013. /proc/sys/net/ipv4 - IPV4 settings
102-------------------------------------------------------
103Please see: Documentation/networking/ip-sysctl.txt and ipvs-sysctl.txt for
104descriptions of these entries.
105
106
1074. Appletalk
108-------------------------------------------------------
109
110The /proc/sys/net/appletalk directory holds the Appletalk configuration data
111when Appletalk is loaded. The configurable parameters are:
112
113aarp-expiry-time
114----------------
115
116The amount of time we keep an ARP entry before expiring it. Used to age out
117old hosts.
118
119aarp-resolve-time
120-----------------
121
122The amount of time we will spend trying to resolve an Appletalk address.
123
124aarp-retransmit-limit
125---------------------
126
127The number of times we will retransmit a query before giving up.
128
129aarp-tick-time
130--------------
131
132Controls the rate at which expires are checked.
133
134The directory /proc/net/appletalk holds the list of active Appletalk sockets
135on a machine.
136
137The fields indicate the DDP type, the local address (in network:node format)
138the remote address, the size of the transmit pending queue, the size of the
139received queue (bytes waiting for applications to read) the state and the uid
140owning the socket.
141
142/proc/net/atalk_iface lists all the interfaces configured for appletalk.It
143shows the name of the interface, its Appletalk address, the network range on
144that address (or network number for phase 1 networks), and the status of the
145interface.
146
147/proc/net/atalk_route lists each known network route. It lists the target
148(network) that the route leads to, the router (may be directly connected), the
149route flags, and the device the route is using.
150
151
1525. IPX
153-------------------------------------------------------
154
155The IPX protocol has no tunable values in proc/sys/net.
156
157The IPX protocol does, however, provide proc/net/ipx. This lists each IPX
158socket giving the local and remote addresses in Novell format (that is
159network:node:port). In accordance with the strange Novell tradition,
160everything but the port is in hex. Not_Connected is displayed for sockets that
161are not tied to a specific remote address. The Tx and Rx queue sizes indicate
162the number of bytes pending for transmission and reception. The state
163indicates the state the socket is in and the uid is the owning uid of the
164socket.
165
166The /proc/net/ipx_interface file lists all IPX interfaces. For each interface
167it gives the network number, the node number, and indicates if the network is
168the primary network. It also indicates which device it is bound to (or
169Internal for internal networks) and the Frame Type if appropriate. Linux
170supports 802.3, 802.2, 802.2 SNAP and DIX (Blue Book) ethernet framing for
171IPX.
172
173The /proc/net/ipx_route table holds a list of IPX routes. For each route it
174gives the destination network, the router node (or Directly) and the network
175address of the router (or Connected) for internal networks.
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 3197fc83bc51..b716d33912d8 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -39,6 +39,8 @@ Currently, these files are in /proc/sys/vm:
39- nr_hugepages 39- nr_hugepages
40- nr_overcommit_hugepages 40- nr_overcommit_hugepages
41- nr_pdflush_threads 41- nr_pdflush_threads
42- nr_pdflush_threads_min
43- nr_pdflush_threads_max
42- nr_trim_pages (only if CONFIG_MMU=n) 44- nr_trim_pages (only if CONFIG_MMU=n)
43- numa_zonelist_order 45- numa_zonelist_order
44- oom_dump_tasks 46- oom_dump_tasks
@@ -88,6 +90,10 @@ will itself start writeback.
88If dirty_bytes is written, dirty_ratio becomes a function of its value 90If dirty_bytes is written, dirty_ratio becomes a function of its value
89(dirty_bytes / the amount of dirtyable system memory). 91(dirty_bytes / the amount of dirtyable system memory).
90 92
93Note: the minimum value allowed for dirty_bytes is two pages (in bytes); any
94value lower than this limit will be ignored and the old configuration will be
95retained.
96
91============================================================== 97==============================================================
92 98
93dirty_expire_centisecs 99dirty_expire_centisecs
@@ -463,6 +469,32 @@ The default value is 0.
463 469
464============================================================== 470==============================================================
465 471
472nr_pdflush_threads_min
473
474This value controls the minimum number of pdflush threads.
475
476At boot time, the kernel will create and maintain 'nr_pdflush_threads_min'
477threads for the kernel's lifetime.
478
479The default value is 2. The minimum value you can specify is 1, and
480the maximum value is the current setting of 'nr_pdflush_threads_max'.
481
482See 'nr_pdflush_threads_max' below for more information.
483
484==============================================================
485
486nr_pdflush_threads_max
487
488This value controls the maximum number of pdflush threads that can be
489created. The pdflush algorithm will create a new pdflush thread (up to
490this maximum) if no pdflush threads have been available for >= 1 second.
491
492The default value is 8. The minimum value you can specify is the
493current value of 'nr_pdflush_threads_min' and the
494maximum is 1000.
495
496==============================================================
497
466overcommit_memory: 498overcommit_memory:
467 499
468This value contains a flag that enables memory overcommitment. 500This value contains a flag that enables memory overcommitment.
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index 9e592c718afb..cf42b820ff9d 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -81,6 +81,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
81 81
82'i' - Send a SIGKILL to all processes, except for init. 82'i' - Send a SIGKILL to all processes, except for init.
83 83
84'j' - Forcibly "Just thaw it" - filesystems frozen by the FIFREEZE ioctl.
85
84'k' - Secure Access Key (SAK) Kills all programs on the current virtual 86'k' - Secure Access Key (SAK) Kills all programs on the current virtual
85 console. NOTE: See important comments below in SAK section. 87 console. NOTE: See important comments below in SAK section.
86 88
@@ -113,6 +115,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
113 115
114'x' - Used by xmon interface on ppc/powerpc platforms. 116'x' - Used by xmon interface on ppc/powerpc platforms.
115 117
118'z' - Dump the ftrace buffer
119
116'0'-'9' - Sets the console log level, controlling which kernel messages 120'0'-'9' - Sets the console log level, controlling which kernel messages
117 will be printed to your console. ('0', for example would make 121 will be printed to your console. ('0', for example would make
118 it so that only emergency messages like PANICs or OOPSes would 122 it so that only emergency messages like PANICs or OOPSes would
@@ -160,6 +164,9 @@ t'E'rm and k'I'll are useful if you have some sort of runaway process you
160are unable to kill any other way, especially if it's spawning other 164are unable to kill any other way, especially if it's spawning other
161processes. 165processes.
162 166
167"'J'ust thaw it" is useful if your system becomes unresponsive due to a frozen
168(probably root) filesystem via the FIFREEZE ioctl.
169
163* Sometimes SysRq seems to get 'stuck' after using it, what can I do? 170* Sometimes SysRq seems to get 'stuck' after using it, what can I do?
164~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 171~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
165That happens to me, also. I've found that tapping shift, alt, and control 172That happens to me, also. I've found that tapping shift, alt, and control
diff --git a/Documentation/tomoyo.txt b/Documentation/tomoyo.txt
new file mode 100644
index 000000000000..b3a232cae7f8
--- /dev/null
+++ b/Documentation/tomoyo.txt
@@ -0,0 +1,55 @@
1--- What is TOMOYO? ---
2
3TOMOYO is a name-based MAC extension (LSM module) for the Linux kernel.
4
5LiveCD-based tutorials are available at
6http://tomoyo.sourceforge.jp/en/1.6.x/1st-step/ubuntu8.04-live/
7http://tomoyo.sourceforge.jp/en/1.6.x/1st-step/centos5-live/ .
8Though these tutorials use non-LSM version of TOMOYO, they are useful for you
9to know what TOMOYO is.
10
11--- How to enable TOMOYO? ---
12
13Build the kernel with CONFIG_SECURITY_TOMOYO=y and pass "security=tomoyo" on
14kernel's command line.
15
16Please see http://tomoyo.sourceforge.jp/en/2.2.x/ for details.
17
18--- Where is documentation? ---
19
20User <-> Kernel interface documentation is available at
21http://tomoyo.sourceforge.jp/en/2.2.x/policy-reference.html .
22
23Materials we prepared for seminars and symposiums are available at
24http://sourceforge.jp/projects/tomoyo/docs/?category_id=532&language_id=1 .
25Below lists are chosen from three aspects.
26
27What is TOMOYO?
28 TOMOYO Linux Overview
29 http://sourceforge.jp/projects/tomoyo/docs/lca2009-takeda.pdf
30 TOMOYO Linux: pragmatic and manageable security for Linux
31 http://sourceforge.jp/projects/tomoyo/docs/freedomhectaipei-tomoyo.pdf
32 TOMOYO Linux: A Practical Method to Understand and Protect Your Own Linux Box
33 http://sourceforge.jp/projects/tomoyo/docs/PacSec2007-en-no-demo.pdf
34
35What can TOMOYO do?
36 Deep inside TOMOYO Linux
37 http://sourceforge.jp/projects/tomoyo/docs/lca2009-kumaneko.pdf
38 The role of "pathname based access control" in security.
39 http://sourceforge.jp/projects/tomoyo/docs/lfj2008-bof.pdf
40
41History of TOMOYO?
42 Realities of Mainlining
43 http://sourceforge.jp/projects/tomoyo/docs/lfj2008.pdf
44
45--- What is future plan? ---
46
47We believe that inode based security and name based security are complementary
48and both should be used together. But unfortunately, so far, we cannot enable
49multiple LSM modules at the same time. We feel sorry that you have to give up
50SELinux/SMACK/AppArmor etc. when you want to use TOMOYO.
51
52We hope that LSM becomes stackable in future. Meanwhile, you can use non-LSM
53version of TOMOYO, available at http://tomoyo.sourceforge.jp/en/1.6.x/ .
54LSM version of TOMOYO is a subset of non-LSM version of TOMOYO. We are planning
55to port non-LSM version's functionalities to LSM versions.
diff --git a/Documentation/ftrace.txt b/Documentation/trace/ftrace.txt
index 803b1318b13d..fd9a3e693813 100644
--- a/Documentation/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -15,31 +15,31 @@ Introduction
15 15
16Ftrace is an internal tracer designed to help out developers and 16Ftrace is an internal tracer designed to help out developers and
17designers of systems to find what is going on inside the kernel. 17designers of systems to find what is going on inside the kernel.
18It can be used for debugging or analyzing latencies and performance 18It can be used for debugging or analyzing latencies and
19issues that take place outside of user-space. 19performance issues that take place outside of user-space.
20 20
21Although ftrace is the function tracer, it also includes an 21Although ftrace is the function tracer, it also includes an
22infrastructure that allows for other types of tracing. Some of the 22infrastructure that allows for other types of tracing. Some of
23tracers that are currently in ftrace include a tracer to trace 23the tracers that are currently in ftrace include a tracer to
24context switches, the time it takes for a high priority task to 24trace context switches, the time it takes for a high priority
25run after it was woken up, the time interrupts are disabled, and 25task to run after it was woken up, the time interrupts are
26more (ftrace allows for tracer plugins, which means that the list of 26disabled, and more (ftrace allows for tracer plugins, which
27tracers can always grow). 27means that the list of tracers can always grow).
28 28
29 29
30The File System 30The File System
31--------------- 31---------------
32 32
33Ftrace uses the debugfs file system to hold the control files as well 33Ftrace uses the debugfs file system to hold the control files as
34as the files to display output. 34well as the files to display output.
35 35
36To mount the debugfs system: 36To mount the debugfs system:
37 37
38 # mkdir /debug 38 # mkdir /debug
39 # mount -t debugfs nodev /debug 39 # mount -t debugfs nodev /debug
40 40
41(Note: it is more common to mount at /sys/kernel/debug, but for simplicity 41( Note: it is more common to mount at /sys/kernel/debug, but for
42 this document will use /debug) 42 simplicity this document will use /debug)
43 43
44That's it! (assuming that you have ftrace configured into your kernel) 44That's it! (assuming that you have ftrace configured into your kernel)
45 45
@@ -50,90 +50,124 @@ of ftrace. Here is a list of some of the key files:
50 50
51 Note: all time values are in microseconds. 51 Note: all time values are in microseconds.
52 52
53 current_tracer: This is used to set or display the current tracer 53 current_tracer:
54 that is configured. 54
55 55 This is used to set or display the current tracer
56 available_tracers: This holds the different types of tracers that 56 that is configured.
57 have been compiled into the kernel. The tracers 57
58 listed here can be configured by echoing their name 58 available_tracers:
59 into current_tracer. 59
60 60 This holds the different types of tracers that
61 tracing_enabled: This sets or displays whether the current_tracer 61 have been compiled into the kernel. The
62 is activated and tracing or not. Echo 0 into this 62 tracers listed here can be configured by
63 file to disable the tracer or 1 to enable it. 63 echoing their name into current_tracer.
64 64
65 trace: This file holds the output of the trace in a human readable 65 tracing_enabled:
66 format (described below). 66
67 67 This sets or displays whether the current_tracer
68 latency_trace: This file shows the same trace but the information 68 is activated and tracing or not. Echo 0 into this
69 is organized more to display possible latencies 69 file to disable the tracer or 1 to enable it.
70 in the system (described below). 70
71 71 trace:
72 trace_pipe: The output is the same as the "trace" file but this 72
73 file is meant to be streamed with live tracing. 73 This file holds the output of the trace in a human
74 Reads from this file will block until new data 74 readable format (described below).
75 is retrieved. Unlike the "trace" and "latency_trace" 75
76 files, this file is a consumer. This means reading 76 latency_trace:
77 from this file causes sequential reads to display 77
78 more current data. Once data is read from this 78 This file shows the same trace but the information
79 file, it is consumed, and will not be read 79 is organized more to display possible latencies
80 again with a sequential read. The "trace" and 80 in the system (described below).
81 "latency_trace" files are static, and if the 81
82 tracer is not adding more data, they will display 82 trace_pipe:
83 the same information every time they are read. 83
84 84 The output is the same as the "trace" file but this
85 trace_options: This file lets the user control the amount of data 85 file is meant to be streamed with live tracing.
86 that is displayed in one of the above output 86 Reads from this file will block until new data
87 files. 87 is retrieved. Unlike the "trace" and "latency_trace"
88 88 files, this file is a consumer. This means reading
89 trace_max_latency: Some of the tracers record the max latency. 89 from this file causes sequential reads to display
90 For example, the time interrupts are disabled. 90 more current data. Once data is read from this
91 This time is saved in this file. The max trace 91 file, it is consumed, and will not be read
92 will also be stored, and displayed by either 92 again with a sequential read. The "trace" and
93 "trace" or "latency_trace". A new max trace will 93 "latency_trace" files are static, and if the
94 only be recorded if the latency is greater than 94 tracer is not adding more data, they will display
95 the value in this file. (in microseconds) 95 the same information every time they are read.
96 96
97 buffer_size_kb: This sets or displays the number of kilobytes each CPU 97 trace_options:
98 buffer can hold. The tracer buffers are the same size 98
99 for each CPU. The displayed number is the size of the 99 This file lets the user control the amount of data
100 CPU buffer and not total size of all buffers. The 100 that is displayed in one of the above output
101 trace buffers are allocated in pages (blocks of memory 101 files.
102 that the kernel uses for allocation, usually 4 KB in size). 102
103 If the last page allocated has room for more bytes 103 tracing_max_latency:
104 than requested, the rest of the page will be used, 104
105 making the actual allocation bigger than requested. 105 Some of the tracers record the max latency.
106 (Note, the size may not be a multiple of the page size due 106 For example, the time interrupts are disabled.
107 to buffer managment overhead.) 107 This time is saved in this file. The max trace
108 108 will also be stored, and displayed by either
109 This can only be updated when the current_tracer 109 "trace" or "latency_trace". A new max trace will
110 is set to "nop". 110 only be recorded if the latency is greater than
111 111 the value in this file. (in microseconds)
112 tracing_cpumask: This is a mask that lets the user only trace 112
113 on specified CPUS. The format is a hex string 113 buffer_size_kb:
114 representing the CPUS. 114
115 115 This sets or displays the number of kilobytes each CPU
116 set_ftrace_filter: When dynamic ftrace is configured in (see the 116 buffer can hold. The tracer buffers are the same size
117 section below "dynamic ftrace"), the code is dynamically 117 for each CPU. The displayed number is the size of the
118 modified (code text rewrite) to disable calling of the 118 CPU buffer and not total size of all buffers. The
119 function profiler (mcount). This lets tracing be configured 119 trace buffers are allocated in pages (blocks of memory
120 in with practically no overhead in performance. This also 120 that the kernel uses for allocation, usually 4 KB in size).
121 has a side effect of enabling or disabling specific functions 121 If the last page allocated has room for more bytes
122 to be traced. Echoing names of functions into this file 122 than requested, the rest of the page will be used,
123 will limit the trace to only those functions. 123 making the actual allocation bigger than requested.
124 124 ( Note, the size may not be a multiple of the page size
125 set_ftrace_notrace: This has an effect opposite to that of 125 due to buffer managment overhead. )
126 set_ftrace_filter. Any function that is added here will not 126
127 be traced. If a function exists in both set_ftrace_filter 127 This can only be updated when the current_tracer
128 and set_ftrace_notrace, the function will _not_ be traced. 128 is set to "nop".
129 129
130 set_ftrace_pid: Have the function tracer only trace a single thread. 130 tracing_cpumask:
131 131
132 available_filter_functions: This lists the functions that ftrace 132 This is a mask that lets the user only trace
133 has processed and can trace. These are the function 133 on specified CPUS. The format is a hex string
134 names that you can pass to "set_ftrace_filter" or 134 representing the CPUS.
135 "set_ftrace_notrace". (See the section "dynamic ftrace" 135
136 below for more details.) 136 set_ftrace_filter:
137
138 When dynamic ftrace is configured in (see the
139 section below "dynamic ftrace"), the code is dynamically
140 modified (code text rewrite) to disable calling of the
141 function profiler (mcount). This lets tracing be configured
142 in with practically no overhead in performance. This also
143 has a side effect of enabling or disabling specific functions
144 to be traced. Echoing names of functions into this file
145 will limit the trace to only those functions.
146
147 set_ftrace_notrace:
148
149 This has an effect opposite to that of
150 set_ftrace_filter. Any function that is added here will not
151 be traced. If a function exists in both set_ftrace_filter
152 and set_ftrace_notrace, the function will _not_ be traced.
153
154 set_ftrace_pid:
155
156 Have the function tracer only trace a single thread.
157
158 set_graph_function:
159
160 Set a "trigger" function where tracing should start
161 with the function graph tracer (See the section
162 "dynamic ftrace" for more details).
163
164 available_filter_functions:
165
166 This lists the functions that ftrace
167 has processed and can trace. These are the function
168 names that you can pass to "set_ftrace_filter" or
169 "set_ftrace_notrace". (See the section "dynamic ftrace"
170 below for more details.)
137 171
138 172
139The Tracers 173The Tracers
@@ -141,36 +175,66 @@ The Tracers
141 175
142Here is the list of current tracers that may be configured. 176Here is the list of current tracers that may be configured.
143 177
144 function - function tracer that uses mcount to trace all functions. 178 "function"
179
180 Function call tracer to trace all kernel functions.
181
182 "function_graph_tracer"
183
184 Similar to the function tracer except that the
185 function tracer probes the functions on their entry
186 whereas the function graph tracer traces on both entry
187 and exit of the functions. It then provides the ability
188 to draw a graph of function calls similar to C code
189 source.
145 190
146 sched_switch - traces the context switches between tasks. 191 "sched_switch"
147 192
148 irqsoff - traces the areas that disable interrupts and saves 193 Traces the context switches and wakeups between tasks.
149 the trace with the longest max latency.
150 See tracing_max_latency. When a new max is recorded,
151 it replaces the old trace. It is best to view this
152 trace via the latency_trace file.
153 194
154 preemptoff - Similar to irqsoff but traces and records the amount of 195 "irqsoff"
155 time for which preemption is disabled.
156 196
157 preemptirqsoff - Similar to irqsoff and preemptoff, but traces and 197 Traces the areas that disable interrupts and saves
158 records the largest time for which irqs and/or preemption 198 the trace with the longest max latency.
159 is disabled. 199 See tracing_max_latency. When a new max is recorded,
200 it replaces the old trace. It is best to view this
201 trace via the latency_trace file.
160 202
161 wakeup - Traces and records the max latency that it takes for 203 "preemptoff"
162 the highest priority task to get scheduled after
163 it has been woken up.
164 204
165 nop - This is not a tracer. To remove all tracers from tracing 205 Similar to irqsoff but traces and records the amount of
166 simply echo "nop" into current_tracer. 206 time for which preemption is disabled.
207
208 "preemptirqsoff"
209
210 Similar to irqsoff and preemptoff, but traces and
211 records the largest time for which irqs and/or preemption
212 is disabled.
213
214 "wakeup"
215
216 Traces and records the max latency that it takes for
217 the highest priority task to get scheduled after
218 it has been woken up.
219
220 "hw-branch-tracer"
221
222 Uses the BTS CPU feature on x86 CPUs to traces all
223 branches executed.
224
225 "nop"
226
227 This is the "trace nothing" tracer. To remove all
228 tracers from tracing simply echo "nop" into
229 current_tracer.
167 230
168 231
169Examples of using the tracer 232Examples of using the tracer
170---------------------------- 233----------------------------
171 234
172Here are typical examples of using the tracers when controlling them only 235Here are typical examples of using the tracers when controlling
173with the debugfs interface (without using any user-land utilities). 236them only with the debugfs interface (without using any
237user-land utilities).
174 238
175Output format: 239Output format:
176-------------- 240--------------
@@ -187,16 +251,16 @@ Here is an example of the output format of the file "trace"
187 bash-4251 [01] 10152.583855: _atomic_dec_and_lock <-dput 251 bash-4251 [01] 10152.583855: _atomic_dec_and_lock <-dput
188 -------- 252 --------
189 253
190A header is printed with the tracer name that is represented by the trace. 254A header is printed with the tracer name that is represented by
191In this case the tracer is "function". Then a header showing the format. Task 255the trace. In this case the tracer is "function". Then a header
192name "bash", the task PID "4251", the CPU that it was running on 256showing the format. Task name "bash", the task PID "4251", the
193"01", the timestamp in <secs>.<usecs> format, the function name that was 257CPU that it was running on "01", the timestamp in <secs>.<usecs>
194traced "path_put" and the parent function that called this function 258format, the function name that was traced "path_put" and the
195"path_walk". The timestamp is the time at which the function was 259parent function that called this function "path_walk". The
196entered. 260timestamp is the time at which the function was entered.
197 261
198The sched_switch tracer also includes tracing of task wakeups and 262The sched_switch tracer also includes tracing of task wakeups
199context switches. 263and context switches.
200 264
201 ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 2916:115:S 265 ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 2916:115:S
202 ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 10:115:S 266 ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 10:115:S
@@ -205,8 +269,8 @@ context switches.
205 kondemand/1-2916 [01] 1453.070013: 2916:115:S ==> 7:115:R 269 kondemand/1-2916 [01] 1453.070013: 2916:115:S ==> 7:115:R
206 ksoftirqd/1-7 [01] 1453.070013: 7:115:S ==> 0:140:R 270 ksoftirqd/1-7 [01] 1453.070013: 7:115:S ==> 0:140:R
207 271
208Wake ups are represented by a "+" and the context switches are shown as 272Wake ups are represented by a "+" and the context switches are
209"==>". The format is: 273shown as "==>". The format is:
210 274
211 Context switches: 275 Context switches:
212 276
@@ -220,19 +284,20 @@ Wake ups are represented by a "+" and the context switches are shown as
220 284
221 <pid>:<prio>:<state> + <pid>:<prio>:<state> 285 <pid>:<prio>:<state> + <pid>:<prio>:<state>
222 286
223The prio is the internal kernel priority, which is the inverse of the 287The prio is the internal kernel priority, which is the inverse
224priority that is usually displayed by user-space tools. Zero represents 288of the priority that is usually displayed by user-space tools.
225the highest priority (99). Prio 100 starts the "nice" priorities with 289Zero represents the highest priority (99). Prio 100 starts the
226100 being equal to nice -20 and 139 being nice 19. The prio "140" is 290"nice" priorities with 100 being equal to nice -20 and 139 being
227reserved for the idle task which is the lowest priority thread (pid 0). 291nice 19. The prio "140" is reserved for the idle task which is
292the lowest priority thread (pid 0).
228 293
229 294
230Latency trace format 295Latency trace format
231-------------------- 296--------------------
232 297
233For traces that display latency times, the latency_trace file gives 298For traces that display latency times, the latency_trace file
234somewhat more information to see why a latency happened. Here is a typical 299gives somewhat more information to see why a latency happened.
235trace. 300Here is a typical trace.
236 301
237# tracer: irqsoff 302# tracer: irqsoff
238# 303#
@@ -259,20 +324,20 @@ irqsoff latency trace v1.1.5 on 2.6.26-rc8
259 <idle>-0 0d.s1 98us : trace_hardirqs_on (do_softirq) 324 <idle>-0 0d.s1 98us : trace_hardirqs_on (do_softirq)
260 325
261 326
327This shows that the current tracer is "irqsoff" tracing the time
328for which interrupts were disabled. It gives the trace version
329and the version of the kernel upon which this was executed on
330(2.6.26-rc8). Then it displays the max latency in microsecs (97
331us). The number of trace entries displayed and the total number
332recorded (both are three: #3/3). The type of preemption that was
333used (PREEMPT). VP, KP, SP, and HP are always zero and are
334reserved for later use. #P is the number of online CPUS (#P:2).
262 335
263This shows that the current tracer is "irqsoff" tracing the time for which 336The task is the process that was running when the latency
264interrupts were disabled. It gives the trace version and the version 337occurred. (swapper pid: 0).
265of the kernel upon which this was executed on (2.6.26-rc8). Then it displays
266the max latency in microsecs (97 us). The number of trace entries displayed
267and the total number recorded (both are three: #3/3). The type of
268preemption that was used (PREEMPT). VP, KP, SP, and HP are always zero
269and are reserved for later use. #P is the number of online CPUS (#P:2).
270
271The task is the process that was running when the latency occurred.
272(swapper pid: 0).
273 338
274The start and stop (the functions in which the interrupts were disabled and 339The start and stop (the functions in which the interrupts were
275enabled respectively) that caused the latencies: 340disabled and enabled respectively) that caused the latencies:
276 341
277 apic_timer_interrupt is where the interrupts were disabled. 342 apic_timer_interrupt is where the interrupts were disabled.
278 do_softirq is where they were enabled again. 343 do_softirq is where they were enabled again.
@@ -308,12 +373,12 @@ The above is mostly meaningful for kernel developers.
308 latency_trace file is relative to the start of the trace. 373 latency_trace file is relative to the start of the trace.
309 374
310 delay: This is just to help catch your eye a bit better. And 375 delay: This is just to help catch your eye a bit better. And
311 needs to be fixed to be only relative to the same CPU. 376 needs to be fixed to be only relative to the same CPU.
312 The marks are determined by the difference between this 377 The marks are determined by the difference between this
313 current trace and the next trace. 378 current trace and the next trace.
314 '!' - greater than preempt_mark_thresh (default 100) 379 '!' - greater than preempt_mark_thresh (default 100)
315 '+' - greater than 1 microsecond 380 '+' - greater than 1 microsecond
316 ' ' - less than or equal to 1 microsecond. 381 ' ' - less than or equal to 1 microsecond.
317 382
318 The rest is the same as the 'trace' file. 383 The rest is the same as the 'trace' file.
319 384
@@ -321,14 +386,15 @@ The above is mostly meaningful for kernel developers.
321trace_options 386trace_options
322------------- 387-------------
323 388
324The trace_options file is used to control what gets printed in the trace 389The trace_options file is used to control what gets printed in
325output. To see what is available, simply cat the file: 390the trace output. To see what is available, simply cat the file:
326 391
327 cat /debug/tracing/trace_options 392 cat /debug/tracing/trace_options
328 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ 393 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
329 noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj 394 noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj
330 395
331To disable one of the options, echo in the option prepended with "no". 396To disable one of the options, echo in the option prepended with
397"no".
332 398
333 echo noprint-parent > /debug/tracing/trace_options 399 echo noprint-parent > /debug/tracing/trace_options
334 400
@@ -338,8 +404,8 @@ To enable an option, leave off the "no".
338 404
339Here are the available options: 405Here are the available options:
340 406
341 print-parent - On function traces, display the calling function 407 print-parent - On function traces, display the calling (parent)
342 as well as the function being traced. 408 function as well as the function being traced.
343 409
344 print-parent: 410 print-parent:
345 bash-4000 [01] 1477.606694: simple_strtoul <-strict_strtoul 411 bash-4000 [01] 1477.606694: simple_strtoul <-strict_strtoul
@@ -348,15 +414,16 @@ Here are the available options:
348 bash-4000 [01] 1477.606694: simple_strtoul 414 bash-4000 [01] 1477.606694: simple_strtoul
349 415
350 416
351 sym-offset - Display not only the function name, but also the offset 417 sym-offset - Display not only the function name, but also the
352 in the function. For example, instead of seeing just 418 offset in the function. For example, instead of
353 "ktime_get", you will see "ktime_get+0xb/0x20". 419 seeing just "ktime_get", you will see
420 "ktime_get+0xb/0x20".
354 421
355 sym-offset: 422 sym-offset:
356 bash-4000 [01] 1477.606694: simple_strtoul+0x6/0xa0 423 bash-4000 [01] 1477.606694: simple_strtoul+0x6/0xa0
357 424
358 sym-addr - this will also display the function address as well as 425 sym-addr - this will also display the function address as well
359 the function name. 426 as the function name.
360 427
361 sym-addr: 428 sym-addr:
362 bash-4000 [01] 1477.606694: simple_strtoul <c0339346> 429 bash-4000 [01] 1477.606694: simple_strtoul <c0339346>
@@ -366,35 +433,41 @@ Here are the available options:
366 bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \ 433 bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \
367 (+0.000ms): simple_strtoul (strict_strtoul) 434 (+0.000ms): simple_strtoul (strict_strtoul)
368 435
369 raw - This will display raw numbers. This option is best for use with 436 raw - This will display raw numbers. This option is best for
370 user applications that can translate the raw numbers better than 437 use with user applications that can translate the raw
371 having it done in the kernel. 438 numbers better than having it done in the kernel.
372 439
373 hex - Similar to raw, but the numbers will be in a hexadecimal format. 440 hex - Similar to raw, but the numbers will be in a hexadecimal
441 format.
374 442
375 bin - This will print out the formats in raw binary. 443 bin - This will print out the formats in raw binary.
376 444
377 block - TBD (needs update) 445 block - TBD (needs update)
378 446
379 stacktrace - This is one of the options that changes the trace itself. 447 stacktrace - This is one of the options that changes the trace
380 When a trace is recorded, so is the stack of functions. 448 itself. When a trace is recorded, so is the stack
381 This allows for back traces of trace sites. 449 of functions. This allows for back traces of
450 trace sites.
382 451
383 userstacktrace - This option changes the trace. 452 userstacktrace - This option changes the trace. It records a
384 It records a stacktrace of the current userspace thread. 453 stacktrace of the current userspace thread.
385 454
386 sym-userobj - when user stacktrace are enabled, look up which object the 455 sym-userobj - when user stacktrace are enabled, look up which
387 address belongs to, and print a relative address 456 object the address belongs to, and print a
388 This is especially useful when ASLR is on, otherwise you don't 457 relative address. This is especially useful when
389 get a chance to resolve the address to object/file/line after the app is no 458 ASLR is on, otherwise you don't get a chance to
390 longer running 459 resolve the address to object/file/line after
460 the app is no longer running
391 461
392 The lookup is performed when you read trace,trace_pipe,latency_trace. Example: 462 The lookup is performed when you read
463 trace,trace_pipe,latency_trace. Example:
393 464
394 a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 465 a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0
395x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] 466x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
396 467
397 sched-tree - TBD (any users??) 468 sched-tree - trace all tasks that are on the runqueue, at
469 every scheduling event. Will add overhead if
470 there's a lot of tasks running at once.
398 471
399 472
400sched_switch 473sched_switch
@@ -431,18 +504,19 @@ of how to use it.
431 [...] 504 [...]
432 505
433 506
434As we have discussed previously about this format, the header shows 507As we have discussed previously about this format, the header
435the name of the trace and points to the options. The "FUNCTION" 508shows the name of the trace and points to the options. The
436is a misnomer since here it represents the wake ups and context 509"FUNCTION" is a misnomer since here it represents the wake ups
437switches. 510and context switches.
438 511
439The sched_switch file only lists the wake ups (represented with '+') 512The sched_switch file only lists the wake ups (represented with
440and context switches ('==>') with the previous task or current task 513'+') and context switches ('==>') with the previous task or
441first followed by the next task or task waking up. The format for both 514current task first followed by the next task or task waking up.
442of these is PID:KERNEL-PRIO:TASK-STATE. Remember that the KERNEL-PRIO 515The format for both of these is PID:KERNEL-PRIO:TASK-STATE.
443is the inverse of the actual priority with zero (0) being the highest 516Remember that the KERNEL-PRIO is the inverse of the actual
444priority and the nice values starting at 100 (nice -20). Below is 517priority with zero (0) being the highest priority and the nice
445a quick chart to map the kernel priority to user land priorities. 518values starting at 100 (nice -20). Below is a quick chart to map
519the kernel priority to user land priorities.
446 520
447 Kernel priority: 0 to 99 ==> user RT priority 99 to 0 521 Kernel priority: 0 to 99 ==> user RT priority 99 to 0
448 Kernel priority: 100 to 139 ==> user nice -20 to 19 522 Kernel priority: 100 to 139 ==> user nice -20 to 19
@@ -463,10 +537,10 @@ The task states are:
463ftrace_enabled 537ftrace_enabled
464-------------- 538--------------
465 539
466The following tracers (listed below) give different output depending 540The following tracers (listed below) give different output
467on whether or not the sysctl ftrace_enabled is set. To set ftrace_enabled, 541depending on whether or not the sysctl ftrace_enabled is set. To
468one can either use the sysctl function or set it via the proc 542set ftrace_enabled, one can either use the sysctl function or
469file system interface. 543set it via the proc file system interface.
470 544
471 sysctl kernel.ftrace_enabled=1 545 sysctl kernel.ftrace_enabled=1
472 546
@@ -474,12 +548,12 @@ file system interface.
474 548
475 echo 1 > /proc/sys/kernel/ftrace_enabled 549 echo 1 > /proc/sys/kernel/ftrace_enabled
476 550
477To disable ftrace_enabled simply replace the '1' with '0' in 551To disable ftrace_enabled simply replace the '1' with '0' in the
478the above commands. 552above commands.
479 553
480When ftrace_enabled is set the tracers will also record the functions 554When ftrace_enabled is set the tracers will also record the
481that are within the trace. The descriptions of the tracers 555functions that are within the trace. The descriptions of the
482will also show an example with ftrace enabled. 556tracers will also show an example with ftrace enabled.
483 557
484 558
485irqsoff 559irqsoff
@@ -487,17 +561,18 @@ irqsoff
487 561
488When interrupts are disabled, the CPU can not react to any other 562When interrupts are disabled, the CPU can not react to any other
489external event (besides NMIs and SMIs). This prevents the timer 563external event (besides NMIs and SMIs). This prevents the timer
490interrupt from triggering or the mouse interrupt from letting the 564interrupt from triggering or the mouse interrupt from letting
491kernel know of a new mouse event. The result is a latency with the 565the kernel know of a new mouse event. The result is a latency
492reaction time. 566with the reaction time.
493 567
494The irqsoff tracer tracks the time for which interrupts are disabled. 568The irqsoff tracer tracks the time for which interrupts are
495When a new maximum latency is hit, the tracer saves the trace leading up 569disabled. When a new maximum latency is hit, the tracer saves
496to that latency point so that every time a new maximum is reached, the old 570the trace leading up to that latency point so that every time a
497saved trace is discarded and the new trace is saved. 571new maximum is reached, the old saved trace is discarded and the
572new trace is saved.
498 573
499To reset the maximum, echo 0 into tracing_max_latency. Here is an 574To reset the maximum, echo 0 into tracing_max_latency. Here is
500example: 575an example:
501 576
502 # echo irqsoff > /debug/tracing/current_tracer 577 # echo irqsoff > /debug/tracing/current_tracer
503 # echo 0 > /debug/tracing/tracing_max_latency 578 # echo 0 > /debug/tracing/tracing_max_latency
@@ -532,10 +607,11 @@ irqsoff latency trace v1.1.5 on 2.6.26
532 607
533 608
534Here we see that that we had a latency of 12 microsecs (which is 609Here we see that that we had a latency of 12 microsecs (which is
535very good). The _write_lock_irq in sys_setpgid disabled interrupts. 610very good). The _write_lock_irq in sys_setpgid disabled
536The difference between the 12 and the displayed timestamp 14us occurred 611interrupts. The difference between the 12 and the displayed
537because the clock was incremented between the time of recording the max 612timestamp 14us occurred because the clock was incremented
538latency and the time of recording the function that had that latency. 613between the time of recording the max latency and the time of
614recording the function that had that latency.
539 615
540Note the above example had ftrace_enabled not set. If we set the 616Note the above example had ftrace_enabled not set. If we set the
541ftrace_enabled, we get a much larger output: 617ftrace_enabled, we get a much larger output:
@@ -586,24 +662,24 @@ irqsoff latency trace v1.1.5 on 2.6.26-rc8
586 662
587 663
588Here we traced a 50 microsecond latency. But we also see all the 664Here we traced a 50 microsecond latency. But we also see all the
589functions that were called during that time. Note that by enabling 665functions that were called during that time. Note that by
590function tracing, we incur an added overhead. This overhead may 666enabling function tracing, we incur an added overhead. This
591extend the latency times. But nevertheless, this trace has provided 667overhead may extend the latency times. But nevertheless, this
592some very helpful debugging information. 668trace has provided some very helpful debugging information.
593 669
594 670
595preemptoff 671preemptoff
596---------- 672----------
597 673
598When preemption is disabled, we may be able to receive interrupts but 674When preemption is disabled, we may be able to receive
599the task cannot be preempted and a higher priority task must wait 675interrupts but the task cannot be preempted and a higher
600for preemption to be enabled again before it can preempt a lower 676priority task must wait for preemption to be enabled again
601priority task. 677before it can preempt a lower priority task.
602 678
603The preemptoff tracer traces the places that disable preemption. 679The preemptoff tracer traces the places that disable preemption.
604Like the irqsoff tracer, it records the maximum latency for which preemption 680Like the irqsoff tracer, it records the maximum latency for
605was disabled. The control of preemptoff tracer is much like the irqsoff 681which preemption was disabled. The control of preemptoff tracer
606tracer. 682is much like the irqsoff tracer.
607 683
608 # echo preemptoff > /debug/tracing/current_tracer 684 # echo preemptoff > /debug/tracing/current_tracer
609 # echo 0 > /debug/tracing/tracing_max_latency 685 # echo 0 > /debug/tracing/tracing_max_latency
@@ -637,11 +713,12 @@ preemptoff latency trace v1.1.5 on 2.6.26-rc8
637 sshd-4261 0d.s1 30us : trace_preempt_on (__do_softirq) 713 sshd-4261 0d.s1 30us : trace_preempt_on (__do_softirq)
638 714
639 715
640This has some more changes. Preemption was disabled when an interrupt 716This has some more changes. Preemption was disabled when an
641came in (notice the 'h'), and was enabled while doing a softirq. 717interrupt came in (notice the 'h'), and was enabled while doing
642(notice the 's'). But we also see that interrupts have been disabled 718a softirq. (notice the 's'). But we also see that interrupts
643when entering the preempt off section and leaving it (the 'd'). 719have been disabled when entering the preempt off section and
644We do not know if interrupts were enabled in the mean time. 720leaving it (the 'd'). We do not know if interrupts were enabled
721in the mean time.
645 722
646# tracer: preemptoff 723# tracer: preemptoff
647# 724#
@@ -700,28 +777,30 @@ preemptoff latency trace v1.1.5 on 2.6.26-rc8
700 sshd-4261 0d.s1 64us : trace_preempt_on (__do_softirq) 777 sshd-4261 0d.s1 64us : trace_preempt_on (__do_softirq)
701 778
702 779
703The above is an example of the preemptoff trace with ftrace_enabled 780The above is an example of the preemptoff trace with
704set. Here we see that interrupts were disabled the entire time. 781ftrace_enabled set. Here we see that interrupts were disabled
705The irq_enter code lets us know that we entered an interrupt 'h'. 782the entire time. The irq_enter code lets us know that we entered
706Before that, the functions being traced still show that it is not 783an interrupt 'h'. Before that, the functions being traced still
707in an interrupt, but we can see from the functions themselves that 784show that it is not in an interrupt, but we can see from the
708this is not the case. 785functions themselves that this is not the case.
709 786
710Notice that __do_softirq when called does not have a preempt_count. 787Notice that __do_softirq when called does not have a
711It may seem that we missed a preempt enabling. What really happened 788preempt_count. It may seem that we missed a preempt enabling.
712is that the preempt count is held on the thread's stack and we 789What really happened is that the preempt count is held on the
713switched to the softirq stack (4K stacks in effect). The code 790thread's stack and we switched to the softirq stack (4K stacks
714does not copy the preempt count, but because interrupts are disabled, 791in effect). The code does not copy the preempt count, but
715we do not need to worry about it. Having a tracer like this is good 792because interrupts are disabled, we do not need to worry about
716for letting people know what really happens inside the kernel. 793it. Having a tracer like this is good for letting people know
794what really happens inside the kernel.
717 795
718 796
719preemptirqsoff 797preemptirqsoff
720-------------- 798--------------
721 799
722Knowing the locations that have interrupts disabled or preemption 800Knowing the locations that have interrupts disabled or
723disabled for the longest times is helpful. But sometimes we would 801preemption disabled for the longest times is helpful. But
724like to know when either preemption and/or interrupts are disabled. 802sometimes we would like to know when either preemption and/or
803interrupts are disabled.
725 804
726Consider the following code: 805Consider the following code:
727 806
@@ -741,11 +820,13 @@ The preemptoff tracer will record the total length of
741call_function_with_irqs_and_preemption_off() and 820call_function_with_irqs_and_preemption_off() and
742call_function_with_preemption_off(). 821call_function_with_preemption_off().
743 822
744But neither will trace the time that interrupts and/or preemption 823But neither will trace the time that interrupts and/or
745is disabled. This total time is the time that we can not schedule. 824preemption is disabled. This total time is the time that we can
746To record this time, use the preemptirqsoff tracer. 825not schedule. To record this time, use the preemptirqsoff
826tracer.
747 827
748Again, using this trace is much like the irqsoff and preemptoff tracers. 828Again, using this trace is much like the irqsoff and preemptoff
829tracers.
749 830
750 # echo preemptirqsoff > /debug/tracing/current_tracer 831 # echo preemptirqsoff > /debug/tracing/current_tracer
751 # echo 0 > /debug/tracing/tracing_max_latency 832 # echo 0 > /debug/tracing/tracing_max_latency
@@ -781,9 +862,10 @@ preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
781 862
782 863
783The trace_hardirqs_off_thunk is called from assembly on x86 when 864The trace_hardirqs_off_thunk is called from assembly on x86 when
784interrupts are disabled in the assembly code. Without the function 865interrupts are disabled in the assembly code. Without the
785tracing, we do not know if interrupts were enabled within the preemption 866function tracing, we do not know if interrupts were enabled
786points. We do see that it started with preemption enabled. 867within the preemption points. We do see that it started with
868preemption enabled.
787 869
788Here is a trace with ftrace_enabled set: 870Here is a trace with ftrace_enabled set:
789 871
@@ -871,40 +953,42 @@ preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
871 sshd-4261 0d.s1 105us : trace_preempt_on (__do_softirq) 953 sshd-4261 0d.s1 105us : trace_preempt_on (__do_softirq)
872 954
873 955
874This is a very interesting trace. It started with the preemption of 956This is a very interesting trace. It started with the preemption
875the ls task. We see that the task had the "need_resched" bit set 957of the ls task. We see that the task had the "need_resched" bit
876via the 'N' in the trace. Interrupts were disabled before the spin_lock 958set via the 'N' in the trace. Interrupts were disabled before
877at the beginning of the trace. We see that a schedule took place to run 959the spin_lock at the beginning of the trace. We see that a
878sshd. When the interrupts were enabled, we took an interrupt. 960schedule took place to run sshd. When the interrupts were
879On return from the interrupt handler, the softirq ran. We took another 961enabled, we took an interrupt. On return from the interrupt
880interrupt while running the softirq as we see from the capital 'H'. 962handler, the softirq ran. We took another interrupt while
963running the softirq as we see from the capital 'H'.
881 964
882 965
883wakeup 966wakeup
884------ 967------
885 968
886In a Real-Time environment it is very important to know the wakeup 969In a Real-Time environment it is very important to know the
887time it takes for the highest priority task that is woken up to the 970wakeup time it takes for the highest priority task that is woken
888time that it executes. This is also known as "schedule latency". 971up to the time that it executes. This is also known as "schedule
889I stress the point that this is about RT tasks. It is also important 972latency". I stress the point that this is about RT tasks. It is
890to know the scheduling latency of non-RT tasks, but the average 973also important to know the scheduling latency of non-RT tasks,
891schedule latency is better for non-RT tasks. Tools like 974but the average schedule latency is better for non-RT tasks.
892LatencyTop are more appropriate for such measurements. 975Tools like LatencyTop are more appropriate for such
976measurements.
893 977
894Real-Time environments are interested in the worst case latency. 978Real-Time environments are interested in the worst case latency.
895That is the longest latency it takes for something to happen, and 979That is the longest latency it takes for something to happen,
896not the average. We can have a very fast scheduler that may only 980and not the average. We can have a very fast scheduler that may
897have a large latency once in a while, but that would not work well 981only have a large latency once in a while, but that would not
898with Real-Time tasks. The wakeup tracer was designed to record 982work well with Real-Time tasks. The wakeup tracer was designed
899the worst case wakeups of RT tasks. Non-RT tasks are not recorded 983to record the worst case wakeups of RT tasks. Non-RT tasks are
900because the tracer only records one worst case and tracing non-RT 984not recorded because the tracer only records one worst case and
901tasks that are unpredictable will overwrite the worst case latency 985tracing non-RT tasks that are unpredictable will overwrite the
902of RT tasks. 986worst case latency of RT tasks.
903 987
904Since this tracer only deals with RT tasks, we will run this slightly 988Since this tracer only deals with RT tasks, we will run this
905differently than we did with the previous tracers. Instead of performing 989slightly differently than we did with the previous tracers.
906an 'ls', we will run 'sleep 1' under 'chrt' which changes the 990Instead of performing an 'ls', we will run 'sleep 1' under
907priority of the task. 991'chrt' which changes the priority of the task.
908 992
909 # echo wakeup > /debug/tracing/current_tracer 993 # echo wakeup > /debug/tracing/current_tracer
910 # echo 0 > /debug/tracing/tracing_max_latency 994 # echo 0 > /debug/tracing/tracing_max_latency
@@ -934,17 +1018,16 @@ wakeup latency trace v1.1.5 on 2.6.26-rc8
934 <idle>-0 1d..4 4us : schedule (cpu_idle) 1018 <idle>-0 1d..4 4us : schedule (cpu_idle)
935 1019
936 1020
1021Running this on an idle system, we see that it only took 4
1022microseconds to perform the task switch. Note, since the trace
1023marker in the schedule is before the actual "switch", we stop
1024the tracing when the recorded task is about to schedule in. This
1025may change if we add a new marker at the end of the scheduler.
937 1026
938Running this on an idle system, we see that it only took 4 microseconds 1027Notice that the recorded task is 'sleep' with the PID of 4901
939to perform the task switch. Note, since the trace marker in the 1028and it has an rt_prio of 5. This priority is user-space priority
940schedule is before the actual "switch", we stop the tracing when 1029and not the internal kernel priority. The policy is 1 for
941the recorded task is about to schedule in. This may change if 1030SCHED_FIFO and 2 for SCHED_RR.
942we add a new marker at the end of the scheduler.
943
944Notice that the recorded task is 'sleep' with the PID of 4901 and it
945has an rt_prio of 5. This priority is user-space priority and not
946the internal kernel priority. The policy is 1 for SCHED_FIFO and 2
947for SCHED_RR.
948 1031
949Doing the same with chrt -r 5 and ftrace_enabled set. 1032Doing the same with chrt -r 5 and ftrace_enabled set.
950 1033
@@ -1001,24 +1084,25 @@ ksoftirq-7 1d..6 49us : _spin_unlock (tracing_record_cmdline)
1001ksoftirq-7 1d..6 49us : sub_preempt_count (_spin_unlock) 1084ksoftirq-7 1d..6 49us : sub_preempt_count (_spin_unlock)
1002ksoftirq-7 1d..4 50us : schedule (__cond_resched) 1085ksoftirq-7 1d..4 50us : schedule (__cond_resched)
1003 1086
1004The interrupt went off while running ksoftirqd. This task runs at 1087The interrupt went off while running ksoftirqd. This task runs
1005SCHED_OTHER. Why did not we see the 'N' set early? This may be 1088at SCHED_OTHER. Why did not we see the 'N' set early? This may
1006a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K stacks 1089be a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K
1007configured, the interrupt and softirq run with their own stack. 1090stacks configured, the interrupt and softirq run with their own
1008Some information is held on the top of the task's stack (need_resched 1091stack. Some information is held on the top of the task's stack
1009and preempt_count are both stored there). The setting of the NEED_RESCHED 1092(need_resched and preempt_count are both stored there). The
1010bit is done directly to the task's stack, but the reading of the 1093setting of the NEED_RESCHED bit is done directly to the task's
1011NEED_RESCHED is done by looking at the current stack, which in this case 1094stack, but the reading of the NEED_RESCHED is done by looking at
1012is the stack for the hard interrupt. This hides the fact that NEED_RESCHED 1095the current stack, which in this case is the stack for the hard
1013has been set. We do not see the 'N' until we switch back to the task's 1096interrupt. This hides the fact that NEED_RESCHED has been set.
1097We do not see the 'N' until we switch back to the task's
1014assigned stack. 1098assigned stack.
1015 1099
1016function 1100function
1017-------- 1101--------
1018 1102
1019This tracer is the function tracer. Enabling the function tracer 1103This tracer is the function tracer. Enabling the function tracer
1020can be done from the debug file system. Make sure the ftrace_enabled is 1104can be done from the debug file system. Make sure the
1021set; otherwise this tracer is a nop. 1105ftrace_enabled is set; otherwise this tracer is a nop.
1022 1106
1023 # sysctl kernel.ftrace_enabled=1 1107 # sysctl kernel.ftrace_enabled=1
1024 # echo function > /debug/tracing/current_tracer 1108 # echo function > /debug/tracing/current_tracer
@@ -1048,14 +1132,15 @@ set; otherwise this tracer is a nop.
1048[...] 1132[...]
1049 1133
1050 1134
1051Note: function tracer uses ring buffers to store the above entries. 1135Note: function tracer uses ring buffers to store the above
1052The newest data may overwrite the oldest data. Sometimes using echo to 1136entries. The newest data may overwrite the oldest data.
1053stop the trace is not sufficient because the tracing could have overwritten 1137Sometimes using echo to stop the trace is not sufficient because
1054the data that you wanted to record. For this reason, it is sometimes better to 1138the tracing could have overwritten the data that you wanted to
1055disable tracing directly from a program. This allows you to stop the 1139record. For this reason, it is sometimes better to disable
1056tracing at the point that you hit the part that you are interested in. 1140tracing directly from a program. This allows you to stop the
1057To disable the tracing directly from a C program, something like following 1141tracing at the point that you hit the part that you are
1058code snippet can be used: 1142interested in. To disable the tracing directly from a C program,
1143something like following code snippet can be used:
1059 1144
1060int trace_fd; 1145int trace_fd;
1061[...] 1146[...]
@@ -1070,10 +1155,10 @@ int main(int argc, char *argv[]) {
1070} 1155}
1071 1156
1072Note: Here we hard coded the path name. The debugfs mount is not 1157Note: Here we hard coded the path name. The debugfs mount is not
1073guaranteed to be at /debug (and is more commonly at /sys/kernel/debug). 1158guaranteed to be at /debug (and is more commonly at
1074For simple one time traces, the above is sufficent. For anything else, 1159/sys/kernel/debug). For simple one time traces, the above is
1075a search through /proc/mounts may be needed to find where the debugfs 1160sufficent. For anything else, a search through /proc/mounts may
1076file-system is mounted. 1161be needed to find where the debugfs file-system is mounted.
1077 1162
1078 1163
1079Single thread tracing 1164Single thread tracing
@@ -1152,49 +1237,297 @@ int main (int argc, char **argv)
1152 return 0; 1237 return 0;
1153} 1238}
1154 1239
1240
1241hw-branch-tracer (x86 only)
1242---------------------------
1243
1244This tracer uses the x86 last branch tracing hardware feature to
1245collect a branch trace on all cpus with relatively low overhead.
1246
1247The tracer uses a fixed-size circular buffer per cpu and only
1248traces ring 0 branches. The trace file dumps that buffer in the
1249following format:
1250
1251# tracer: hw-branch-tracer
1252#
1253# CPU# TO <- FROM
1254 0 scheduler_tick+0xb5/0x1bf <- task_tick_idle+0x5/0x6
1255 2 run_posix_cpu_timers+0x2b/0x72a <- run_posix_cpu_timers+0x25/0x72a
1256 0 scheduler_tick+0x139/0x1bf <- scheduler_tick+0xed/0x1bf
1257 0 scheduler_tick+0x17c/0x1bf <- scheduler_tick+0x148/0x1bf
1258 2 run_posix_cpu_timers+0x9e/0x72a <- run_posix_cpu_timers+0x5e/0x72a
1259 0 scheduler_tick+0x1b6/0x1bf <- scheduler_tick+0x1aa/0x1bf
1260
1261
1262The tracer may be used to dump the trace for the oops'ing cpu on
1263a kernel oops into the system log. To enable this,
1264ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one
1265can either use the sysctl function or set it via the proc system
1266interface.
1267
1268 sysctl kernel.ftrace_dump_on_oops=1
1269
1270or
1271
1272 echo 1 > /proc/sys/kernel/ftrace_dump_on_oops
1273
1274
1275Here's an example of such a dump after a null pointer
1276dereference in a kernel module:
1277
1278[57848.105921] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
1279[57848.106019] IP: [<ffffffffa0000006>] open+0x6/0x14 [oops]
1280[57848.106019] PGD 2354e9067 PUD 2375e7067 PMD 0
1281[57848.106019] Oops: 0002 [#1] SMP
1282[57848.106019] last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:20:05.0/local_cpus
1283[57848.106019] Dumping ftrace buffer:
1284[57848.106019] ---------------------------------
1285[...]
1286[57848.106019] 0 chrdev_open+0xe6/0x165 <- cdev_put+0x23/0x24
1287[57848.106019] 0 chrdev_open+0x117/0x165 <- chrdev_open+0xfa/0x165
1288[57848.106019] 0 chrdev_open+0x120/0x165 <- chrdev_open+0x11c/0x165
1289[57848.106019] 0 chrdev_open+0x134/0x165 <- chrdev_open+0x12b/0x165
1290[57848.106019] 0 open+0x0/0x14 [oops] <- chrdev_open+0x144/0x165
1291[57848.106019] 0 page_fault+0x0/0x30 <- open+0x6/0x14 [oops]
1292[57848.106019] 0 error_entry+0x0/0x5b <- page_fault+0x4/0x30
1293[57848.106019] 0 error_kernelspace+0x0/0x31 <- error_entry+0x59/0x5b
1294[57848.106019] 0 error_sti+0x0/0x1 <- error_kernelspace+0x2d/0x31
1295[57848.106019] 0 page_fault+0x9/0x30 <- error_sti+0x0/0x1
1296[57848.106019] 0 do_page_fault+0x0/0x881 <- page_fault+0x1a/0x30
1297[...]
1298[57848.106019] 0 do_page_fault+0x66b/0x881 <- is_prefetch+0x1ee/0x1f2
1299[57848.106019] 0 do_page_fault+0x6e0/0x881 <- do_page_fault+0x67a/0x881
1300[57848.106019] 0 oops_begin+0x0/0x96 <- do_page_fault+0x6e0/0x881
1301[57848.106019] 0 trace_hw_branch_oops+0x0/0x2d <- oops_begin+0x9/0x96
1302[...]
1303[57848.106019] 0 ds_suspend_bts+0x2a/0xe3 <- ds_suspend_bts+0x1a/0xe3
1304[57848.106019] ---------------------------------
1305[57848.106019] CPU 0
1306[57848.106019] Modules linked in: oops
1307[57848.106019] Pid: 5542, comm: cat Tainted: G W 2.6.28 #23
1308[57848.106019] RIP: 0010:[<ffffffffa0000006>] [<ffffffffa0000006>] open+0x6/0x14 [oops]
1309[57848.106019] RSP: 0018:ffff880235457d48 EFLAGS: 00010246
1310[...]
1311
1312
1313function graph tracer
1314---------------------------
1315
1316This tracer is similar to the function tracer except that it
1317probes a function on its entry and its exit. This is done by
1318using a dynamically allocated stack of return addresses in each
1319task_struct. On function entry the tracer overwrites the return
1320address of each function traced to set a custom probe. Thus the
1321original return address is stored on the stack of return address
1322in the task_struct.
1323
1324Probing on both ends of a function leads to special features
1325such as:
1326
1327- measure of a function's time execution
1328- having a reliable call stack to draw function calls graph
1329
1330This tracer is useful in several situations:
1331
1332- you want to find the reason of a strange kernel behavior and
1333 need to see what happens in detail on any areas (or specific
1334 ones).
1335
1336- you are experiencing weird latencies but it's difficult to
1337 find its origin.
1338
1339- you want to find quickly which path is taken by a specific
1340 function
1341
1342- you just want to peek inside a working kernel and want to see
1343 what happens there.
1344
1345# tracer: function_graph
1346#
1347# CPU DURATION FUNCTION CALLS
1348# | | | | | | |
1349
1350 0) | sys_open() {
1351 0) | do_sys_open() {
1352 0) | getname() {
1353 0) | kmem_cache_alloc() {
1354 0) 1.382 us | __might_sleep();
1355 0) 2.478 us | }
1356 0) | strncpy_from_user() {
1357 0) | might_fault() {
1358 0) 1.389 us | __might_sleep();
1359 0) 2.553 us | }
1360 0) 3.807 us | }
1361 0) 7.876 us | }
1362 0) | alloc_fd() {
1363 0) 0.668 us | _spin_lock();
1364 0) 0.570 us | expand_files();
1365 0) 0.586 us | _spin_unlock();
1366
1367
1368There are several columns that can be dynamically
1369enabled/disabled. You can use every combination of options you
1370want, depending on your needs.
1371
1372- The cpu number on which the function executed is default
1373 enabled. It is sometimes better to only trace one cpu (see
1374 tracing_cpu_mask file) or you might sometimes see unordered
1375 function calls while cpu tracing switch.
1376
1377 hide: echo nofuncgraph-cpu > /debug/tracing/trace_options
1378 show: echo funcgraph-cpu > /debug/tracing/trace_options
1379
1380- The duration (function's time of execution) is displayed on
1381 the closing bracket line of a function or on the same line
1382 than the current function in case of a leaf one. It is default
1383 enabled.
1384
1385 hide: echo nofuncgraph-duration > /debug/tracing/trace_options
1386 show: echo funcgraph-duration > /debug/tracing/trace_options
1387
1388- The overhead field precedes the duration field in case of
1389 reached duration thresholds.
1390
1391 hide: echo nofuncgraph-overhead > /debug/tracing/trace_options
1392 show: echo funcgraph-overhead > /debug/tracing/trace_options
1393 depends on: funcgraph-duration
1394
1395 ie:
1396
1397 0) | up_write() {
1398 0) 0.646 us | _spin_lock_irqsave();
1399 0) 0.684 us | _spin_unlock_irqrestore();
1400 0) 3.123 us | }
1401 0) 0.548 us | fput();
1402 0) + 58.628 us | }
1403
1404 [...]
1405
1406 0) | putname() {
1407 0) | kmem_cache_free() {
1408 0) 0.518 us | __phys_addr();
1409 0) 1.757 us | }
1410 0) 2.861 us | }
1411 0) ! 115.305 us | }
1412 0) ! 116.402 us | }
1413
1414 + means that the function exceeded 10 usecs.
1415 ! means that the function exceeded 100 usecs.
1416
1417
1418- The task/pid field displays the thread cmdline and pid which
1419 executed the function. It is default disabled.
1420
1421 hide: echo nofuncgraph-proc > /debug/tracing/trace_options
1422 show: echo funcgraph-proc > /debug/tracing/trace_options
1423
1424 ie:
1425
1426 # tracer: function_graph
1427 #
1428 # CPU TASK/PID DURATION FUNCTION CALLS
1429 # | | | | | | | | |
1430 0) sh-4802 | | d_free() {
1431 0) sh-4802 | | call_rcu() {
1432 0) sh-4802 | | __call_rcu() {
1433 0) sh-4802 | 0.616 us | rcu_process_gp_end();
1434 0) sh-4802 | 0.586 us | check_for_new_grace_period();
1435 0) sh-4802 | 2.899 us | }
1436 0) sh-4802 | 4.040 us | }
1437 0) sh-4802 | 5.151 us | }
1438 0) sh-4802 | + 49.370 us | }
1439
1440
1441- The absolute time field is an absolute timestamp given by the
1442 system clock since it started. A snapshot of this time is
1443 given on each entry/exit of functions
1444
1445 hide: echo nofuncgraph-abstime > /debug/tracing/trace_options
1446 show: echo funcgraph-abstime > /debug/tracing/trace_options
1447
1448 ie:
1449
1450 #
1451 # TIME CPU DURATION FUNCTION CALLS
1452 # | | | | | | | |
1453 360.774522 | 1) 0.541 us | }
1454 360.774522 | 1) 4.663 us | }
1455 360.774523 | 1) 0.541 us | __wake_up_bit();
1456 360.774524 | 1) 6.796 us | }
1457 360.774524 | 1) 7.952 us | }
1458 360.774525 | 1) 9.063 us | }
1459 360.774525 | 1) 0.615 us | journal_mark_dirty();
1460 360.774527 | 1) 0.578 us | __brelse();
1461 360.774528 | 1) | reiserfs_prepare_for_journal() {
1462 360.774528 | 1) | unlock_buffer() {
1463 360.774529 | 1) | wake_up_bit() {
1464 360.774529 | 1) | bit_waitqueue() {
1465 360.774530 | 1) 0.594 us | __phys_addr();
1466
1467
1468You can put some comments on specific functions by using
1469trace_printk() For example, if you want to put a comment inside
1470the __might_sleep() function, you just have to include
1471<linux/ftrace.h> and call trace_printk() inside __might_sleep()
1472
1473trace_printk("I'm a comment!\n")
1474
1475will produce:
1476
1477 1) | __might_sleep() {
1478 1) | /* I'm a comment! */
1479 1) 1.449 us | }
1480
1481
1482You might find other useful features for this tracer in the
1483following "dynamic ftrace" section such as tracing only specific
1484functions or tasks.
1485
1155dynamic ftrace 1486dynamic ftrace
1156-------------- 1487--------------
1157 1488
1158If CONFIG_DYNAMIC_FTRACE is set, the system will run with 1489If CONFIG_DYNAMIC_FTRACE is set, the system will run with
1159virtually no overhead when function tracing is disabled. The way 1490virtually no overhead when function tracing is disabled. The way
1160this works is the mcount function call (placed at the start of 1491this works is the mcount function call (placed at the start of
1161every kernel function, produced by the -pg switch in gcc), starts 1492every kernel function, produced by the -pg switch in gcc),
1162of pointing to a simple return. (Enabling FTRACE will include the 1493starts of pointing to a simple return. (Enabling FTRACE will
1163-pg switch in the compiling of the kernel.) 1494include the -pg switch in the compiling of the kernel.)
1164 1495
1165At compile time every C file object is run through the 1496At compile time every C file object is run through the
1166recordmcount.pl script (located in the scripts directory). This 1497recordmcount.pl script (located in the scripts directory). This
1167script will process the C object using objdump to find all the 1498script will process the C object using objdump to find all the
1168locations in the .text section that call mcount. (Note, only 1499locations in the .text section that call mcount. (Note, only the
1169the .text section is processed, since processing other sections 1500.text section is processed, since processing other sections like
1170like .init.text may cause races due to those sections being freed). 1501.init.text may cause races due to those sections being freed).
1171 1502
1172A new section called "__mcount_loc" is created that holds references 1503A new section called "__mcount_loc" is created that holds
1173to all the mcount call sites in the .text section. This section is 1504references to all the mcount call sites in the .text section.
1174compiled back into the original object. The final linker will add 1505This section is compiled back into the original object. The
1175all these references into a single table. 1506final linker will add all these references into a single table.
1176 1507
1177On boot up, before SMP is initialized, the dynamic ftrace code 1508On boot up, before SMP is initialized, the dynamic ftrace code
1178scans this table and updates all the locations into nops. It also 1509scans this table and updates all the locations into nops. It
1179records the locations, which are added to the available_filter_functions 1510also records the locations, which are added to the
1180list. Modules are processed as they are loaded and before they are 1511available_filter_functions list. Modules are processed as they
1181executed. When a module is unloaded, it also removes its functions from 1512are loaded and before they are executed. When a module is
1182the ftrace function list. This is automatic in the module unload 1513unloaded, it also removes its functions from the ftrace function
1183code, and the module author does not need to worry about it. 1514list. This is automatic in the module unload code, and the
1184 1515module author does not need to worry about it.
1185When tracing is enabled, kstop_machine is called to prevent races 1516
1186with the CPUS executing code being modified (which can cause the 1517When tracing is enabled, kstop_machine is called to prevent
1187CPU to do undesireable things), and the nops are patched back 1518races with the CPUS executing code being modified (which can
1188to calls. But this time, they do not call mcount (which is just 1519cause the CPU to do undesireable things), and the nops are
1189a function stub). They now call into the ftrace infrastructure. 1520patched back to calls. But this time, they do not call mcount
1521(which is just a function stub). They now call into the ftrace
1522infrastructure.
1190 1523
1191One special side-effect to the recording of the functions being 1524One special side-effect to the recording of the functions being
1192traced is that we can now selectively choose which functions we 1525traced is that we can now selectively choose which functions we
1193wish to trace and which ones we want the mcount calls to remain as 1526wish to trace and which ones we want the mcount calls to remain
1194nops. 1527as nops.
1195 1528
1196Two files are used, one for enabling and one for disabling the tracing 1529Two files are used, one for enabling and one for disabling the
1197of specified functions. They are: 1530tracing of specified functions. They are:
1198 1531
1199 set_ftrace_filter 1532 set_ftrace_filter
1200 1533
@@ -1202,8 +1535,8 @@ and
1202 1535
1203 set_ftrace_notrace 1536 set_ftrace_notrace
1204 1537
1205A list of available functions that you can add to these files is listed 1538A list of available functions that you can add to these files is
1206in: 1539listed in:
1207 1540
1208 available_filter_functions 1541 available_filter_functions
1209 1542
@@ -1240,8 +1573,8 @@ hrtimer_interrupt
1240sys_nanosleep 1573sys_nanosleep
1241 1574
1242 1575
1243Perhaps this is not enough. The filters also allow simple wild cards. 1576Perhaps this is not enough. The filters also allow simple wild
1244Only the following are currently available 1577cards. Only the following are currently available
1245 1578
1246 <match>* - will match functions that begin with <match> 1579 <match>* - will match functions that begin with <match>
1247 *<match> - will match functions that end with <match> 1580 *<match> - will match functions that end with <match>
@@ -1251,9 +1584,9 @@ These are the only wild cards which are supported.
1251 1584
1252 <match>*<match> will not work. 1585 <match>*<match> will not work.
1253 1586
1254Note: It is better to use quotes to enclose the wild cards, otherwise 1587Note: It is better to use quotes to enclose the wild cards,
1255 the shell may expand the parameters into names of files in the local 1588 otherwise the shell may expand the parameters into names
1256 directory. 1589 of files in the local directory.
1257 1590
1258 # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter 1591 # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter
1259 1592
@@ -1299,7 +1632,8 @@ This is because the '>' and '>>' act just like they do in bash.
1299To rewrite the filters, use '>' 1632To rewrite the filters, use '>'
1300To append to the filters, use '>>' 1633To append to the filters, use '>>'
1301 1634
1302To clear out a filter so that all functions will be recorded again: 1635To clear out a filter so that all functions will be recorded
1636again:
1303 1637
1304 # echo > /debug/tracing/set_ftrace_filter 1638 # echo > /debug/tracing/set_ftrace_filter
1305 # cat /debug/tracing/set_ftrace_filter 1639 # cat /debug/tracing/set_ftrace_filter
@@ -1331,7 +1665,8 @@ hrtimer_get_res
1331hrtimer_init_sleeper 1665hrtimer_init_sleeper
1332 1666
1333 1667
1334The set_ftrace_notrace prevents those functions from being traced. 1668The set_ftrace_notrace prevents those functions from being
1669traced.
1335 1670
1336 # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace 1671 # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace
1337 1672
@@ -1353,13 +1688,75 @@ Produces:
1353 1688
1354We can see that there's no more lock or preempt tracing. 1689We can see that there's no more lock or preempt tracing.
1355 1690
1691
1692Dynamic ftrace with the function graph tracer
1693---------------------------------------------
1694
1695Although what has been explained above concerns both the
1696function tracer and the function-graph-tracer, there are some
1697special features only available in the function-graph tracer.
1698
1699If you want to trace only one function and all of its children,
1700you just have to echo its name into set_graph_function:
1701
1702 echo __do_fault > set_graph_function
1703
1704will produce the following "expanded" trace of the __do_fault()
1705function:
1706
1707 0) | __do_fault() {
1708 0) | filemap_fault() {
1709 0) | find_lock_page() {
1710 0) 0.804 us | find_get_page();
1711 0) | __might_sleep() {
1712 0) 1.329 us | }
1713 0) 3.904 us | }
1714 0) 4.979 us | }
1715 0) 0.653 us | _spin_lock();
1716 0) 0.578 us | page_add_file_rmap();
1717 0) 0.525 us | native_set_pte_at();
1718 0) 0.585 us | _spin_unlock();
1719 0) | unlock_page() {
1720 0) 0.541 us | page_waitqueue();
1721 0) 0.639 us | __wake_up_bit();
1722 0) 2.786 us | }
1723 0) + 14.237 us | }
1724 0) | __do_fault() {
1725 0) | filemap_fault() {
1726 0) | find_lock_page() {
1727 0) 0.698 us | find_get_page();
1728 0) | __might_sleep() {
1729 0) 1.412 us | }
1730 0) 3.950 us | }
1731 0) 5.098 us | }
1732 0) 0.631 us | _spin_lock();
1733 0) 0.571 us | page_add_file_rmap();
1734 0) 0.526 us | native_set_pte_at();
1735 0) 0.586 us | _spin_unlock();
1736 0) | unlock_page() {
1737 0) 0.533 us | page_waitqueue();
1738 0) 0.638 us | __wake_up_bit();
1739 0) 2.793 us | }
1740 0) + 14.012 us | }
1741
1742You can also expand several functions at once:
1743
1744 echo sys_open > set_graph_function
1745 echo sys_close >> set_graph_function
1746
1747Now if you want to go back to trace all functions you can clear
1748this special filter via:
1749
1750 echo > set_graph_function
1751
1752
1356trace_pipe 1753trace_pipe
1357---------- 1754----------
1358 1755
1359The trace_pipe outputs the same content as the trace file, but the effect 1756The trace_pipe outputs the same content as the trace file, but
1360on the tracing is different. Every read from trace_pipe is consumed. 1757the effect on the tracing is different. Every read from
1361This means that subsequent reads will be different. The trace 1758trace_pipe is consumed. This means that subsequent reads will be
1362is live. 1759different. The trace is live.
1363 1760
1364 # echo function > /debug/tracing/current_tracer 1761 # echo function > /debug/tracing/current_tracer
1365 # cat /debug/tracing/trace_pipe > /tmp/trace.out & 1762 # cat /debug/tracing/trace_pipe > /tmp/trace.out &
@@ -1387,38 +1784,45 @@ is live.
1387 bash-4043 [00] 41.267111: select_task_rq_rt <-try_to_wake_up 1784 bash-4043 [00] 41.267111: select_task_rq_rt <-try_to_wake_up
1388 1785
1389 1786
1390Note, reading the trace_pipe file will block until more input is added. 1787Note, reading the trace_pipe file will block until more input is
1391By changing the tracer, trace_pipe will issue an EOF. We needed 1788added. By changing the tracer, trace_pipe will issue an EOF. We
1392to set the function tracer _before_ we "cat" the trace_pipe file. 1789needed to set the function tracer _before_ we "cat" the
1790trace_pipe file.
1393 1791
1394 1792
1395trace entries 1793trace entries
1396------------- 1794-------------
1397 1795
1398Having too much or not enough data can be troublesome in diagnosing 1796Having too much or not enough data can be troublesome in
1399an issue in the kernel. The file buffer_size_kb is used to modify 1797diagnosing an issue in the kernel. The file buffer_size_kb is
1400the size of the internal trace buffers. The number listed 1798used to modify the size of the internal trace buffers. The
1401is the number of entries that can be recorded per CPU. To know 1799number listed is the number of entries that can be recorded per
1402the full size, multiply the number of possible CPUS with the 1800CPU. To know the full size, multiply the number of possible CPUS
1403number of entries. 1801with the number of entries.
1404 1802
1405 # cat /debug/tracing/buffer_size_kb 1803 # cat /debug/tracing/buffer_size_kb
14061408 (units kilobytes) 18041408 (units kilobytes)
1407 1805
1408Note, to modify this, you must have tracing completely disabled. To do that, 1806Note, to modify this, you must have tracing completely disabled.
1409echo "nop" into the current_tracer. If the current_tracer is not set 1807To do that, echo "nop" into the current_tracer. If the
1410to "nop", an EINVAL error will be returned. 1808current_tracer is not set to "nop", an EINVAL error will be
1809returned.
1411 1810
1412 # echo nop > /debug/tracing/current_tracer 1811 # echo nop > /debug/tracing/current_tracer
1413 # echo 10000 > /debug/tracing/buffer_size_kb 1812 # echo 10000 > /debug/tracing/buffer_size_kb
1414 # cat /debug/tracing/buffer_size_kb 1813 # cat /debug/tracing/buffer_size_kb
141510000 (units kilobytes) 181410000 (units kilobytes)
1416 1815
1417The number of pages which will be allocated is limited to a percentage 1816The number of pages which will be allocated is limited to a
1418of available memory. Allocating too much will produce an error. 1817percentage of available memory. Allocating too much will produce
1818an error.
1419 1819
1420 # echo 1000000000000 > /debug/tracing/buffer_size_kb 1820 # echo 1000000000000 > /debug/tracing/buffer_size_kb
1421-bash: echo: write error: Cannot allocate memory 1821-bash: echo: write error: Cannot allocate memory
1422 # cat /debug/tracing/buffer_size_kb 1822 # cat /debug/tracing/buffer_size_kb
142385 182385
1424 1824
1825-----------
1826
1827More details can be found in the source code, in the
1828kernel/tracing/*.c files.
diff --git a/Documentation/trace/kmemtrace.txt b/Documentation/trace/kmemtrace.txt
new file mode 100644
index 000000000000..a956d9b7f943
--- /dev/null
+++ b/Documentation/trace/kmemtrace.txt
@@ -0,0 +1,126 @@
1 kmemtrace - Kernel Memory Tracer
2
3 by Eduard - Gabriel Munteanu
4 <eduard.munteanu@linux360.ro>
5
6I. Introduction
7===============
8
9kmemtrace helps kernel developers figure out two things:
101) how different allocators (SLAB, SLUB etc.) perform
112) how kernel code allocates memory and how much
12
13To do this, we trace every allocation and export information to the userspace
14through the relay interface. We export things such as the number of requested
15bytes, the number of bytes actually allocated (i.e. including internal
16fragmentation), whether this is a slab allocation or a plain kmalloc() and so
17on.
18
19The actual analysis is performed by a userspace tool (see section III for
20details on where to get it from). It logs the data exported by the kernel,
21processes it and (as of writing this) can provide the following information:
22- the total amount of memory allocated and fragmentation per call-site
23- the amount of memory allocated and fragmentation per allocation
24- total memory allocated and fragmentation in the collected dataset
25- number of cross-CPU allocation and frees (makes sense in NUMA environments)
26
27Moreover, it can potentially find inconsistent and erroneous behavior in
28kernel code, such as using slab free functions on kmalloc'ed memory or
29allocating less memory than requested (but not truly failed allocations).
30
31kmemtrace also makes provisions for tracing on some arch and analysing the
32data on another.
33
34II. Design and goals
35====================
36
37kmemtrace was designed to handle rather large amounts of data. Thus, it uses
38the relay interface to export whatever is logged to userspace, which then
39stores it. Analysis and reporting is done asynchronously, that is, after the
40data is collected and stored. By design, it allows one to log and analyse
41on different machines and different arches.
42
43As of writing this, the ABI is not considered stable, though it might not
44change much. However, no guarantees are made about compatibility yet. When
45deemed stable, the ABI should still allow easy extension while maintaining
46backward compatibility. This is described further in Documentation/ABI.
47
48Summary of design goals:
49 - allow logging and analysis to be done across different machines
50 - be fast and anticipate usage in high-load environments (*)
51 - be reasonably extensible
52 - make it possible for GNU/Linux distributions to have kmemtrace
53 included in their repositories
54
55(*) - one of the reasons Pekka Enberg's original userspace data analysis
56 tool's code was rewritten from Perl to C (although this is more than a
57 simple conversion)
58
59
60III. Quick usage guide
61======================
62
631) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable
64CONFIG_KMEMTRACE).
65
662) Get the userspace tool and build it:
67$ git-clone git://repo.or.cz/kmemtrace-user.git # current repository
68$ cd kmemtrace-user/
69$ ./autogen.sh
70$ ./configure
71$ make
72
733) Boot the kmemtrace-enabled kernel if you haven't, preferably in the
74'single' runlevel (so that relay buffers don't fill up easily), and run
75kmemtrace:
76# '$' does not mean user, but root here.
77$ mount -t debugfs none /sys/kernel/debug
78$ mount -t proc none /proc
79$ cd path/to/kmemtrace-user/
80$ ./kmemtraced
81Wait a bit, then stop it with CTRL+C.
82$ cat /sys/kernel/debug/kmemtrace/total_overruns # Check if we didn't
83 # overrun, should
84 # be zero.
85$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to
86 check its correctness]
87$ ./kmemtrace-report
88
89Now you should have a nice and short summary of how the allocator performs.
90
91IV. FAQ and known issues
92========================
93
94Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix
95this? Should I worry?
96A: If it's non-zero, this affects kmemtrace's accuracy, depending on how
97large the number is. You can fix it by supplying a higher
98'kmemtrace.subbufs=N' kernel parameter.
99---
100
101Q: kmemtrace_check reports errors, how do I fix this? Should I worry?
102A: This is a bug and should be reported. It can occur for a variety of
103reasons:
104 - possible bugs in relay code
105 - possible misuse of relay by kmemtrace
106 - timestamps being collected unorderly
107Or you may fix it yourself and send us a patch.
108---
109
110Q: kmemtrace_report shows many errors, how do I fix this? Should I worry?
111A: This is a known issue and I'm working on it. These might be true errors
112in kernel code, which may have inconsistent behavior (e.g. allocating memory
113with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed
114out this behavior may work with SLAB, but may fail with other allocators.
115
116It may also be due to lack of tracing in some unusual allocator functions.
117
118We don't want bug reports regarding this issue yet.
119---
120
121V. See also
122===========
123
124Documentation/kernel-parameters.txt
125Documentation/ABI/testing/debugfs-kmemtrace
126
diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/trace/mmiotrace.txt
index 5731c67abc55..5731c67abc55 100644
--- a/Documentation/tracers/mmiotrace.txt
+++ b/Documentation/trace/mmiotrace.txt
diff --git a/Documentation/tracepoints.txt b/Documentation/trace/tracepoints.txt
index 6f0a044f5b5e..c0e1ceed75a4 100644
--- a/Documentation/tracepoints.txt
+++ b/Documentation/trace/tracepoints.txt
@@ -45,8 +45,8 @@ In include/trace/subsys.h :
45#include <linux/tracepoint.h> 45#include <linux/tracepoint.h>
46 46
47DECLARE_TRACE(subsys_eventname, 47DECLARE_TRACE(subsys_eventname,
48 TPPROTO(int firstarg, struct task_struct *p), 48 TP_PROTO(int firstarg, struct task_struct *p),
49 TPARGS(firstarg, p)); 49 TP_ARGS(firstarg, p));
50 50
51In subsys/file.c (where the tracing statement must be added) : 51In subsys/file.c (where the tracing statement must be added) :
52 52
@@ -66,10 +66,10 @@ Where :
66 - subsys is the name of your subsystem. 66 - subsys is the name of your subsystem.
67 - eventname is the name of the event to trace. 67 - eventname is the name of the event to trace.
68 68
69- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the 69- TP_PROTO(int firstarg, struct task_struct *p) is the prototype of the
70 function called by this tracepoint. 70 function called by this tracepoint.
71 71
72- TPARGS(firstarg, p) are the parameters names, same as found in the 72- TP_ARGS(firstarg, p) are the parameters names, same as found in the
73 prototype. 73 prototype.
74 74
75Connecting a function (probe) to a tracepoint is done by providing a 75Connecting a function (probe) to a tracepoint is done by providing a
@@ -103,13 +103,14 @@ used to export the defined tracepoints.
103 103
104* Probe / tracepoint example 104* Probe / tracepoint example
105 105
106See the example provided in samples/tracepoints/src 106See the example provided in samples/tracepoints
107 107
108Compile them with your kernel. 108Compile them with your kernel. They are built during 'make' (not
109'make modules') when CONFIG_SAMPLE_TRACEPOINTS=m.
109 110
110Run, as root : 111Run, as root :
111modprobe tracepoint-example (insmod order is not important) 112modprobe tracepoint-sample (insmod order is not important)
112modprobe tracepoint-probe-example 113modprobe tracepoint-probe-sample
113cat /proc/tracepoint-example (returns an expected error) 114cat /proc/tracepoint-sample (returns an expected error)
114rmmod tracepoint-example tracepoint-probe-example 115rmmod tracepoint-sample tracepoint-probe-sample
115dmesg 116dmesg
diff --git a/Documentation/usb/usbmon.txt b/Documentation/usb/usbmon.txt
index 270481906dc8..6c3c625b7f30 100644
--- a/Documentation/usb/usbmon.txt
+++ b/Documentation/usb/usbmon.txt
@@ -229,16 +229,26 @@ struct usbmon_packet {
229 int status; /* 28: */ 229 int status; /* 28: */
230 unsigned int length; /* 32: Length of data (submitted or actual) */ 230 unsigned int length; /* 32: Length of data (submitted or actual) */
231 unsigned int len_cap; /* 36: Delivered length */ 231 unsigned int len_cap; /* 36: Delivered length */
232 unsigned char setup[8]; /* 40: Only for Control 'S' */ 232 union { /* 40: */
233}; /* 48 bytes total */ 233 unsigned char setup[SETUP_LEN]; /* Only for Control S-type */
234 struct iso_rec { /* Only for ISO */
235 int error_count;
236 int numdesc;
237 } iso;
238 } s;
239 int interval; /* 48: Only for Interrupt and ISO */
240 int start_frame; /* 52: For ISO */
241 unsigned int xfer_flags; /* 56: copy of URB's transfer_flags */
242 unsigned int ndesc; /* 60: Actual number of ISO descriptors */
243}; /* 64 total length */
234 244
235These events can be received from a character device by reading with read(2), 245These events can be received from a character device by reading with read(2),
236with an ioctl(2), or by accessing the buffer with mmap. 246with an ioctl(2), or by accessing the buffer with mmap. However, read(2)
247only returns first 48 bytes for compatibility reasons.
237 248
238The character device is usually called /dev/usbmonN, where N is the USB bus 249The character device is usually called /dev/usbmonN, where N is the USB bus
239number. Number zero (/dev/usbmon0) is special and means "all buses". 250number. Number zero (/dev/usbmon0) is special and means "all buses".
240However, this feature is not implemented yet. Note that specific naming 251Note that specific naming policy is set by your Linux distribution.
241policy is set by your Linux distribution.
242 252
243If you create /dev/usbmon0 by hand, make sure that it is owned by root 253If you create /dev/usbmon0 by hand, make sure that it is owned by root
244and has mode 0600. Otherwise, unpriviledged users will be able to snoop 254and has mode 0600. Otherwise, unpriviledged users will be able to snoop
@@ -279,9 +289,10 @@ size is out of [unspecified] bounds for this kernel, the call fails with
279This call returns the current size of the buffer in bytes. 289This call returns the current size of the buffer in bytes.
280 290
281 MON_IOCX_GET, defined as _IOW(MON_IOC_MAGIC, 6, struct mon_get_arg) 291 MON_IOCX_GET, defined as _IOW(MON_IOC_MAGIC, 6, struct mon_get_arg)
292 MON_IOCX_GETX, defined as _IOW(MON_IOC_MAGIC, 10, struct mon_get_arg)
282 293
283This call waits for events to arrive if none were in the kernel buffer, 294These calls wait for events to arrive if none were in the kernel buffer,
284then returns the first event. Its argument is a pointer to the following 295then return the first event. The argument is a pointer to the following
285structure: 296structure:
286 297
287struct mon_get_arg { 298struct mon_get_arg {
@@ -294,6 +305,8 @@ Before the call, hdr, data, and alloc should be filled. Upon return, the area
294pointed by hdr contains the next event structure, and the data buffer contains 305pointed by hdr contains the next event structure, and the data buffer contains
295the data, if any. The event is removed from the kernel buffer. 306the data, if any. The event is removed from the kernel buffer.
296 307
308The MON_IOCX_GET copies 48 bytes, MON_IOCX_GETX copies 64 bytes.
309
297 MON_IOCX_MFETCH, defined as _IOWR(MON_IOC_MAGIC, 7, struct mon_mfetch_arg) 310 MON_IOCX_MFETCH, defined as _IOWR(MON_IOC_MAGIC, 7, struct mon_mfetch_arg)
298 311
299This ioctl is primarily used when the application accesses the buffer 312This ioctl is primarily used when the application accesses the buffer
diff --git a/Documentation/video4linux/CARDLIST.bttv b/Documentation/video4linux/CARDLIST.bttv
index 0d93fa1ac25e..f11c583295e9 100644
--- a/Documentation/video4linux/CARDLIST.bttv
+++ b/Documentation/video4linux/CARDLIST.bttv
@@ -135,7 +135,7 @@
135134 -> Adlink RTV24 135134 -> Adlink RTV24
136135 -> DViCO FusionHDTV 5 Lite [18ac:d500] 136135 -> DViCO FusionHDTV 5 Lite [18ac:d500]
137136 -> Acorp Y878F [9511:1540] 137136 -> Acorp Y878F [9511:1540]
138137 -> Conceptronic CTVFMi v2 138137 -> Conceptronic CTVFMi v2 [036e:109e]
139138 -> Prolink Pixelview PV-BT878P+ (Rev.2E) 139138 -> Prolink Pixelview PV-BT878P+ (Rev.2E)
140139 -> Prolink PixelView PlayTV MPEG2 PV-M4900 140139 -> Prolink PixelView PlayTV MPEG2 PV-M4900
141140 -> Osprey 440 [0070:ff07] 141140 -> Osprey 440 [0070:ff07]
@@ -154,3 +154,7 @@
154153 -> PHYTEC VD-012 (bt878) 154153 -> PHYTEC VD-012 (bt878)
155154 -> PHYTEC VD-012-X1 (bt878) 155154 -> PHYTEC VD-012-X1 (bt878)
156155 -> PHYTEC VD-012-X2 (bt878) 156155 -> PHYTEC VD-012-X2 (bt878)
157156 -> IVCE-8784 [0000:f050,0001:f050,0002:f050,0003:f050]
158157 -> Geovision GV-800(S) (master) [800a:763d]
159158 -> Geovision GV-800(S) (slave) [800b:763d,800c:763d,800d:763d]
160159 -> ProVideo PV183 [1830:1540,1831:1540,1832:1540,1833:1540,1834:1540,1835:1540,1836:1540,1837:1540]
diff --git a/Documentation/video4linux/CARDLIST.cx23885 b/Documentation/video4linux/CARDLIST.cx23885
index 35ea130e9898..91aa3c0f0dd2 100644
--- a/Documentation/video4linux/CARDLIST.cx23885
+++ b/Documentation/video4linux/CARDLIST.cx23885
@@ -12,3 +12,7 @@
12 11 -> DViCO FusionHDTV DVB-T Dual Express [18ac:db78] 12 11 -> DViCO FusionHDTV DVB-T Dual Express [18ac:db78]
13 12 -> Leadtek Winfast PxDVR3200 H [107d:6681] 13 12 -> Leadtek Winfast PxDVR3200 H [107d:6681]
14 13 -> Compro VideoMate E650F [185b:e800] 14 13 -> Compro VideoMate E650F [185b:e800]
15 14 -> TurboSight TBS 6920 [6920:8888]
16 15 -> TeVii S470 [d470:9022]
17 16 -> DVBWorld DVB-S2 2005 [0001:2005]
18 17 -> NetUP Dual DVB-S2 CI [1b55:2a2c]
diff --git a/Documentation/video4linux/CARDLIST.cx88 b/Documentation/video4linux/CARDLIST.cx88
index 0d08f1edcf6d..71e9db0b26f7 100644
--- a/Documentation/video4linux/CARDLIST.cx88
+++ b/Documentation/video4linux/CARDLIST.cx88
@@ -77,3 +77,4 @@
77 76 -> SATTRADE ST4200 DVB-S/S2 [b200:4200] 77 76 -> SATTRADE ST4200 DVB-S/S2 [b200:4200]
78 77 -> TBS 8910 DVB-S [8910:8888] 78 77 -> TBS 8910 DVB-S [8910:8888]
79 78 -> Prof 6200 DVB-S [b022:3022] 79 78 -> Prof 6200 DVB-S [b022:3022]
80 79 -> Terratec Cinergy HT PCI MKII [153b:1177]
diff --git a/Documentation/video4linux/CARDLIST.em28xx b/Documentation/video4linux/CARDLIST.em28xx
index 75bded8a4aa2..78d0a6eed571 100644
--- a/Documentation/video4linux/CARDLIST.em28xx
+++ b/Documentation/video4linux/CARDLIST.em28xx
@@ -7,12 +7,12 @@
7 6 -> Terratec Cinergy 200 USB (em2800) 7 6 -> Terratec Cinergy 200 USB (em2800)
8 7 -> Leadtek Winfast USB II (em2800) [0413:6023] 8 7 -> Leadtek Winfast USB II (em2800) [0413:6023]
9 8 -> Kworld USB2800 (em2800) 9 8 -> Kworld USB2800 (em2800)
10 9 -> Pinnacle Dazzle DVC 90/DVC 100 (em2820/em2840) [2304:0207,2304:021a] 10 9 -> Pinnacle Dazzle DVC 90/100/101/107 / Kaiser Baas Video to DVD maker (em2820/em2840) [1b80:e302,2304:0207,2304:021a]
11 10 -> Hauppauge WinTV HVR 900 (em2880) [2040:6500] 11 10 -> Hauppauge WinTV HVR 900 (em2880) [2040:6500]
12 11 -> Terratec Hybrid XS (em2880) [0ccd:0042] 12 11 -> Terratec Hybrid XS (em2880) [0ccd:0042]
13 12 -> Kworld PVR TV 2800 RF (em2820/em2840) 13 12 -> Kworld PVR TV 2800 RF (em2820/em2840)
14 13 -> Terratec Prodigy XS (em2880) [0ccd:0047] 14 13 -> Terratec Prodigy XS (em2880) [0ccd:0047]
15 14 -> Pixelview Prolink PlayTV USB 2.0 (em2820/em2840) 15 14 -> SIIG AVTuner-PVR / Pixelview Prolink PlayTV USB 2.0 (em2820/em2840)
16 15 -> V-Gear PocketTV (em2800) 16 15 -> V-Gear PocketTV (em2800)
17 16 -> Hauppauge WinTV HVR 950 (em2883) [2040:6513,2040:6517,2040:651b] 17 16 -> Hauppauge WinTV HVR 950 (em2883) [2040:6513,2040:6517,2040:651b]
18 17 -> Pinnacle PCTV HD Pro Stick (em2880) [2304:0227] 18 17 -> Pinnacle PCTV HD Pro Stick (em2880) [2304:0227]
@@ -30,7 +30,6 @@
30 30 -> Videology 20K14XUSB USB2.0 (em2820/em2840) 30 30 -> Videology 20K14XUSB USB2.0 (em2820/em2840)
31 31 -> Usbgear VD204v9 (em2821) 31 31 -> Usbgear VD204v9 (em2821)
32 32 -> Supercomp USB 2.0 TV (em2821) 32 32 -> Supercomp USB 2.0 TV (em2821)
33 33 -> SIIG AVTuner-PVR/Prolink PlayTV USB 2.0 (em2821)
34 34 -> Terratec Cinergy A Hybrid XS (em2860) [0ccd:004f] 33 34 -> Terratec Cinergy A Hybrid XS (em2860) [0ccd:004f]
35 35 -> Typhoon DVD Maker (em2860) 34 35 -> Typhoon DVD Maker (em2860)
36 36 -> NetGMBH Cam (em2860) 35 36 -> NetGMBH Cam (em2860)
@@ -58,3 +57,7 @@
58 58 -> Compro VideoMate ForYou/Stereo (em2820/em2840) [185b:2041] 57 58 -> Compro VideoMate ForYou/Stereo (em2820/em2840) [185b:2041]
59 60 -> Hauppauge WinTV HVR 850 (em2883) [2040:651f] 58 60 -> Hauppauge WinTV HVR 850 (em2883) [2040:651f]
60 61 -> Pixelview PlayTV Box 4 USB 2.0 (em2820/em2840) 59 61 -> Pixelview PlayTV Box 4 USB 2.0 (em2820/em2840)
60 62 -> Gadmei TVR200 (em2820/em2840)
61 63 -> Kaiomy TVnPC U2 (em2860) [eb1a:e303]
62 64 -> Easy Cap Capture DC-60 (em2860)
63 65 -> IO-DATA GV-MVP/SZ (em2820/em2840) [04bb:0515]
diff --git a/Documentation/video4linux/CARDLIST.saa7134 b/Documentation/video4linux/CARDLIST.saa7134
index b8d470596b0c..6dacf2825259 100644
--- a/Documentation/video4linux/CARDLIST.saa7134
+++ b/Documentation/video4linux/CARDLIST.saa7134
@@ -153,3 +153,5 @@
153152 -> Asus Tiger Rev:1.00 [1043:4857] 153152 -> Asus Tiger Rev:1.00 [1043:4857]
154153 -> Kworld Plus TV Analog Lite PCI [17de:7128] 154153 -> Kworld Plus TV Analog Lite PCI [17de:7128]
155154 -> Avermedia AVerTV GO 007 FM Plus [1461:f31d] 155154 -> Avermedia AVerTV GO 007 FM Plus [1461:f31d]
156155 -> Hauppauge WinTV-HVR1120 ATSC/QAM-Hybrid [0070:6706,0070:6708]
157156 -> Hauppauge WinTV-HVR1110r3 [0070:6707,0070:6709,0070:670a]
diff --git a/Documentation/video4linux/Zoran b/Documentation/video4linux/Zoran
index 295462b2317a..0e89e7676298 100644
--- a/Documentation/video4linux/Zoran
+++ b/Documentation/video4linux/Zoran
@@ -401,8 +401,7 @@ Additional notes for software developers:
401 first set the correct norm. Well, it seems logically correct: TV 401 first set the correct norm. Well, it seems logically correct: TV
402 standard is "more constant" for current country than geometry 402 standard is "more constant" for current country than geometry
403 settings of a variety of TV capture cards which may work in ITU or 403 settings of a variety of TV capture cards which may work in ITU or
404 square pixel format. Remember that users now can lock the norm to 404 square pixel format.
405 avoid any ambiguity.
406-- 405--
407Please note that lavplay/lavrec are also included in the MJPEG-tools 406Please note that lavplay/lavrec are also included in the MJPEG-tools
408(http://mjpeg.sf.net/). 407(http://mjpeg.sf.net/).
diff --git a/Documentation/video4linux/bttv/Insmod-options b/Documentation/video4linux/bttv/Insmod-options
index 5ef75787f83a..bbe3ed667d91 100644
--- a/Documentation/video4linux/bttv/Insmod-options
+++ b/Documentation/video4linux/bttv/Insmod-options
@@ -81,16 +81,6 @@ tuner.o
81 pal=[bdgil] select PAL variant (used for some tuners 81 pal=[bdgil] select PAL variant (used for some tuners
82 only, important for the audio carrier). 82 only, important for the audio carrier).
83 83
84tvmixer.o
85 registers a mixer device for the TV card's volume/bass/treble
86 controls (requires a i2c audio control chip like the msp3400).
87
88 insmod args:
89 debug=1 print some debug info to the syslog.
90 devnr=n allocate device #n (0 == /dev/mixer,
91 1 = /dev/mixer1, ...), default is to
92 use the first free one.
93
94tvaudio.o 84tvaudio.o
95 new, experimental module which is supported to provide a single 85 new, experimental module which is supported to provide a single
96 driver for all simple i2c audio control chips (tda/tea*). 86 driver for all simple i2c audio control chips (tda/tea*).
diff --git a/Documentation/video4linux/bttv/README b/Documentation/video4linux/bttv/README
index 7ca2154c2bf5..3a367cdb664e 100644
--- a/Documentation/video4linux/bttv/README
+++ b/Documentation/video4linux/bttv/README
@@ -63,8 +63,8 @@ If you have some knowledge and spare time, please try to fix this
63yourself (patches very welcome of course...) You know: The linux 63yourself (patches very welcome of course...) You know: The linux
64slogan is "Do it yourself". 64slogan is "Do it yourself".
65 65
66There is a mailing list: video4linux-list@redhat.com. 66There is a mailing list: linux-media@vger.kernel.org
67https://listman.redhat.com/mailman/listinfo/video4linux-list 67http://vger.kernel.org/vger-lists.html#linux-media
68 68
69If you have trouble with some specific TV card, try to ask there 69If you have trouble with some specific TV card, try to ask there
70instead of mailing me directly. The chance that someone with the 70instead of mailing me directly. The chance that someone with the
diff --git a/Documentation/video4linux/cx2341x/README.hm12 b/Documentation/video4linux/cx2341x/README.hm12
index 0e213ed095e6..b36148ea0750 100644
--- a/Documentation/video4linux/cx2341x/README.hm12
+++ b/Documentation/video4linux/cx2341x/README.hm12
@@ -32,6 +32,10 @@ Y, U and V planes. This code assumes frames of 720x576 (PAL) pixels.
32The width of a frame is always 720 pixels, regardless of the actual specified 32The width of a frame is always 720 pixels, regardless of the actual specified
33width. 33width.
34 34
35If the height is not a multiple of 32 lines, then the captured video is
36missing macroblocks at the end and is unusable. So the height must be a
37multiple of 32.
38
35-------------------------------------------------------------------------- 39--------------------------------------------------------------------------
36 40
37#include <stdio.h> 41#include <stdio.h>
diff --git a/Documentation/video4linux/gspca.txt b/Documentation/video4linux/gspca.txt
index 1c58a7630146..98529e03a46e 100644
--- a/Documentation/video4linux/gspca.txt
+++ b/Documentation/video4linux/gspca.txt
@@ -32,6 +32,7 @@ spca561 041e:403b Creative Webcam Vista (VF0010)
32zc3xx 041e:4051 Creative Live!Cam Notebook Pro (VF0250) 32zc3xx 041e:4051 Creative Live!Cam Notebook Pro (VF0250)
33ov519 041e:4052 Creative Live! VISTA IM 33ov519 041e:4052 Creative Live! VISTA IM
34zc3xx 041e:4053 Creative Live!Cam Video IM 34zc3xx 041e:4053 Creative Live!Cam Video IM
35vc032x 041e:405b Creative Live! Cam Notebook Ultra (VC0130)
35ov519 041e:405f Creative Live! VISTA VF0330 36ov519 041e:405f Creative Live! VISTA VF0330
36ov519 041e:4060 Creative Live! VISTA VF0350 37ov519 041e:4060 Creative Live! VISTA VF0350
37ov519 041e:4061 Creative Live! VISTA VF0400 38ov519 041e:4061 Creative Live! VISTA VF0400
@@ -193,6 +194,7 @@ spca500 084d:0003 D-Link DSC-350
193spca500 08ca:0103 Aiptek PocketDV 194spca500 08ca:0103 Aiptek PocketDV
194sunplus 08ca:0104 Aiptek PocketDVII 1.3 195sunplus 08ca:0104 Aiptek PocketDVII 1.3
195sunplus 08ca:0106 Aiptek Pocket DV3100+ 196sunplus 08ca:0106 Aiptek Pocket DV3100+
197mr97310a 08ca:0111 Aiptek PenCam VGA+
196sunplus 08ca:2008 Aiptek Mini PenCam 2 M 198sunplus 08ca:2008 Aiptek Mini PenCam 2 M
197sunplus 08ca:2010 Aiptek PocketCam 3M 199sunplus 08ca:2010 Aiptek PocketCam 3M
198sunplus 08ca:2016 Aiptek PocketCam 2 Mega 200sunplus 08ca:2016 Aiptek PocketCam 2 Mega
@@ -215,6 +217,7 @@ pac207 093a:2468 PAC207
215pac207 093a:2470 Genius GF112 217pac207 093a:2470 Genius GF112
216pac207 093a:2471 Genius VideoCam ge111 218pac207 093a:2471 Genius VideoCam ge111
217pac207 093a:2472 Genius VideoCam ge110 219pac207 093a:2472 Genius VideoCam ge110
220pac207 093a:2474 Genius iLook 111
218pac207 093a:2476 Genius e-Messenger 112 221pac207 093a:2476 Genius e-Messenger 112
219pac7311 093a:2600 PAC7311 Typhoon 222pac7311 093a:2600 PAC7311 Typhoon
220pac7311 093a:2601 Philips SPC 610 NC 223pac7311 093a:2601 Philips SPC 610 NC
@@ -279,6 +282,7 @@ spca561 10fd:7e50 FlyCam Usb 100
279zc3xx 10fd:8050 Typhoon Webshot II USB 300k 282zc3xx 10fd:8050 Typhoon Webshot II USB 300k
280ov534 1415:2000 Sony HD Eye for PS3 (SLEH 00201) 283ov534 1415:2000 Sony HD Eye for PS3 (SLEH 00201)
281pac207 145f:013a Trust WB-1300N 284pac207 145f:013a Trust WB-1300N
285vc032x 15b8:6001 HP 2.0 Megapixel
282vc032x 15b8:6002 HP 2.0 Megapixel rz406aa 286vc032x 15b8:6002 HP 2.0 Megapixel rz406aa
283spca501 1776:501c Arowana 300K CMOS Camera 287spca501 1776:501c Arowana 300K CMOS Camera
284t613 17a1:0128 TASCORP JPEG Webcam, NGS Cyclops 288t613 17a1:0128 TASCORP JPEG Webcam, NGS Cyclops
diff --git a/Documentation/video4linux/pxa_camera.txt b/Documentation/video4linux/pxa_camera.txt
new file mode 100644
index 000000000000..b1137f9a53eb
--- /dev/null
+++ b/Documentation/video4linux/pxa_camera.txt
@@ -0,0 +1,125 @@
1 PXA-Camera Host Driver
2 ======================
3
4Constraints
5-----------
6 a) Image size for YUV422P format
7 All YUV422P images are enforced to have width x height % 16 = 0.
8 This is due to DMA constraints, which transfers only planes of 8 byte
9 multiples.
10
11
12Global video workflow
13---------------------
14 a) QCI stopped
15 Initialy, the QCI interface is stopped.
16 When a buffer is queued (pxa_videobuf_ops->buf_queue), the QCI starts.
17
18 b) QCI started
19 More buffers can be queued while the QCI is started without halting the
20 capture. The new buffers are "appended" at the tail of the DMA chain, and
21 smoothly captured one frame after the other.
22
23 Once a buffer is filled in the QCI interface, it is marked as "DONE" and
24 removed from the active buffers list. It can be then requeud or dequeued by
25 userland application.
26
27 Once the last buffer is filled in, the QCI interface stops.
28
29
30DMA usage
31---------
32 a) DMA flow
33 - first buffer queued for capture
34 Once a first buffer is queued for capture, the QCI is started, but data
35 transfer is not started. On "End Of Frame" interrupt, the irq handler
36 starts the DMA chain.
37 - capture of one videobuffer
38 The DMA chain starts transfering data into videobuffer RAM pages.
39 When all pages are transfered, the DMA irq is raised on "ENDINTR" status
40 - finishing one videobuffer
41 The DMA irq handler marks the videobuffer as "done", and removes it from
42 the active running queue
43 Meanwhile, the next videobuffer (if there is one), is transfered by DMA
44 - finishing the last videobuffer
45 On the DMA irq of the last videobuffer, the QCI is stopped.
46
47 b) DMA prepared buffer will have this structure
48
49 +------------+-----+---------------+-----------------+
50 | desc-sg[0] | ... | desc-sg[last] | finisher/linker |
51 +------------+-----+---------------+-----------------+
52
53 This structure is pointed by dma->sg_cpu.
54 The descriptors are used as follows :
55 - desc-sg[i]: i-th descriptor, transfering the i-th sg
56 element to the video buffer scatter gather
57 - finisher: has ddadr=DADDR_STOP, dcmd=ENDIRQEN
58 - linker: has ddadr= desc-sg[0] of next video buffer, dcmd=0
59
60 For the next schema, let's assume d0=desc-sg[0] .. dN=desc-sg[N],
61 "f" stands for finisher and "l" for linker.
62 A typical running chain is :
63
64 Videobuffer 1 Videobuffer 2
65 +---------+----+---+ +----+----+----+---+
66 | d0 | .. | dN | l | | d0 | .. | dN | f |
67 +---------+----+-|-+ ^----+----+----+---+
68 | |
69 +----+
70
71 After the chaining is finished, the chain looks like :
72
73 Videobuffer 1 Videobuffer 2 Videobuffer 3
74 +---------+----+---+ +----+----+----+---+ +----+----+----+---+
75 | d0 | .. | dN | l | | d0 | .. | dN | l | | d0 | .. | dN | f |
76 +---------+----+-|-+ ^----+----+----+-|-+ ^----+----+----+---+
77 | | | |
78 +----+ +----+
79 new_link
80
81 c) DMA hot chaining timeslice issue
82
83 As DMA chaining is done while DMA _is_ running, the linking may be done
84 while the DMA jumps from one Videobuffer to another. On the schema, that
85 would be a problem if the following sequence is encountered :
86
87 - DMA chain is Videobuffer1 + Videobuffer2
88 - pxa_videobuf_queue() is called to queue Videobuffer3
89 - DMA controller finishes Videobuffer2, and DMA stops
90 =>
91 Videobuffer 1 Videobuffer 2
92 +---------+----+---+ +----+----+----+---+
93 | d0 | .. | dN | l | | d0 | .. | dN | f |
94 +---------+----+-|-+ ^----+----+----+-^-+
95 | | |
96 +----+ +-- DMA DDADR loads DDADR_STOP
97
98 - pxa_dma_add_tail_buf() is called, the Videobuffer2 "finisher" is
99 replaced by a "linker" to Videobuffer3 (creation of new_link)
100 - pxa_videobuf_queue() finishes
101 - the DMA irq handler is called, which terminates Videobuffer2
102 - Videobuffer3 capture is not scheduled on DMA chain (as it stopped !!!)
103
104 Videobuffer 1 Videobuffer 2 Videobuffer 3
105 +---------+----+---+ +----+----+----+---+ +----+----+----+---+
106 | d0 | .. | dN | l | | d0 | .. | dN | l | | d0 | .. | dN | f |
107 +---------+----+-|-+ ^----+----+----+-|-+ ^----+----+----+---+
108 | | | |
109 +----+ +----+
110 new_link
111 DMA DDADR still is DDADR_STOP
112
113 - pxa_camera_check_link_miss() is called
114 This checks if the DMA is finished and a buffer is still on the
115 pcdev->capture list. If that's the case, the capture will be restarted,
116 and Videobuffer3 is scheduled on DMA chain.
117 - the DMA irq handler finishes
118
119 Note: if DMA stops just after pxa_camera_check_link_miss() reads DDADR()
120 value, we have the guarantee that the DMA irq handler will be called back
121 when the DMA will finish the buffer, and pxa_camera_check_link_miss() will
122 be called again, to reschedule Videobuffer3.
123
124--
125Author: Robert Jarzmik <robert.jarzmik@free.fr>
diff --git a/Documentation/video4linux/si470x.txt b/Documentation/video4linux/si470x.txt
index 49679e6aaa76..3a7823e01b4d 100644
--- a/Documentation/video4linux/si470x.txt
+++ b/Documentation/video4linux/si470x.txt
@@ -1,6 +1,6 @@
1Driver for USB radios for the Silicon Labs Si470x FM Radio Receivers 1Driver for USB radios for the Silicon Labs Si470x FM Radio Receivers
2 2
3Copyright (c) 2008 Tobias Lorenz <tobias.lorenz@gmx.net> 3Copyright (c) 2009 Tobias Lorenz <tobias.lorenz@gmx.net>
4 4
5 5
6Information from Silicon Labs 6Information from Silicon Labs
@@ -41,7 +41,7 @@ chips are known to work:
41- 10c4:818a: Silicon Labs USB FM Radio Reference Design 41- 10c4:818a: Silicon Labs USB FM Radio Reference Design
42- 06e1:a155: ADS/Tech FM Radio Receiver (formerly Instant FM Music) (RDX-155-EF) 42- 06e1:a155: ADS/Tech FM Radio Receiver (formerly Instant FM Music) (RDX-155-EF)
43- 1b80:d700: KWorld USB FM Radio SnapMusic Mobile 700 (FM700) 43- 1b80:d700: KWorld USB FM Radio SnapMusic Mobile 700 (FM700)
44- 10c5:819a: DealExtreme USB Radio 44- 10c5:819a: Sanei Electric, Inc. FM USB Radio (sold as DealExtreme.com PCear)
45 45
46 46
47Software 47Software
@@ -52,6 +52,7 @@ Testing is usually done with most application under Debian/testing:
52- gradio - GTK FM radio tuner 52- gradio - GTK FM radio tuner
53- kradio - Comfortable Radio Application for KDE 53- kradio - Comfortable Radio Application for KDE
54- radio - ncurses-based radio application 54- radio - ncurses-based radio application
55- mplayer - The Ultimate Movie Player For Linux
55 56
56There is also a library libv4l, which can be used. It's going to have a function 57There is also a library libv4l, which can be used. It's going to have a function
57for frequency seeking, either by using hardware functionality as in radio-si470x 58for frequency seeking, either by using hardware functionality as in radio-si470x
@@ -69,7 +70,7 @@ Audio Listing
69USB Audio is provided by the ALSA snd_usb_audio module. It is recommended to 70USB Audio is provided by the ALSA snd_usb_audio module. It is recommended to
70also select SND_USB_AUDIO, as this is required to get sound from the radio. For 71also select SND_USB_AUDIO, as this is required to get sound from the radio. For
71listing you have to redirect the sound, for example using one of the following 72listing you have to redirect the sound, for example using one of the following
72commands. 73commands. Please adjust the audio devices to your needs (/dev/dsp* and hw:x,x).
73 74
74If you just want to test audio (very poor quality): 75If you just want to test audio (very poor quality):
75cat /dev/dsp1 > /dev/dsp 76cat /dev/dsp1 > /dev/dsp
@@ -80,6 +81,10 @@ sox -2 --endian little -r 96000 -t oss /dev/dsp1 -t oss /dev/dsp
80If you use arts try: 81If you use arts try:
81arecord -D hw:1,0 -r96000 -c2 -f S16_LE | artsdsp aplay -B - 82arecord -D hw:1,0 -r96000 -c2 -f S16_LE | artsdsp aplay -B -
82 83
84If you use mplayer try:
85mplayer -radio adevice=hw=1.0:arate=96000 \
86 -rawaudio rate=96000 \
87 radio://<frequency>/capture
83 88
84Module Parameters 89Module Parameters
85================= 90=================
diff --git a/Documentation/video4linux/v4l2-framework.txt b/Documentation/video4linux/v4l2-framework.txt
index ff124374e9ba..854808b67fae 100644
--- a/Documentation/video4linux/v4l2-framework.txt
+++ b/Documentation/video4linux/v4l2-framework.txt
@@ -47,7 +47,9 @@ All drivers have the following structure:
473) Creating V4L2 device nodes (/dev/videoX, /dev/vbiX, /dev/radioX and 473) Creating V4L2 device nodes (/dev/videoX, /dev/vbiX, /dev/radioX and
48 /dev/vtxX) and keeping track of device-node specific data. 48 /dev/vtxX) and keeping track of device-node specific data.
49 49
504) Filehandle-specific structs containing per-filehandle data. 504) Filehandle-specific structs containing per-filehandle data;
51
525) video buffer handling.
51 53
52This is a rough schematic of how it all relates: 54This is a rough schematic of how it all relates:
53 55
@@ -82,12 +84,20 @@ You must register the device instance:
82 v4l2_device_register(struct device *dev, struct v4l2_device *v4l2_dev); 84 v4l2_device_register(struct device *dev, struct v4l2_device *v4l2_dev);
83 85
84Registration will initialize the v4l2_device struct and link dev->driver_data 86Registration will initialize the v4l2_device struct and link dev->driver_data
85to v4l2_dev. Registration will also set v4l2_dev->name to a value derived from 87to v4l2_dev. If v4l2_dev->name is empty then it will be set to a value derived
86dev (driver name followed by the bus_id, to be precise). You may change the 88from dev (driver name followed by the bus_id, to be precise). If you set it
87name after registration if you want. 89up before calling v4l2_device_register then it will be untouched. If dev is
90NULL, then you *must* setup v4l2_dev->name before calling v4l2_device_register.
88 91
89The first 'dev' argument is normally the struct device pointer of a pci_dev, 92The first 'dev' argument is normally the struct device pointer of a pci_dev,
90usb_device or platform_device. 93usb_interface or platform_device. It is rare for dev to be NULL, but it happens
94with ISA devices or when one device creates multiple PCI devices, thus making
95it impossible to associate v4l2_dev with a particular parent.
96
97You can also supply a notify() callback that can be called by sub-devices to
98notify you of events. Whether you need to set this depends on the sub-device.
99Any notifications a sub-device supports must be defined in a header in
100include/media/<subdevice>.h.
91 101
92You unregister with: 102You unregister with:
93 103
@@ -95,6 +105,17 @@ You unregister with:
95 105
96Unregistering will also automatically unregister all subdevs from the device. 106Unregistering will also automatically unregister all subdevs from the device.
97 107
108If you have a hotpluggable device (e.g. a USB device), then when a disconnect
109happens the parent device becomes invalid. Since v4l2_device has a pointer to
110that parent device it has to be cleared as well to mark that the parent is
111gone. To do this call:
112
113 v4l2_device_disconnect(struct v4l2_device *v4l2_dev);
114
115This does *not* unregister the subdevs, so you still need to call the
116v4l2_device_unregister() function for that. If your driver is not hotpluggable,
117then there is no need to call v4l2_device_disconnect().
118
98Sometimes you need to iterate over all devices registered by a specific 119Sometimes you need to iterate over all devices registered by a specific
99driver. This is usually the case if multiple device drivers use the same 120driver. This is usually the case if multiple device drivers use the same
100hardware. E.g. the ivtvfb driver is a framebuffer driver that uses the ivtv 121hardware. E.g. the ivtvfb driver is a framebuffer driver that uses the ivtv
@@ -134,7 +155,7 @@ The recommended approach is as follows:
134 155
135static atomic_t drv_instance = ATOMIC_INIT(0); 156static atomic_t drv_instance = ATOMIC_INIT(0);
136 157
137static int __devinit drv_probe(struct pci_dev *dev, 158static int __devinit drv_probe(struct pci_dev *pdev,
138 const struct pci_device_id *pci_id) 159 const struct pci_device_id *pci_id)
139{ 160{
140 ... 161 ...
@@ -218,7 +239,7 @@ to add new ops and categories.
218 239
219A sub-device driver initializes the v4l2_subdev struct using: 240A sub-device driver initializes the v4l2_subdev struct using:
220 241
221 v4l2_subdev_init(subdev, &ops); 242 v4l2_subdev_init(sd, &ops);
222 243
223Afterwards you need to initialize subdev->name with a unique name and set the 244Afterwards you need to initialize subdev->name with a unique name and set the
224module owner. This is done for you if you use the i2c helper functions. 245module owner. This is done for you if you use the i2c helper functions.
@@ -226,7 +247,7 @@ module owner. This is done for you if you use the i2c helper functions.
226A device (bridge) driver needs to register the v4l2_subdev with the 247A device (bridge) driver needs to register the v4l2_subdev with the
227v4l2_device: 248v4l2_device:
228 249
229 int err = v4l2_device_register_subdev(device, subdev); 250 int err = v4l2_device_register_subdev(v4l2_dev, sd);
230 251
231This can fail if the subdev module disappeared before it could be registered. 252This can fail if the subdev module disappeared before it could be registered.
232After this function was called successfully the subdev->dev field points to 253After this function was called successfully the subdev->dev field points to
@@ -234,17 +255,17 @@ the v4l2_device.
234 255
235You can unregister a sub-device using: 256You can unregister a sub-device using:
236 257
237 v4l2_device_unregister_subdev(subdev); 258 v4l2_device_unregister_subdev(sd);
238 259
239Afterwards the subdev module can be unloaded and subdev->dev == NULL. 260Afterwards the subdev module can be unloaded and sd->dev == NULL.
240 261
241You can call an ops function either directly: 262You can call an ops function either directly:
242 263
243 err = subdev->ops->core->g_chip_ident(subdev, &chip); 264 err = sd->ops->core->g_chip_ident(sd, &chip);
244 265
245but it is better and easier to use this macro: 266but it is better and easier to use this macro:
246 267
247 err = v4l2_subdev_call(subdev, core, g_chip_ident, &chip); 268 err = v4l2_subdev_call(sd, core, g_chip_ident, &chip);
248 269
249The macro will to the right NULL pointer checks and returns -ENODEV if subdev 270The macro will to the right NULL pointer checks and returns -ENODEV if subdev
250is NULL, -ENOIOCTLCMD if either subdev->core or subdev->core->g_chip_ident is 271is NULL, -ENOIOCTLCMD if either subdev->core or subdev->core->g_chip_ident is
@@ -252,19 +273,19 @@ NULL, or the actual result of the subdev->ops->core->g_chip_ident ops.
252 273
253It is also possible to call all or a subset of the sub-devices: 274It is also possible to call all or a subset of the sub-devices:
254 275
255 v4l2_device_call_all(dev, 0, core, g_chip_ident, &chip); 276 v4l2_device_call_all(v4l2_dev, 0, core, g_chip_ident, &chip);
256 277
257Any subdev that does not support this ops is skipped and error results are 278Any subdev that does not support this ops is skipped and error results are
258ignored. If you want to check for errors use this: 279ignored. If you want to check for errors use this:
259 280
260 err = v4l2_device_call_until_err(dev, 0, core, g_chip_ident, &chip); 281 err = v4l2_device_call_until_err(v4l2_dev, 0, core, g_chip_ident, &chip);
261 282
262Any error except -ENOIOCTLCMD will exit the loop with that error. If no 283Any error except -ENOIOCTLCMD will exit the loop with that error. If no
263errors (except -ENOIOCTLCMD) occured, then 0 is returned. 284errors (except -ENOIOCTLCMD) occured, then 0 is returned.
264 285
265The second argument to both calls is a group ID. If 0, then all subdevs are 286The second argument to both calls is a group ID. If 0, then all subdevs are
266called. If non-zero, then only those whose group ID match that value will 287called. If non-zero, then only those whose group ID match that value will
267be called. Before a bridge driver registers a subdev it can set subdev->grp_id 288be called. Before a bridge driver registers a subdev it can set sd->grp_id
268to whatever value it wants (it's 0 by default). This value is owned by the 289to whatever value it wants (it's 0 by default). This value is owned by the
269bridge driver and the sub-device driver will never modify or use it. 290bridge driver and the sub-device driver will never modify or use it.
270 291
@@ -276,6 +297,11 @@ e.g. AUDIO_CONTROLLER and specify that as the group ID value when calling
276v4l2_device_call_all(). That ensures that it will only go to the subdev 297v4l2_device_call_all(). That ensures that it will only go to the subdev
277that needs it. 298that needs it.
278 299
300If the sub-device needs to notify its v4l2_device parent of an event, then
301it can call v4l2_subdev_notify(sd, notification, arg). This macro checks
302whether there is a notify() callback defined and returns -ENODEV if not.
303Otherwise the result of the notify() call is returned.
304
279The advantage of using v4l2_subdev is that it is a generic struct and does 305The advantage of using v4l2_subdev is that it is a generic struct and does
280not contain any knowledge about the underlying hardware. So a driver might 306not contain any knowledge about the underlying hardware. So a driver might
281contain several subdevs that use an I2C bus, but also a subdev that is 307contain several subdevs that use an I2C bus, but also a subdev that is
@@ -325,32 +351,25 @@ And this to go from an i2c_client to a v4l2_subdev struct:
325 351
326 struct v4l2_subdev *sd = i2c_get_clientdata(client); 352 struct v4l2_subdev *sd = i2c_get_clientdata(client);
327 353
328Finally you need to make a command function to make driver->command()
329call the right subdev_ops functions:
330
331static int subdev_command(struct i2c_client *client, unsigned cmd, void *arg)
332{
333 return v4l2_subdev_command(i2c_get_clientdata(client), cmd, arg);
334}
335
336If driver->command is never used then you can leave this out. Eventually the
337driver->command usage should be removed from v4l.
338
339Make sure to call v4l2_device_unregister_subdev(sd) when the remove() callback 354Make sure to call v4l2_device_unregister_subdev(sd) when the remove() callback
340is called. This will unregister the sub-device from the bridge driver. It is 355is called. This will unregister the sub-device from the bridge driver. It is
341safe to call this even if the sub-device was never registered. 356safe to call this even if the sub-device was never registered.
342 357
358You need to do this because when the bridge driver destroys the i2c adapter
359the remove() callbacks are called of the i2c devices on that adapter.
360After that the corresponding v4l2_subdev structures are invalid, so they
361have to be unregistered first. Calling v4l2_device_unregister_subdev(sd)
362from the remove() callback ensures that this is always done correctly.
363
343 364
344The bridge driver also has some helper functions it can use: 365The bridge driver also has some helper functions it can use:
345 366
346struct v4l2_subdev *sd = v4l2_i2c_new_subdev(adapter, "module_foo", "chipid", 0x36); 367struct v4l2_subdev *sd = v4l2_i2c_new_subdev(v4l2_dev, adapter,
368 "module_foo", "chipid", 0x36);
347 369
348This loads the given module (can be NULL if no module needs to be loaded) and 370This loads the given module (can be NULL if no module needs to be loaded) and
349calls i2c_new_device() with the given i2c_adapter and chip/address arguments. 371calls i2c_new_device() with the given i2c_adapter and chip/address arguments.
350If all goes well, then it registers the subdev with the v4l2_device. It gets 372If all goes well, then it registers the subdev with the v4l2_device.
351the v4l2_device by calling i2c_get_adapdata(adapter), so you should make sure
352that adapdata is set to v4l2_device when you setup the i2c_adapter in your
353driver.
354 373
355You can also use v4l2_i2c_new_probed_subdev() which is very similar to 374You can also use v4l2_i2c_new_probed_subdev() which is very similar to
356v4l2_i2c_new_subdev(), except that it has an array of possible I2C addresses 375v4l2_i2c_new_subdev(), except that it has an array of possible I2C addresses
@@ -358,6 +377,14 @@ that it should probe. Internally it calls i2c_new_probed_device().
358 377
359Both functions return NULL if something went wrong. 378Both functions return NULL if something went wrong.
360 379
380Note that the chipid you pass to v4l2_i2c_new_(probed_)subdev() is usually
381the same as the module name. It allows you to specify a chip variant, e.g.
382"saa7114" or "saa7115". In general though the i2c driver autodetects this.
383The use of chipid is something that needs to be looked at more closely at a
384later date. It differs between i2c drivers and as such can be confusing.
385To see which chip variants are supported you can look in the i2c driver code
386for the i2c_device_id table. This lists all the possibilities.
387
361 388
362struct video_device 389struct video_device
363------------------- 390-------------------
@@ -396,6 +423,15 @@ You should also set these fields:
396- ioctl_ops: if you use the v4l2_ioctl_ops to simplify ioctl maintenance 423- ioctl_ops: if you use the v4l2_ioctl_ops to simplify ioctl maintenance
397 (highly recommended to use this and it might become compulsory in the 424 (highly recommended to use this and it might become compulsory in the
398 future!), then set this to your v4l2_ioctl_ops struct. 425 future!), then set this to your v4l2_ioctl_ops struct.
426- parent: you only set this if v4l2_device was registered with NULL as
427 the parent device struct. This only happens in cases where one hardware
428 device has multiple PCI devices that all share the same v4l2_device core.
429
430 The cx88 driver is an example of this: one core v4l2_device struct, but
431 it is used by both an raw video PCI device (cx8800) and a MPEG PCI device
432 (cx8802). Since the v4l2_device cannot be associated with a particular
433 PCI device it is setup without a parent device. But when the struct
434 video_device is setup you do know which parent PCI device to use.
399 435
400If you use v4l2_ioctl_ops, then you should set either .unlocked_ioctl or 436If you use v4l2_ioctl_ops, then you should set either .unlocked_ioctl or
401.ioctl to video_ioctl2 in your v4l2_file_operations struct. 437.ioctl to video_ioctl2 in your v4l2_file_operations struct.
@@ -499,8 +535,8 @@ There are a few useful helper functions:
499 535
500You can set/get driver private data in the video_device struct using: 536You can set/get driver private data in the video_device struct using:
501 537
502void *video_get_drvdata(struct video_device *dev); 538void *video_get_drvdata(struct video_device *vdev);
503void video_set_drvdata(struct video_device *dev, void *data); 539void video_set_drvdata(struct video_device *vdev, void *data);
504 540
505Note that you can safely call video_set_drvdata() before calling 541Note that you can safely call video_set_drvdata() before calling
506video_register_device(). 542video_register_device().
@@ -519,3 +555,103 @@ void *video_drvdata(struct file *file);
519You can go from a video_device struct to the v4l2_device struct using: 555You can go from a video_device struct to the v4l2_device struct using:
520 556
521struct v4l2_device *v4l2_dev = vdev->v4l2_dev; 557struct v4l2_device *v4l2_dev = vdev->v4l2_dev;
558
559video buffer helper functions
560-----------------------------
561
562The v4l2 core API provides a standard method for dealing with video
563buffers. Those methods allow a driver to implement read(), mmap() and
564overlay() on a consistent way.
565
566There are currently methods for using video buffers on devices that
567supports DMA with scatter/gather method (videobuf-dma-sg), DMA with
568linear access (videobuf-dma-contig), and vmalloced buffers, mostly
569used on USB drivers (videobuf-vmalloc).
570
571Any driver using videobuf should provide operations (callbacks) for
572four handlers:
573
574ops->buf_setup - calculates the size of the video buffers and avoid they
575 to waste more than some maximum limit of RAM;
576ops->buf_prepare - fills the video buffer structs and calls
577 videobuf_iolock() to alloc and prepare mmaped memory;
578ops->buf_queue - advices the driver that another buffer were
579 requested (by read() or by QBUF);
580ops->buf_release - frees any buffer that were allocated.
581
582In order to use it, the driver need to have a code (generally called at
583interrupt context) that will properly handle the buffer request lists,
584announcing that a new buffer were filled.
585
586The irq handling code should handle the videobuf task lists, in order
587to advice videobuf that a new frame were filled, in order to honor to a
588request. The code is generally like this one:
589 if (list_empty(&dma_q->active))
590 return;
591
592 buf = list_entry(dma_q->active.next, struct vbuffer, vb.queue);
593
594 if (!waitqueue_active(&buf->vb.done))
595 return;
596
597 /* Some logic to handle the buf may be needed here */
598
599 list_del(&buf->vb.queue);
600 do_gettimeofday(&buf->vb.ts);
601 wake_up(&buf->vb.done);
602
603Those are the videobuffer functions used on drivers, implemented on
604videobuf-core:
605
606- Videobuf init functions
607 videobuf_queue_sg_init()
608 Initializes the videobuf infrastructure. This function should be
609 called before any other videobuf function on drivers that uses DMA
610 Scatter/Gather buffers.
611
612 videobuf_queue_dma_contig_init
613 Initializes the videobuf infrastructure. This function should be
614 called before any other videobuf function on drivers that need DMA
615 contiguous buffers.
616
617 videobuf_queue_vmalloc_init()
618 Initializes the videobuf infrastructure. This function should be
619 called before any other videobuf function on USB (and other drivers)
620 that need a vmalloced type of videobuf.
621
622- videobuf_iolock()
623 Prepares the videobuf memory for the proper method (read, mmap, overlay).
624
625- videobuf_queue_is_busy()
626 Checks if a videobuf is streaming.
627
628- videobuf_queue_cancel()
629 Stops video handling.
630
631- videobuf_mmap_free()
632 frees mmap buffers.
633
634- videobuf_stop()
635 Stops video handling, ends mmap and frees mmap and other buffers.
636
637- V4L2 api functions. Those functions correspond to VIDIOC_foo ioctls:
638 videobuf_reqbufs(), videobuf_querybuf(), videobuf_qbuf(),
639 videobuf_dqbuf(), videobuf_streamon(), videobuf_streamoff().
640
641- V4L1 api function (corresponds to VIDIOCMBUF ioctl):
642 videobuf_cgmbuf()
643 This function is used to provide backward compatibility with V4L1
644 API.
645
646- Some help functions for read()/poll() operations:
647 videobuf_read_stream()
648 For continuous stream read()
649 videobuf_read_one()
650 For snapshot read()
651 videobuf_poll_stream()
652 polling help function
653
654The better way to understand it is to take a look at vivi driver. One
655of the main reasons for vivi is to be a videobuf usage example. the
656vivi_thread_tick() does the task that the IRQ callback would do on PCI
657drivers (or the irq callback on USB).
diff --git a/Documentation/video4linux/v4lgrab.c b/Documentation/video4linux/v4lgrab.c
index d6e70bef8ad0..05769cff1009 100644
--- a/Documentation/video4linux/v4lgrab.c
+++ b/Documentation/video4linux/v4lgrab.c
@@ -105,8 +105,8 @@ int main(int argc, char ** argv)
105 struct video_picture vpic; 105 struct video_picture vpic;
106 106
107 unsigned char *buffer, *src; 107 unsigned char *buffer, *src;
108 int bpp = 24, r, g, b; 108 int bpp = 24, r = 0, g = 0, b = 0;
109 unsigned int i, src_depth; 109 unsigned int i, src_depth = 16;
110 110
111 if (fd < 0) { 111 if (fd < 0) {
112 perror(VIDEO_DEV); 112 perror(VIDEO_DEV);
diff --git a/Documentation/video4linux/zr364xx.txt b/Documentation/video4linux/zr364xx.txt
index 5c81e3ae6458..7f3d1955d214 100644
--- a/Documentation/video4linux/zr364xx.txt
+++ b/Documentation/video4linux/zr364xx.txt
@@ -65,3 +65,4 @@ Vendor Product Distributor Model
650x06d6 0x003b Trust Powerc@m 970Z 650x06d6 0x003b Trust Powerc@m 970Z
660x0a17 0x004e Pentax Optio 50 660x0a17 0x004e Pentax Optio 50
670x041e 0x405d Creative DiVi CAM 516 670x041e 0x405d Creative DiVi CAM 516
680x08ca 0x2102 Aiptek DV T300
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
index 2131b00b63f6..2f77ced35df7 100644
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -1,5 +1,7 @@
100-INDEX 100-INDEX
2 - this file. 2 - this file.
3active_mm.txt
4 - An explanation from Linus about tsk->active_mm vs tsk->mm.
3balance 5balance
4 - various information on memory balancing. 6 - various information on memory balancing.
5hugetlbpage.txt 7hugetlbpage.txt
diff --git a/Documentation/vm/active_mm.txt b/Documentation/vm/active_mm.txt
new file mode 100644
index 000000000000..4ee1f643d897
--- /dev/null
+++ b/Documentation/vm/active_mm.txt
@@ -0,0 +1,83 @@
1List: linux-kernel
2Subject: Re: active_mm
3From: Linus Torvalds <torvalds () transmeta ! com>
4Date: 1999-07-30 21:36:24
5
6Cc'd to linux-kernel, because I don't write explanations all that often,
7and when I do I feel better about more people reading them.
8
9On Fri, 30 Jul 1999, David Mosberger wrote:
10>
11> Is there a brief description someplace on how "mm" vs. "active_mm" in
12> the task_struct are supposed to be used? (My apologies if this was
13> discussed on the mailing lists---I just returned from vacation and
14> wasn't able to follow linux-kernel for a while).
15
16Basically, the new setup is:
17
18 - we have "real address spaces" and "anonymous address spaces". The
19 difference is that an anonymous address space doesn't care about the
20 user-level page tables at all, so when we do a context switch into an
21 anonymous address space we just leave the previous address space
22 active.
23
24 The obvious use for a "anonymous address space" is any thread that
25 doesn't need any user mappings - all kernel threads basically fall into
26 this category, but even "real" threads can temporarily say that for
27 some amount of time they are not going to be interested in user space,
28 and that the scheduler might as well try to avoid wasting time on
29 switching the VM state around. Currently only the old-style bdflush
30 sync does that.
31
32 - "tsk->mm" points to the "real address space". For an anonymous process,
33 tsk->mm will be NULL, for the logical reason that an anonymous process
34 really doesn't _have_ a real address space at all.
35
36 - however, we obviously need to keep track of which address space we
37 "stole" for such an anonymous user. For that, we have "tsk->active_mm",
38 which shows what the currently active address space is.
39
40 The rule is that for a process with a real address space (ie tsk->mm is
41 non-NULL) the active_mm obviously always has to be the same as the real
42 one.
43
44 For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the
45 "borrowed" mm while the anonymous process is running. When the
46 anonymous process gets scheduled away, the borrowed address space is
47 returned and cleared.
48
49To support all that, the "struct mm_struct" now has two counters: a
50"mm_users" counter that is how many "real address space users" there are,
51and a "mm_count" counter that is the number of "lazy" users (ie anonymous
52users) plus one if there are any real users.
53
54Usually there is at least one real user, but it could be that the real
55user exited on another CPU while a lazy user was still active, so you do
56actually get cases where you have a address space that is _only_ used by
57lazy users. That is often a short-lived state, because once that thread
58gets scheduled away in favour of a real thread, the "zombie" mm gets
59released because "mm_users" becomes zero.
60
61Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any
62more. "init_mm" should be considered just a "lazy context when no other
63context is available", and in fact it is mainly used just at bootup when
64no real VM has yet been created. So code that used to check
65
66 if (current->mm == &init_mm)
67
68should generally just do
69
70 if (!current->mm)
71
72instead (which makes more sense anyway - the test is basically one of "do
73we have a user context", and is generally done by the page fault handler
74and things like that).
75
76Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago,
77because it slightly changes the interfaces to accomodate the alpha (who
78would have thought it, but the alpha actually ends up having one of the
79ugliest context switch codes - unlike the other architectures where the MM
80and register state is separate, the alpha PALcode joins the two, and you
81need to switch both together).
82
83(From http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
index 6aaaeb38730c..be45dbb9d7f2 100644
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -8,7 +8,8 @@ The current memory policy support was added to Linux 2.6 around May 2004. This
8document attempts to describe the concepts and APIs of the 2.6 memory policy 8document attempts to describe the concepts and APIs of the 2.6 memory policy
9support. 9support.
10 10
11Memory policies should not be confused with cpusets (Documentation/cpusets.txt) 11Memory policies should not be confused with cpusets
12(Documentation/cgroups/cpusets.txt)
12which is an administrative mechanism for restricting the nodes from which 13which is an administrative mechanism for restricting the nodes from which
13memory may be allocated by a set of processes. Memory policies are a 14memory may be allocated by a set of processes. Memory policies are a
14programming interface that a NUMA-aware application can take advantage of. When 15programming interface that a NUMA-aware application can take advantage of. When
diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration
index d5fdfd34bbaf..6513fe2d90b8 100644
--- a/Documentation/vm/page_migration
+++ b/Documentation/vm/page_migration
@@ -37,7 +37,8 @@ locations.
37 37
38Larger installations usually partition the system using cpusets into 38Larger installations usually partition the system using cpusets into
39sections of nodes. Paul Jackson has equipped cpusets with the ability to 39sections of nodes. Paul Jackson has equipped cpusets with the ability to
40move pages when a task is moved to another cpuset (See ../cpusets.txt). 40move pages when a task is moved to another cpuset (See
41Documentation/cgroups/cpusets.txt).
41Cpusets allows the automation of process locality. If a task is moved to 42Cpusets allows the automation of process locality. If a task is moved to
42a new cpuset then also all its pages are moved with it so that the 43a new cpuset then also all its pages are moved with it so that the
43performance of the process does not sink dramatically. Also the pages 44performance of the process does not sink dramatically. Also the pages
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
index 0706a7282a8c..2d70d0d95108 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.txt
@@ -1,588 +1,691 @@
1 1 ==============================
2This document describes the Linux memory management "Unevictable LRU" 2 UNEVICTABLE LRU INFRASTRUCTURE
3infrastructure and the use of this infrastructure to manage several types 3 ==============================
4of "unevictable" pages. The document attempts to provide the overall 4
5rationale behind this mechanism and the rationale for some of the design 5========
6decisions that drove the implementation. The latter design rationale is 6CONTENTS
7discussed in the context of an implementation description. Admittedly, one 7========
8can obtain the implementation details--the "what does it do?"--by reading the 8
9code. One hopes that the descriptions below add value by provide the answer 9 (*) The Unevictable LRU
10to "why does it do that?". 10
11 11 - The unevictable page list.
12Unevictable LRU Infrastructure: 12 - Memory control group interaction.
13 13 - Marking address spaces unevictable.
14The Unevictable LRU adds an additional LRU list to track unevictable pages 14 - Detecting Unevictable Pages.
15and to hide these pages from vmscan. This mechanism is based on a patch by 15 - vmscan's handling of unevictable pages.
16Larry Woodman of Red Hat to address several scalability problems with page 16
17 (*) mlock()'d pages.
18
19 - History.
20 - Basic management.
21 - mlock()/mlockall() system call handling.
22 - Filtering special vmas.
23 - munlock()/munlockall() system call handling.
24 - Migrating mlocked pages.
25 - mmap(MAP_LOCKED) system call handling.
26 - munmap()/exit()/exec() system call handling.
27 - try_to_unmap().
28 - try_to_munlock() reverse map scan.
29 - Page reclaim in shrink_*_list().
30
31
32============
33INTRODUCTION
34============
35
36This document describes the Linux memory manager's "Unevictable LRU"
37infrastructure and the use of this to manage several types of "unevictable"
38pages.
39
40The document attempts to provide the overall rationale behind this mechanism
41and the rationale for some of the design decisions that drove the
42implementation. The latter design rationale is discussed in the context of an
43implementation description. Admittedly, one can obtain the implementation
44details - the "what does it do?" - by reading the code. One hopes that the
45descriptions below add value by provide the answer to "why does it do that?".
46
47
48===================
49THE UNEVICTABLE LRU
50===================
51
52The Unevictable LRU facility adds an additional LRU list to track unevictable
53pages and to hide these pages from vmscan. This mechanism is based on a patch
54by Larry Woodman of Red Hat to address several scalability problems with page
17reclaim in Linux. The problems have been observed at customer sites on large 55reclaim in Linux. The problems have been observed at customer sites on large
18memory x86_64 systems. For example, a non-numal x86_64 platform with 128GB 56memory x86_64 systems.
19of main memory will have over 32 million 4k pages in a single zone. When a 57
20large fraction of these pages are not evictable for any reason [see below], 58To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of
21vmscan will spend a lot of time scanning the LRU lists looking for the small 59main memory will have over 32 million 4k pages in a single zone. When a large
22fraction of pages that are evictable. This can result in a situation where 60fraction of these pages are not evictable for any reason [see below], vmscan
23all cpus are spending 100% of their time in vmscan for hours or days on end, 61will spend a lot of time scanning the LRU lists looking for the small fraction
24with the system completely unresponsive. 62of pages that are evictable. This can result in a situation where all CPUs are
25 63spending 100% of their time in vmscan for hours or days on end, with the system
26The Unevictable LRU infrastructure addresses the following classes of 64completely unresponsive.
27unevictable pages: 65
28 66The unevictable list addresses the following classes of unevictable pages:
29+ page owned by ramfs 67
30+ page mapped into SHM_LOCKed shared memory regions 68 (*) Those owned by ramfs.
31+ page mapped into VM_LOCKED [mlock()ed] vmas 69
32 70 (*) Those mapped into SHM_LOCK'd shared memory regions.
33The infrastructure might be able to handle other conditions that make pages 71
72 (*) Those mapped into VM_LOCKED [mlock()ed] VMAs.
73
74The infrastructure may also be able to handle other conditions that make pages
34unevictable, either by definition or by circumstance, in the future. 75unevictable, either by definition or by circumstance, in the future.
35 76
36 77
37The Unevictable LRU List 78THE UNEVICTABLE PAGE LIST
79-------------------------
38 80
39The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list 81The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list
40called the "unevictable" list and an associated page flag, PG_unevictable, to 82called the "unevictable" list and an associated page flag, PG_unevictable, to
41indicate that the page is being managed on the unevictable list. The 83indicate that the page is being managed on the unevictable list.
42PG_unevictable flag is analogous to, and mutually exclusive with, the PG_active 84
43flag in that it indicates on which LRU list a page resides when PG_lru is set. 85The PG_unevictable flag is analogous to, and mutually exclusive with, the
44The unevictable LRU list is source configurable based on the UNEVICTABLE_LRU 86PG_active flag in that it indicates on which LRU list a page resides when
45Kconfig option. 87PG_lru is set. The unevictable list is compile-time configurable based on the
88UNEVICTABLE_LRU Kconfig option.
46 89
47The Unevictable LRU infrastructure maintains unevictable pages on an additional 90The Unevictable LRU infrastructure maintains unevictable pages on an additional
48LRU list for a few reasons: 91LRU list for a few reasons:
49 92
501) We get to "treat unevictable pages just like we treat other pages in the 93 (1) We get to "treat unevictable pages just like we treat other pages in the
51 system, which means we get to use the same code to manipulate them, the 94 system - which means we get to use the same code to manipulate them, the
52 same code to isolate them (for migrate, etc.), the same code to keep track 95 same code to isolate them (for migrate, etc.), the same code to keep track
53 of the statistics, etc..." [Rik van Riel] 96 of the statistics, etc..." [Rik van Riel]
97
98 (2) We want to be able to migrate unevictable pages between nodes for memory
99 defragmentation, workload management and memory hotplug. The linux kernel
100 can only migrate pages that it can successfully isolate from the LRU
101 lists. If we were to maintain pages elsewhere than on an LRU-like list,
102 where they can be found by isolate_lru_page(), we would prevent their
103 migration, unless we reworked migration code to find the unevictable pages
104 itself.
54 105
552) We want to be able to migrate unevictable pages between nodes--for memory
56 defragmentation, workload management and memory hotplug. The linux kernel
57 can only migrate pages that it can successfully isolate from the lru lists.
58 If we were to maintain pages elsewise than on an lru-like list, where they
59 can be found by isolate_lru_page(), we would prevent their migration, unless
60 we reworked migration code to find the unevictable pages.
61 106
107The unevictable list does not differentiate between file-backed and anonymous,
108swap-backed pages. This differentiation is only important while the pages are,
109in fact, evictable.
62 110
63The unevictable LRU list does not differentiate between file backed and swap 111The unevictable list benefits from the "arrayification" of the per-zone LRU
64backed [anon] pages. This differentiation is only important while the pages 112lists and statistics originally proposed and posted by Christoph Lameter.
65are, in fact, evictable.
66 113
67The unevictable LRU list benefits from the "arrayification" of the per-zone 114The unevictable list does not use the LRU pagevec mechanism. Rather,
68LRU lists and statistics originally proposed and posted by Christoph Lameter. 115unevictable pages are placed directly on the page's zone's unevictable list
116under the zone lru_lock. This allows us to prevent the stranding of pages on
117the unevictable list when one task has the page isolated from the LRU and other
118tasks are changing the "evictability" state of the page.
69 119
70The unevictable list does not use the lru pagevec mechanism. Rather,
71unevictable pages are placed directly on the page's zone's unevictable
72list under the zone lru_lock. The reason for this is to prevent stranding
73of pages on the unevictable list when one task has the page isolated from the
74lru and other tasks are changing the "evictability" state of the page.
75 120
121MEMORY CONTROL GROUP INTERACTION
122--------------------------------
76 123
77Unevictable LRU and Memory Controller Interaction 124The unevictable LRU facility interacts with the memory control group [aka
125memory controller; see Documentation/cgroups/memory.txt] by extending the
126lru_list enum.
127
128The memory controller data structure automatically gets a per-zone unevictable
129list as a result of the "arrayification" of the per-zone LRU lists (one per
130lru_list enum element). The memory controller tracks the movement of pages to
131and from the unevictable list.
78 132
79The memory controller data structure automatically gets a per zone unevictable
80lru list as a result of the "arrayification" of the per-zone LRU lists. The
81memory controller tracks the movement of pages to and from the unevictable list.
82When a memory control group comes under memory pressure, the controller will 133When a memory control group comes under memory pressure, the controller will
83not attempt to reclaim pages on the unevictable list. This has a couple of 134not attempt to reclaim pages on the unevictable list. This has a couple of
84effects. Because the pages are "hidden" from reclaim on the unevictable list, 135effects:
85the reclaim process can be more efficient, dealing only with pages that have 136
86a chance of being reclaimed. On the other hand, if too many of the pages 137 (1) Because the pages are "hidden" from reclaim on the unevictable list, the
87charged to the control group are unevictable, the evictable portion of the 138 reclaim process can be more efficient, dealing only with pages that have a
88working set of the tasks in the control group may not fit into the available 139 chance of being reclaimed.
89memory. This can cause the control group to thrash or to oom-kill tasks. 140
90 141 (2) On the other hand, if too many of the pages charged to the control group
91 142 are unevictable, the evictable portion of the working set of the tasks in
92Unevictable LRU: Detecting Unevictable Pages 143 the control group may not fit into the available memory. This can cause
93 144 the control group to thrash or to OOM-kill tasks.
94The function page_evictable(page, vma) in vmscan.c determines whether a 145
95page is evictable or not. For ramfs pages and pages in SHM_LOCKed regions, 146
96page_evictable() tests a new address space flag, AS_UNEVICTABLE, in the page's 147MARKING ADDRESS SPACES UNEVICTABLE
97address space using a wrapper function. Wrapper functions are used to set, 148----------------------------------
98clear and test the flag to reduce the requirement for #ifdef's throughout the 149
99source code. AS_UNEVICTABLE is set on ramfs inode/mapping when it is created. 150For facilities such as ramfs none of the pages attached to the address space
100This flag remains for the life of the inode. 151may be evicted. To prevent eviction of any such pages, the AS_UNEVICTABLE
101 152address space flag is provided, and this can be manipulated by a filesystem
102For shared memory regions, AS_UNEVICTABLE is set when an application 153using a number of wrapper functions:
103successfully SHM_LOCKs the region and is removed when the region is 154
104SHM_UNLOCKed. Note that shmctl(SHM_LOCK, ...) does not populate the page 155 (*) void mapping_set_unevictable(struct address_space *mapping);
105tables for the region as does, for example, mlock(). So, we make no special 156
106effort to push any pages in the SHM_LOCKed region to the unevictable list. 157 Mark the address space as being completely unevictable.
107Vmscan will do this when/if it encounters the pages during reclaim. On 158
108SHM_UNLOCK, shmctl() scans the pages in the region and "rescues" them from the 159 (*) void mapping_clear_unevictable(struct address_space *mapping);
109unevictable list if no other condition keeps them unevictable. If a SHM_LOCKed 160
110region is destroyed, the pages are also "rescued" from the unevictable list in 161 Mark the address space as being evictable.
111the process of freeing them. 162
112 163 (*) int mapping_unevictable(struct address_space *mapping);
113page_evictable() detects mlock()ed pages by testing an additional page flag, 164
114PG_mlocked via the PageMlocked() wrapper. If the page is NOT mlocked, and a 165 Query the address space, and return true if it is completely
115non-NULL vma is supplied, page_evictable() will check whether the vma is 166 unevictable.
167
168These are currently used in two places in the kernel:
169
170 (1) By ramfs to mark the address spaces of its inodes when they are created,
171 and this mark remains for the life of the inode.
172
173 (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called.
174
175 Note that SHM_LOCK is not required to page in the locked pages if they're
176 swapped out; the application must touch the pages manually if it wants to
177 ensure they're in memory.
178
179
180DETECTING UNEVICTABLE PAGES
181---------------------------
182
183The function page_evictable() in vmscan.c determines whether a page is
184evictable or not using the query function outlined above [see section "Marking
185address spaces unevictable"] to check the AS_UNEVICTABLE flag.
186
187For address spaces that are so marked after being populated (as SHM regions
188might be), the lock action (eg: SHM_LOCK) can be lazy, and need not populate
189the page tables for the region as does, for example, mlock(), nor need it make
190any special effort to push any pages in the SHM_LOCK'd area to the unevictable
191list. Instead, vmscan will do this if and when it encounters the pages during
192a reclamation scan.
193
194On an unlock action (such as SHM_UNLOCK), the unlocker (eg: shmctl()) must scan
195the pages in the region and "rescue" them from the unevictable list if no other
196condition is keeping them unevictable. If an unevictable region is destroyed,
197the pages are also "rescued" from the unevictable list in the process of
198freeing them.
199
200page_evictable() also checks for mlocked pages by testing an additional page
201flag, PG_mlocked (as wrapped by PageMlocked()). If the page is NOT mlocked,
202and a non-NULL VMA is supplied, page_evictable() will check whether the VMA is
116VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and 203VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and
117update the appropriate statistics if the vma is VM_LOCKED. This method allows 204update the appropriate statistics if the vma is VM_LOCKED. This method allows
118efficient "culling" of pages in the fault path that are being faulted in to 205efficient "culling" of pages in the fault path that are being faulted in to
119VM_LOCKED vmas. 206VM_LOCKED VMAs.
120 207
121 208
122Unevictable Pages and Vmscan [shrink_*_list()] 209VMSCAN'S HANDLING OF UNEVICTABLE PAGES
210--------------------------------------
123 211
124If unevictable pages are culled in the fault path, or moved to the unevictable 212If unevictable pages are culled in the fault path, or moved to the unevictable
125list at mlock() or mmap() time, vmscan will never encounter the pages until 213list at mlock() or mmap() time, vmscan will not encounter the pages until they
126they have become evictable again, for example, via munlock() and have been 214have become evictable again (via munlock() for example) and have been "rescued"
127"rescued" from the unevictable list. However, there may be situations where we 215from the unevictable list. However, there may be situations where we decide,
128decide, for the sake of expediency, to leave a unevictable page on one of the 216for the sake of expediency, to leave a unevictable page on one of the regular
129regular active/inactive LRU lists for vmscan to deal with. Vmscan checks for 217active/inactive LRU lists for vmscan to deal with. vmscan checks for such
130such pages in all of the shrink_{active|inactive|page}_list() functions and 218pages in all of the shrink_{active|inactive|page}_list() functions and will
131will "cull" such pages that it encounters--that is, it diverts those pages to 219"cull" such pages that it encounters: that is, it diverts those pages to the
132the unevictable list for the zone being scanned. 220unevictable list for the zone being scanned.
133 221
134There may be situations where a page is mapped into a VM_LOCKED vma, but the 222There may be situations where a page is mapped into a VM_LOCKED VMA, but the
135page is not marked as PageMlocked. Such pages will make it all the way to 223page is not marked as PG_mlocked. Such pages will make it all the way to
136shrink_page_list() where they will be detected when vmscan walks the reverse 224shrink_page_list() where they will be detected when vmscan walks the reverse
137map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK, shrink_page_list() 225map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK,
138will cull the page at that point. 226shrink_page_list() will cull the page at that point.
139 227
140To "cull" an unevictable page, vmscan simply puts the page back on the lru 228To "cull" an unevictable page, vmscan simply puts the page back on the LRU list
141list using putback_lru_page()--the inverse operation to isolate_lru_page()-- 229using putback_lru_page() - the inverse operation to isolate_lru_page() - after
142after dropping the page lock. Because the condition which makes the page 230dropping the page lock. Because the condition which makes the page unevictable
143unevictable may change once the page is unlocked, putback_lru_page() will 231may change once the page is unlocked, putback_lru_page() will recheck the
144recheck the unevictable state of a page that it places on the unevictable lru 232unevictable state of a page that it places on the unevictable list. If the
145list. If the page has become unevictable, putback_lru_page() removes it from 233page has become unevictable, putback_lru_page() removes it from the list and
146the list and retries, including the page_unevictable() test. Because such a 234retries, including the page_unevictable() test. Because such a race is a rare
147race is a rare event and movement of pages onto the unevictable list should be 235event and movement of pages onto the unevictable list should be rare, these
148rare, these extra evictabilty checks should not occur in the majority of calls 236extra evictabilty checks should not occur in the majority of calls to
149to putback_lru_page(). 237putback_lru_page().
150 238
151 239
152Mlocked Page: Prior Work 240=============
241MLOCKED PAGES
242=============
153 243
154The "Unevictable Mlocked Pages" infrastructure is based on work originally 244The unevictable page list is also useful for mlock(), in addition to ramfs and
245SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in
246NOMMU situations, all mappings are effectively mlocked.
247
248
249HISTORY
250-------
251
252The "Unevictable mlocked Pages" infrastructure is based on work originally
155posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU". 253posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU".
156Nick posted his patch as an alternative to a patch posted by Christoph 254Nick posted his patch as an alternative to a patch posted by Christoph Lameter
157Lameter to achieve the same objective--hiding mlocked pages from vmscan. 255to achieve the same objective: hiding mlocked pages from vmscan.
158In Nick's patch, he used one of the struct page lru list link fields as a count 256
159of VM_LOCKED vmas that map the page. This use of the link field for a count 257In Nick's patch, he used one of the struct page LRU list link fields as a count
160prevented the management of the pages on an LRU list. Thus, mlocked pages were 258of VM_LOCKED VMAs that map the page. This use of the link field for a count
161not migratable as isolate_lru_page() could not find them and the lru list link 259prevented the management of the pages on an LRU list, and thus mlocked pages
162field was not available to the migration subsystem. Nick resolved this by 260were not migratable as isolate_lru_page() could not find them, and the LRU list
163putting mlocked pages back on the lru list before attempting to isolate them, 261link field was not available to the migration subsystem.
164thus abandoning the count of VM_LOCKED vmas. When Nick's patch was integrated 262
165with the Unevictable LRU work, the count was replaced by walking the reverse 263Nick resolved this by putting mlocked pages back on the lru list before
166map to determine whether any VM_LOCKED vmas mapped the page. More on this 264attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When
167below. 265Nick's patch was integrated with the Unevictable LRU work, the count was
168 266replaced by walking the reverse map to determine whether any VM_LOCKED VMAs
169 267mapped the page. More on this below.
170Mlocked Pages: Basic Management 268
171 269
172Mlocked pages--pages mapped into a VM_LOCKED vma--represent one class of 270BASIC MANAGEMENT
173unevictable pages. When such a page has been "noticed" by the memory 271----------------
174management subsystem, the page is marked with the PG_mlocked [PageMlocked()] 272
175flag. A PageMlocked() page will be placed on the unevictable LRU list when 273mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable
176it is added to the LRU. Pages can be "noticed" by memory management in 274pages. When such a page has been "noticed" by the memory management subsystem,
177several places: 275the page is marked with the PG_mlocked flag. This can be manipulated using the
178 276PageMlocked() functions.
1791) in the mlock()/mlockall() system call handlers. 277
1802) in the mmap() system call handler when mmap()ing a region with the 278A PG_mlocked page will be placed on the unevictable list when it is added to
181 MAP_LOCKED flag, or mmap()ing a region in a task that has called 279the LRU. Such pages can be "noticed" by memory management in several places:
182 mlockall() with the MCL_FUTURE flag. Both of these conditions result 280
183 in the VM_LOCKED flag being set for the vma. 281 (1) in the mlock()/mlockall() system call handlers;
1843) in the fault path, if mlocked pages are "culled" in the fault path, 282
185 and when a VM_LOCKED stack segment is expanded. 283 (2) in the mmap() system call handler when mmapping a region with the
1864) as mentioned above, in vmscan:shrink_page_list() when attempting to 284 MAP_LOCKED flag;
187 reclaim a page in a VM_LOCKED vma via try_to_unmap(). 285
188 286 (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE
189Mlocked pages become unlocked and rescued from the unevictable list when: 287 flag
190 288
1911) mapped in a range unlocked via the munlock()/munlockall() system calls. 289 (4) in the fault path, if mlocked pages are "culled" in the fault path,
1922) munmapped() out of the last VM_LOCKED vma that maps the page, including 290 and when a VM_LOCKED stack segment is expanded; or
193 unmapping at task exit. 291
1943) when the page is truncated from the last VM_LOCKED vma of an mmap()ed file. 292 (5) as mentioned above, in vmscan:shrink_page_list() when attempting to
1954) before a page is COWed in a VM_LOCKED vma. 293 reclaim a page in a VM_LOCKED VMA via try_to_unmap()
196 294
197 295all of which result in the VM_LOCKED flag being set for the VMA if it doesn't
198Mlocked Pages: mlock()/mlockall() System Call Handling 296already have it set.
297
298mlocked pages become unlocked and rescued from the unevictable list when:
299
300 (1) mapped in a range unlocked via the munlock()/munlockall() system calls;
301
302 (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including
303 unmapping at task exit;
304
305 (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file;
306 or
307
308 (4) before a page is COW'd in a VM_LOCKED VMA.
309
310
311mlock()/mlockall() SYSTEM CALL HANDLING
312---------------------------------------
199 313
200Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup() 314Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup()
201for each vma in the range specified by the call. In the case of mlockall(), 315for each VMA in the range specified by the call. In the case of mlockall(),
202this is the entire active address space of the task. Note that mlock_fixup() 316this is the entire active address space of the task. Note that mlock_fixup()
203is used for both mlock()ing and munlock()ing a range of memory. A call to 317is used for both mlocking and munlocking a range of memory. A call to mlock()
204mlock() an already VM_LOCKED vma, or to munlock() a vma that is not VM_LOCKED 318an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED is
205is treated as a no-op--mlock_fixup() simply returns. 319treated as a no-op, and mlock_fixup() simply returns.
206 320
207If the vma passes some filtering described in "Mlocked Pages: Filtering Vmas" 321If the VMA passes some filtering as described in "Filtering Special Vmas"
208below, mlock_fixup() will attempt to merge the vma with its neighbors or split 322below, mlock_fixup() will attempt to merge the VMA with its neighbors or split
209off a subset of the vma if the range does not cover the entire vma. Once the 323off a subset of the VMA if the range does not cover the entire VMA. Once the
210vma has been merged or split or neither, mlock_fixup() will call 324VMA has been merged or split or neither, mlock_fixup() will call
211__mlock_vma_pages_range() to fault in the pages via get_user_pages() and 325__mlock_vma_pages_range() to fault in the pages via get_user_pages() and to
212to mark the pages as mlocked via mlock_vma_page(). 326mark the pages as mlocked via mlock_vma_page().
213 327
214Note that the vma being mlocked might be mapped with PROT_NONE. In this case, 328Note that the VMA being mlocked might be mapped with PROT_NONE. In this case,
215get_user_pages() will be unable to fault in the pages. That's OK. If pages 329get_user_pages() will be unable to fault in the pages. That's okay. If pages
216do end up getting faulted into this VM_LOCKED vma, we'll handle them in the 330do end up getting faulted into this VM_LOCKED VMA, we'll handle them in the
217fault path or in vmscan. 331fault path or in vmscan.
218 332
219Also note that a page returned by get_user_pages() could be truncated or 333Also note that a page returned by get_user_pages() could be truncated or
220migrated out from under us, while we're trying to mlock it. To detect 334migrated out from under us, while we're trying to mlock it. To detect this,
221this, __mlock_vma_pages_range() tests the page_mapping after acquiring 335__mlock_vma_pages_range() checks page_mapping() after acquiring the page lock.
222the page lock. If the page is still associated with its mapping, we'll 336If the page is still associated with its mapping, we'll go ahead and call
223go ahead and call mlock_vma_page(). If the mapping is gone, we just 337mlock_vma_page(). If the mapping is gone, we just unlock the page and move on.
224unlock the page and move on. Worse case, this results in page mapped 338In the worst case, this will result in a page mapped in a VM_LOCKED VMA
225in a VM_LOCKED vma remaining on a normal LRU list without being 339remaining on a normal LRU list without being PageMlocked(). Again, vmscan will
226PageMlocked(). Again, vmscan will detect and cull such pages. 340detect and cull such pages.
227 341
228mlock_vma_page(), called with the page locked [N.B., not "mlocked"], will 342mlock_vma_page() will call TestSetPageMlocked() for each page returned by
229TestSetPageMlocked() for each page returned by get_user_pages(). We use 343get_user_pages(). We use TestSetPageMlocked() because the page might already
230TestSetPageMlocked() because the page might already be mlocked by another 344be mlocked by another task/VMA and we don't want to do extra work. We
231task/vma and we don't want to do extra work. We especially do not want to 345especially do not want to count an mlocked page more than once in the
232count an mlocked page more than once in the statistics. If the page was 346statistics. If the page was already mlocked, mlock_vma_page() need do nothing
233already mlocked, mlock_vma_page() is done. 347more.
234 348
235If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the 349If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the
236page from the LRU, as it is likely on the appropriate active or inactive list 350page from the LRU, as it is likely on the appropriate active or inactive list
237at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will 351at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will put
238putback the page--putback_lru_page()--which will notice that the page is now 352back the page - by calling putback_lru_page() - which will notice that the page
239mlocked and divert the page to the zone's unevictable LRU list. If 353is now mlocked and divert the page to the zone's unevictable list. If
240mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle 354mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle
241it later if/when it attempts to reclaim the page. 355it later if and when it attempts to reclaim the page.
242 356
243 357
244Mlocked Pages: Filtering Special Vmas 358FILTERING SPECIAL VMAS
359----------------------
245 360
246mlock_fixup() filters several classes of "special" vmas: 361mlock_fixup() filters several classes of "special" VMAs:
247 362
2481) vmas with VM_IO|VM_PFNMAP set are skipped entirely. The pages behind 3631) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind
249 these mappings are inherently pinned, so we don't need to mark them as 364 these mappings are inherently pinned, so we don't need to mark them as
250 mlocked. In any case, most of the pages have no struct page in which to 365 mlocked. In any case, most of the pages have no struct page in which to so
251 so mark the page. Because of this, get_user_pages() will fail for these 366 mark the page. Because of this, get_user_pages() will fail for these VMAs,
252 vmas, so there is no sense in attempting to visit them. 367 so there is no sense in attempting to visit them.
253 368
2542) vmas mapping hugetlbfs page are already effectively pinned into memory. 3692) VMAs mapping hugetlbfs page are already effectively pinned into memory. We
255 We don't need nor want to mlock() these pages. However, to preserve the 370 neither need nor want to mlock() these pages. However, to preserve the
256 prior behavior of mlock()--before the unevictable/mlock changes-- 371 prior behavior of mlock() - before the unevictable/mlock changes -
257 mlock_fixup() will call make_pages_present() in the hugetlbfs vma range 372 mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to
258 to allocate the huge pages and populate the ptes. 373 allocate the huge pages and populate the ptes.
259 374
2603) vmas with VM_DONTEXPAND|VM_RESERVED are generally user space mappings of 3753) VMAs with VM_DONTEXPAND or VM_RESERVED are generally userspace mappings of
261 kernel pages, such as the vdso page, relay channel pages, etc. These pages 376 kernel pages, such as the VDSO page, relay channel pages, etc. These pages
262 are inherently unevictable and are not managed on the LRU lists. 377 are inherently unevictable and are not managed on the LRU lists.
263 mlock_fixup() treats these vmas the same as hugetlbfs vmas. It calls 378 mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls
264 make_pages_present() to populate the ptes. 379 make_pages_present() to populate the ptes.
265 380
266Note that for all of these special vmas, mlock_fixup() does not set the 381Note that for all of these special VMAs, mlock_fixup() does not set the
267VM_LOCKED flag. Therefore, we won't have to deal with them later during 382VM_LOCKED flag. Therefore, we won't have to deal with them later during
268munlock() or munmap()--for example, at task exit. Neither does mlock_fixup() 383munlock(), munmap() or task exit. Neither does mlock_fixup() account these
269account these vmas against the task's "locked_vm". 384VMAs against the task's "locked_vm".
270 385
271Mlocked Pages: Downgrading the Mmap Semaphore. 386
272 387munlock()/munlockall() SYSTEM CALL HANDLING
273mlock_fixup() must be called with the mmap semaphore held for write, because 388-------------------------------------------
274it may have to merge or split vmas. However, mlocking a large region of 389
275memory can take a long time--especially if vmscan must reclaim pages to 390The munlock() and munlockall() system calls are handled by the same functions -
276satisfy the regions requirements. Faulting in a large region with the mmap 391do_mlock[all]() - as the mlock() and mlockall() system calls with the unlock vs
277semaphore held for write can hold off other faults on the address space, in 392lock operation indicated by an argument. So, these system calls are also
278the case of a multi-threaded task. It can also hold off scans of the task's 393handled by mlock_fixup(). Again, if called for an already munlocked VMA,
279address space via /proc. While testing under heavy load, it was observed that 394mlock_fixup() simply returns. Because of the VMA filtering discussed above,
280the ps(1) command could be held off for many minutes while a large segment was 395VM_LOCKED will not be set in any "special" VMAs. So, these VMAs will be
281mlock()ed down.
282
283To address this issue, and to make the system more responsive during mlock()ing
284of large segments, mlock_fixup() downgrades the mmap semaphore to read mode
285during the call to __mlock_vma_pages_range(). This works fine. However, the
286callers of mlock_fixup() expect the semaphore to be returned in write mode.
287So, mlock_fixup() "upgrades" the semphore to write mode. Linux does not
288support an atomic upgrade_sem() call, so mlock_fixup() must drop the semaphore
289and reacquire it in write mode. In a multi-threaded task, it is possible for
290the task memory map to change while the semaphore is dropped. Therefore,
291mlock_fixup() looks up the vma at the range start address after reacquiring
292the semaphore in write mode and verifies that it still covers the original
293range. If not, mlock_fixup() returns an error [-EAGAIN]. All callers of
294mlock_fixup() have been changed to deal with this new error condition.
295
296Note: when munlocking a region, all of the pages should already be resident--
297unless we have racing threads mlocking() and munlocking() regions. So,
298unlocking should not have to wait for page allocations nor faults of any kind.
299Therefore mlock_fixup() does not downgrade the semaphore for munlock().
300
301
302Mlocked Pages: munlock()/munlockall() System Call Handling
303
304The munlock() and munlockall() system calls are handled by the same functions--
305do_mlock[all]()--as the mlock() and mlockall() system calls with the unlock
306vs lock operation indicated by an argument. So, these system calls are also
307handled by mlock_fixup(). Again, if called for an already munlock()ed vma,
308mlock_fixup() simply returns. Because of the vma filtering discussed above,
309VM_LOCKED will not be set in any "special" vmas. So, these vmas will be
310ignored for munlock. 396ignored for munlock.
311 397
312If the vma is VM_LOCKED, mlock_fixup() again attempts to merge or split off 398If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the
313the specified range. The range is then munlocked via the function 399specified range. The range is then munlocked via the function
314__mlock_vma_pages_range()--the same function used to mlock a vma range-- 400__mlock_vma_pages_range() - the same function used to mlock a VMA range -
315passing a flag to indicate that munlock() is being performed. 401passing a flag to indicate that munlock() is being performed.
316 402
317Because the vma access protections could have been changed to PROT_NONE after 403Because the VMA access protections could have been changed to PROT_NONE after
318faulting in and mlocking pages, get_user_pages() was unreliable for visiting 404faulting in and mlocking pages, get_user_pages() was unreliable for visiting
319these pages for munlocking. Because we don't want to leave pages mlocked(), 405these pages for munlocking. Because we don't want to leave pages mlocked,
320get_user_pages() was enhanced to accept a flag to ignore the permissions when 406get_user_pages() was enhanced to accept a flag to ignore the permissions when
321fetching the pages--all of which should be resident as a result of previous 407fetching the pages - all of which should be resident as a result of previous
322mlock()ing. 408mlocking.
323 409
324For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling 410For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling
325munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked 411munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked
326flag using TestClearPageMlocked(). As with mlock_vma_page(), munlock_vma_page() 412flag using TestClearPageMlocked(). As with mlock_vma_page(),
327use the Test*PageMlocked() function to handle the case where the page might 413munlock_vma_page() use the Test*PageMlocked() function to handle the case where
328have already been unlocked by another task. If the page was mlocked, 414the page might have already been unlocked by another task. If the page was
329munlock_vma_page() updates that zone statistics for the number of mlocked 415mlocked, munlock_vma_page() updates that zone statistics for the number of
330pages. Note, however, that at this point we haven't checked whether the page 416mlocked pages. Note, however, that at this point we haven't checked whether
331is mapped by other VM_LOCKED vmas. 417the page is mapped by other VM_LOCKED VMAs.
332 418
333We can't call try_to_munlock(), the function that walks the reverse map to check 419We can't call try_to_munlock(), the function that walks the reverse map to
334for other VM_LOCKED vmas, without first isolating the page from the LRU. 420check for other VM_LOCKED VMAs, without first isolating the page from the LRU.
335try_to_munlock() is a variant of try_to_unmap() and thus requires that the page 421try_to_munlock() is a variant of try_to_unmap() and thus requires that the page
336not be on an lru list. [More on these below.] However, the call to 422not be on an LRU list [more on these below]. However, the call to
337isolate_lru_page() could fail, in which case we couldn't try_to_munlock(). 423isolate_lru_page() could fail, in which case we couldn't try_to_munlock(). So,
338So, we go ahead and clear PG_mlocked up front, as this might be the only chance 424we go ahead and clear PG_mlocked up front, as this might be the only chance we
339we have. If we can successfully isolate the page, we go ahead and 425have. If we can successfully isolate the page, we go ahead and
340try_to_munlock(), which will restore the PG_mlocked flag and update the zone 426try_to_munlock(), which will restore the PG_mlocked flag and update the zone
341page statistics if it finds another vma holding the page mlocked. If we fail 427page statistics if it finds another VMA holding the page mlocked. If we fail
342to isolate the page, we'll have left a potentially mlocked page on the LRU. 428to isolate the page, we'll have left a potentially mlocked page on the LRU.
343This is fine, because we'll catch it later when/if vmscan tries to reclaim the 429This is fine, because we'll catch it later if and if vmscan tries to reclaim
344page. This should be relatively rare. 430the page. This should be relatively rare.
345 431
346Mlocked Pages: Migrating Them... 432
347 433MIGRATING MLOCKED PAGES
348A page that is being migrated has been isolated from the lru lists and is 434-----------------------
349held locked across unmapping of the page, updating the page's mapping 435
350[address_space] entry and copying the contents and state, until the 436A page that is being migrated has been isolated from the LRU lists and is held
351page table entry has been replaced with an entry that refers to the new 437locked across unmapping of the page, updating the page's address space entry
352page. Linux supports migration of mlocked pages and other unevictable 438and copying the contents and state, until the page table entry has been
353pages. This involves simply moving the PageMlocked and PageUnevictable states 439replaced with an entry that refers to the new page. Linux supports migration
354from the old page to the new page. 440of mlocked pages and other unevictable pages. This involves simply moving the
355 441PG_mlocked and PG_unevictable states from the old page to the new page.
356Note that page migration can race with mlocking or munlocking of the same 442
357page. This has been discussed from the mlock/munlock perspective in the 443Note that page migration can race with mlocking or munlocking of the same page.
358respective sections above. Both processes [migration, m[un]locking], hold 444This has been discussed from the mlock/munlock perspective in the respective
359the page locked. This provides the first level of synchronization. Page 445sections above. Both processes (migration and m[un]locking) hold the page
360migration zeros out the page_mapping of the old page before unlocking it, 446locked. This provides the first level of synchronization. Page migration
361so m[un]lock can skip these pages by testing the page mapping under page 447zeros out the page_mapping of the old page before unlocking it, so m[un]lock
362lock. 448can skip these pages by testing the page mapping under page lock.
363 449
364When completing page migration, we place the new and old pages back onto the 450To complete page migration, we place the new and old pages back onto the LRU
365lru after dropping the page lock. The "unneeded" page--old page on success, 451after dropping the page lock. The "unneeded" page - old page on success, new
366new page on failure--will be freed when the reference count held by the 452page on failure - will be freed when the reference count held by the migration
367migration process is released. To ensure that we don't strand pages on the 453process is released. To ensure that we don't strand pages on the unevictable
368unevictable list because of a race between munlock and migration, page 454list because of a race between munlock and migration, page migration uses the
369migration uses the putback_lru_page() function to add migrated pages back to 455putback_lru_page() function to add migrated pages back to the LRU.
370the lru. 456
371 457
372 458mmap(MAP_LOCKED) SYSTEM CALL HANDLING
373Mlocked Pages: mmap(MAP_LOCKED) System Call Handling 459-------------------------------------
374 460
375In addition the the mlock()/mlockall() system calls, an application can request 461In addition the the mlock()/mlockall() system calls, an application can request
376that a region of memory be mlocked using the MAP_LOCKED flag with the mmap() 462that a region of memory be mlocked supplying the MAP_LOCKED flag to the mmap()
377call. Furthermore, any mmap() call or brk() call that expands the heap by a 463call. Furthermore, any mmap() call or brk() call that expands the heap by a
378task that has previously called mlockall() with the MCL_FUTURE flag will result 464task that has previously called mlockall() with the MCL_FUTURE flag will result
379in the newly mapped memory being mlocked. Before the unevictable/mlock changes, 465in the newly mapped memory being mlocked. Before the unevictable/mlock
380the kernel simply called make_pages_present() to allocate pages and populate 466changes, the kernel simply called make_pages_present() to allocate pages and
381the page table. 467populate the page table.
382 468
383To mlock a range of memory under the unevictable/mlock infrastructure, the 469To mlock a range of memory under the unevictable/mlock infrastructure, the
384mmap() handler and task address space expansion functions call 470mmap() handler and task address space expansion functions call
385mlock_vma_pages_range() specifying the vma and the address range to mlock. 471mlock_vma_pages_range() specifying the vma and the address range to mlock.
386mlock_vma_pages_range() filters vmas like mlock_fixup(), as described above in 472mlock_vma_pages_range() filters VMAs like mlock_fixup(), as described above in
387"Mlocked Pages: Filtering Vmas". It will clear the VM_LOCKED flag, which will 473"Filtering Special VMAs". It will clear the VM_LOCKED flag, which will have
388have already been set by the caller, in filtered vmas. Thus these vma's need 474already been set by the caller, in filtered VMAs. Thus these VMA's need not be
389not be visited for munlock when the region is unmapped. 475visited for munlock when the region is unmapped.
390 476
391For "normal" vmas, mlock_vma_pages_range() calls __mlock_vma_pages_range() to 477For "normal" VMAs, mlock_vma_pages_range() calls __mlock_vma_pages_range() to
392fault/allocate the pages and mlock them. Again, like mlock_fixup(), 478fault/allocate the pages and mlock them. Again, like mlock_fixup(),
393mlock_vma_pages_range() downgrades the mmap semaphore to read mode before 479mlock_vma_pages_range() downgrades the mmap semaphore to read mode before
394attempting to fault/allocate and mlock the pages; and "upgrades" the semaphore 480attempting to fault/allocate and mlock the pages and "upgrades" the semaphore
395back to write mode before returning. 481back to write mode before returning.
396 482
397The callers of mlock_vma_pages_range() will have already added the memory 483The callers of mlock_vma_pages_range() will have already added the memory range
398range to be mlocked to the task's "locked_vm". To account for filtered vmas, 484to be mlocked to the task's "locked_vm". To account for filtered VMAs,
399mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the 485mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the
400callers then subtract a non-negative return value from the task's locked_vm. 486callers then subtract a non-negative return value from the task's locked_vm. A
401A negative return value represent an error--for example, from get_user_pages() 487negative return value represent an error - for example, from get_user_pages()
402attempting to fault in a vma with PROT_NONE access. In this case, we leave 488attempting to fault in a VMA with PROT_NONE access. In this case, we leave the
403the memory range accounted as locked_vm, as the protections could be changed 489memory range accounted as locked_vm, as the protections could be changed later
404later and pages allocated into that region. 490and pages allocated into that region.
405 491
406 492
407Mlocked Pages: munmap()/exit()/exec() System Call Handling 493munmap()/exit()/exec() SYSTEM CALL HANDLING
494-------------------------------------------
408 495
409When unmapping an mlocked region of memory, whether by an explicit call to 496When unmapping an mlocked region of memory, whether by an explicit call to
410munmap() or via an internal unmap from exit() or exec() processing, we must 497munmap() or via an internal unmap from exit() or exec() processing, we must
411munlock the pages if we're removing the last VM_LOCKED vma that maps the pages. 498munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages.
412Before the unevictable/mlock changes, mlocking did not mark the pages in any 499Before the unevictable/mlock changes, mlocking did not mark the pages in any
413way, so unmapping them required no processing. 500way, so unmapping them required no processing.
414 501
415To munlock a range of memory under the unevictable/mlock infrastructure, the 502To munlock a range of memory under the unevictable/mlock infrastructure, the
416munmap() hander and task address space tear down function call 503munmap() handler and task address space call tear down function
417munlock_vma_pages_all(). The name reflects the observation that one always 504munlock_vma_pages_all(). The name reflects the observation that one always
418specifies the entire vma range when munlock()ing during unmap of a region. 505specifies the entire VMA range when munlock()ing during unmap of a region.
419Because of the vma filtering when mlocking() regions, only "normal" vmas that 506Because of the VMA filtering when mlocking() regions, only "normal" VMAs that
420actually contain mlocked pages will be passed to munlock_vma_pages_all(). 507actually contain mlocked pages will be passed to munlock_vma_pages_all().
421 508
422munlock_vma_pages_all() clears the VM_LOCKED vma flag and, like mlock_fixup() 509munlock_vma_pages_all() clears the VM_LOCKED VMA flag and, like mlock_fixup()
423for the munlock case, calls __munlock_vma_pages_range() to walk the page table 510for the munlock case, calls __munlock_vma_pages_range() to walk the page table
424for the vma's memory range and munlock_vma_page() each resident page mapped by 511for the VMA's memory range and munlock_vma_page() each resident page mapped by
425the vma. This effectively munlocks the page, only if this is the last 512the VMA. This effectively munlocks the page, only if this is the last
426VM_LOCKED vma that maps the page. 513VM_LOCKED VMA that maps the page.
427
428 514
429Mlocked Page: try_to_unmap()
430 515
431[Note: the code changes represented by this section are really quite small 516try_to_unmap()
432compared to the text to describe what happening and why, and to discuss the 517--------------
433implications.]
434 518
435Pages can, of course, be mapped into multiple vmas. Some of these vmas may 519Pages can, of course, be mapped into multiple VMAs. Some of these VMAs may
436have VM_LOCKED flag set. It is possible for a page mapped into one or more 520have VM_LOCKED flag set. It is possible for a page mapped into one or more
437VM_LOCKED vmas not to have the PG_mlocked flag set and therefore reside on one 521VM_LOCKED VMAs not to have the PG_mlocked flag set and therefore reside on one
438of the active or inactive LRU lists. This could happen if, for example, a 522of the active or inactive LRU lists. This could happen if, for example, a task
439task in the process of munlock()ing the page could not isolate the page from 523in the process of munlocking the page could not isolate the page from the LRU.
440the LRU. As a result, vmscan/shrink_page_list() might encounter such a page 524As a result, vmscan/shrink_page_list() might encounter such a page as described
441as described in "Unevictable Pages and Vmscan [shrink_*_list()]". To 525in section "vmscan's handling of unevictable pages". To handle this situation,
442handle this situation, try_to_unmap() has been enhanced to check for VM_LOCKED 526try_to_unmap() checks for VM_LOCKED VMAs while it is walking a page's reverse
443vmas while it is walking a page's reverse map. 527map.
444 528
445try_to_unmap() is always called, by either vmscan for reclaim or for page 529try_to_unmap() is always called, by either vmscan for reclaim or for page
446migration, with the argument page locked and isolated from the LRU. BUG_ON() 530migration, with the argument page locked and isolated from the LRU. Separate
447assertions enforce this requirement. Separate functions handle anonymous and 531functions handle anonymous and mapped file pages, as these types of pages have
448mapped file pages, as these types of pages have different reverse map 532different reverse map mechanisms.
449mechanisms. 533
450 534 (*) try_to_unmap_anon()
451 try_to_unmap_anon() 535
452 536 To unmap anonymous pages, each VMA in the list anchored in the anon_vma
453To unmap anonymous pages, each vma in the list anchored in the anon_vma must be 537 must be visited - at least until a VM_LOCKED VMA is encountered. If the
454visited--at least until a VM_LOCKED vma is encountered. If the page is being 538 page is being unmapped for migration, VM_LOCKED VMAs do not stop the
455unmapped for migration, VM_LOCKED vmas do not stop the process because mlocked 539 process because mlocked pages are migratable. However, for reclaim, if
456pages are migratable. However, for reclaim, if the page is mapped into a 540 the page is mapped into a VM_LOCKED VMA, the scan stops.
457VM_LOCKED vma, the scan stops. try_to_unmap() attempts to acquire the mmap 541
458semphore of the mm_struct to which the vma belongs in read mode. If this is 542 try_to_unmap_anon() attempts to acquire in read mode the mmap semphore of
459successful, try_to_unmap() will mlock the page via mlock_vma_page()--we 543 the mm_struct to which the VMA belongs. If this is successful, it will
460wouldn't have gotten to try_to_unmap() if the page were already mlocked--and 544 mlock the page via mlock_vma_page() - we wouldn't have gotten to
461will return SWAP_MLOCK, indicating that the page is unevictable. If the 545 try_to_unmap_anon() if the page were already mlocked - and will return
462mmap semaphore cannot be acquired, we are not sure whether the page is really 546 SWAP_MLOCK, indicating that the page is unevictable.
463unevictable or not. In this case, try_to_unmap() will return SWAP_AGAIN. 547
464 548 If the mmap semaphore cannot be acquired, we are not sure whether the page
465 try_to_unmap_file() -- linear mappings 549 is really unevictable or not. In this case, try_to_unmap_anon() will
466 550 return SWAP_AGAIN.
467Unmapping of a mapped file page works the same, except that the scan visits 551
468all vmas that maps the page's index/page offset in the page's mapping's 552 (*) try_to_unmap_file() - linear mappings
469reverse map priority search tree. It must also visit each vma in the page's 553
470mapping's non-linear list, if the list is non-empty. As for anonymous pages, 554 Unmapping of a mapped file page works the same as for anonymous mappings,
471on encountering a VM_LOCKED vma for a mapped file page, try_to_unmap() will 555 except that the scan visits all VMAs that map the page's index/page offset
472attempt to acquire the associated mm_struct's mmap semaphore to mlock the page, 556 in the page's mapping's reverse map priority search tree. It also visits
473returning SWAP_MLOCK if this is successful, and SWAP_AGAIN, if not. 557 each VMA in the page's mapping's non-linear list, if the list is
474 558 non-empty.
475 try_to_unmap_file() -- non-linear mappings 559
476 560 As for anonymous pages, on encountering a VM_LOCKED VMA for a mapped file
477If a page's mapping contains a non-empty non-linear mapping vma list, then 561 page, try_to_unmap_file() will attempt to acquire the associated
478try_to_un{map|lock}() must also visit each vma in that list to determine 562 mm_struct's mmap semaphore to mlock the page, returning SWAP_MLOCK if this
479whether the page is mapped in a VM_LOCKED vma. Again, the scan must visit 563 is successful, and SWAP_AGAIN, if not.
480all vmas in the non-linear list to ensure that the pages is not/should not be 564
481mlocked. If a VM_LOCKED vma is found in the list, the scan could terminate. 565 (*) try_to_unmap_file() - non-linear mappings
482However, there is no easy way to determine whether the page is actually mapped 566
483in a given vma--either for unmapping or testing whether the VM_LOCKED vma 567 If a page's mapping contains a non-empty non-linear mapping VMA list, then
484actually pins the page. 568 try_to_un{map|lock}() must also visit each VMA in that list to determine
485 569 whether the page is mapped in a VM_LOCKED VMA. Again, the scan must visit
486So, try_to_unmap_file() handles non-linear mappings by scanning a certain 570 all VMAs in the non-linear list to ensure that the pages is not/should not
487number of pages--a "cluster"--in each non-linear vma associated with the page's 571 be mlocked.
488mapping, for each file mapped page that vmscan tries to unmap. If this happens 572
489to unmap the page we're trying to unmap, try_to_unmap() will notice this on 573 If a VM_LOCKED VMA is found in the list, the scan could terminate.
490return--(page_mapcount(page) == 0)--and return SWAP_SUCCESS. Otherwise, it 574 However, there is no easy way to determine whether the page is actually
491will return SWAP_AGAIN, causing vmscan to recirculate this page. We take 575 mapped in a given VMA - either for unmapping or testing whether the
492advantage of the cluster scan in try_to_unmap_cluster() as follows: 576 VM_LOCKED VMA actually pins the page.
493 577
494For each non-linear vma, try_to_unmap_cluster() attempts to acquire the mmap 578 try_to_unmap_file() handles non-linear mappings by scanning a certain
495semaphore of the associated mm_struct for read without blocking. If this 579 number of pages - a "cluster" - in each non-linear VMA associated with the
496attempt is successful and the vma is VM_LOCKED, try_to_unmap_cluster() will 580 page's mapping, for each file mapped page that vmscan tries to unmap. If
497retain the mmap semaphore for the scan; otherwise it drops it here. Then, 581 this happens to unmap the page we're trying to unmap, try_to_unmap() will
498for each page in the cluster, if we're holding the mmap semaphore for a locked 582 notice this on return (page_mapcount(page) will be 0) and return
499vma, try_to_unmap_cluster() calls mlock_vma_page() to mlock the page. This 583 SWAP_SUCCESS. Otherwise, it will return SWAP_AGAIN, causing vmscan to
500call is a no-op if the page is already locked, but will mlock any pages in 584 recirculate this page. We take advantage of the cluster scan in
501the non-linear mapping that happen to be unlocked. If one of the pages so 585 try_to_unmap_cluster() as follows:
502mlocked is the page passed in to try_to_unmap(), try_to_unmap_cluster() will 586
503return SWAP_MLOCK, rather than the default SWAP_AGAIN. This will allow vmscan 587 For each non-linear VMA, try_to_unmap_cluster() attempts to acquire the
504to cull the page, rather than recirculating it on the inactive list. Again, 588 mmap semaphore of the associated mm_struct for read without blocking.
505if try_to_unmap_cluster() cannot acquire the vma's mmap sem, it returns 589
506SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED vma, but 590 If this attempt is successful and the VMA is VM_LOCKED,
507couldn't be mlocked. 591 try_to_unmap_cluster() will retain the mmap semaphore for the scan;
508 592 otherwise it drops it here.
509 593
510Mlocked pages: try_to_munlock() Reverse Map Scan 594 Then, for each page in the cluster, if we're holding the mmap semaphore
511 595 for a locked VMA, try_to_unmap_cluster() calls mlock_vma_page() to
512TODO/FIXME: a better name might be page_mlocked()--analogous to the 596 mlock the page. This call is a no-op if the page is already locked,
513page_referenced() reverse map walker. 597 but will mlock any pages in the non-linear mapping that happen to be
514 598 unlocked.
515When munlock_vma_page()--see "Mlocked Pages: munlock()/munlockall() 599
516System Call Handling" above--tries to munlock a page, it needs to 600 If one of the pages so mlocked is the page passed in to try_to_unmap(),
517determine whether or not the page is mapped by any VM_LOCKED vma, without 601 try_to_unmap_cluster() will return SWAP_MLOCK, rather than the default
518actually attempting to unmap all ptes from the page. For this purpose, the 602 SWAP_AGAIN. This will allow vmscan to cull the page, rather than
519unevictable/mlock infrastructure introduced a variant of try_to_unmap() called 603 recirculating it on the inactive list.
520try_to_munlock(). 604
605 Again, if try_to_unmap_cluster() cannot acquire the VMA's mmap sem, it
606 returns SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED
607 VMA, but couldn't be mlocked.
608
609
610try_to_munlock() REVERSE MAP SCAN
611---------------------------------
612
613 [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the
614 page_referenced() reverse map walker.
615
616When munlock_vma_page() [see section "munlock()/munlockall() System Call
617Handling" above] tries to munlock a page, it needs to determine whether or not
618the page is mapped by any VM_LOCKED VMA without actually attempting to unmap
619all PTEs from the page. For this purpose, the unevictable/mlock infrastructure
620introduced a variant of try_to_unmap() called try_to_munlock().
521 621
522try_to_munlock() calls the same functions as try_to_unmap() for anonymous and 622try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
523mapped file pages with an additional argument specifing unlock versus unmap 623mapped file pages with an additional argument specifing unlock versus unmap
524processing. Again, these functions walk the respective reverse maps looking 624processing. Again, these functions walk the respective reverse maps looking
525for VM_LOCKED vmas. When such a vma is found for anonymous pages and file 625for VM_LOCKED VMAs. When such a VMA is found for anonymous pages and file
526pages mapped in linear VMAs, as in the try_to_unmap() case, the functions 626pages mapped in linear VMAs, as in the try_to_unmap() case, the functions
527attempt to acquire the associated mmap semphore, mlock the page via 627attempt to acquire the associated mmap semphore, mlock the page via
528mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the 628mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the
529pre-clearing of the page's PG_mlocked done by munlock_vma_page. 629pre-clearing of the page's PG_mlocked done by munlock_vma_page.
530 630
531If try_to_unmap() is unable to acquire a VM_LOCKED vma's associated mmap 631If try_to_unmap() is unable to acquire a VM_LOCKED VMA's associated mmap
532semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() 632semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() to
533to recycle the page on the inactive list and hope that it has better luck 633recycle the page on the inactive list and hope that it has better luck with the
534with the page next time. 634page next time.
535 635
536For file pages mapped into non-linear vmas, the try_to_munlock() logic works 636For file pages mapped into non-linear VMAs, the try_to_munlock() logic works
537slightly differently. On encountering a VM_LOCKED non-linear vma that might 637slightly differently. On encountering a VM_LOCKED non-linear VMA that might
538map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking 638map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking the
539the page. munlock_vma_page() will just leave the page unlocked and let 639page. munlock_vma_page() will just leave the page unlocked and let vmscan deal
540vmscan deal with it--the usual fallback position. 640with it - the usual fallback position.
541 641
542Note that try_to_munlock()'s reverse map walk must visit every vma in a pages' 642Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's
543reverse map to determine that a page is NOT mapped into any VM_LOCKED vma. 643reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA.
544However, the scan can terminate when it encounters a VM_LOCKED vma and can 644However, the scan can terminate when it encounters a VM_LOCKED VMA and can
545successfully acquire the vma's mmap semphore for read and mlock the page. 645successfully acquire the VMA's mmap semphore for read and mlock the page.
546Although try_to_munlock() can be called many [very many!] times when 646Although try_to_munlock() might be called a great many times when munlocking a
547munlock()ing a large region or tearing down a large address space that has been 647large region or tearing down a large address space that has been mlocked via
548mlocked via mlockall(), overall this is a fairly rare event. 648mlockall(), overall this is a fairly rare event.
549 649
550Mlocked Page: Page Reclaim in shrink_*_list() 650
551 651PAGE RECLAIM IN shrink_*_list()
552shrink_active_list() culls any obviously unevictable pages--i.e., 652-------------------------------
553!page_evictable(page, NULL)--diverting these to the unevictable lru 653
554list. However, shrink_active_list() only sees unevictable pages that 654shrink_active_list() culls any obviously unevictable pages - i.e.
555made it onto the active/inactive lru lists. Note that these pages do not 655!page_evictable(page, NULL) - diverting these to the unevictable list.
556have PageUnevictable set--otherwise, they would be on the unevictable list and 656However, shrink_active_list() only sees unevictable pages that made it onto the
557shrink_active_list would never see them. 657active/inactive lru lists. Note that these pages do not have PageUnevictable
658set - otherwise they would be on the unevictable list and shrink_active_list
659would never see them.
558 660
559Some examples of these unevictable pages on the LRU lists are: 661Some examples of these unevictable pages on the LRU lists are:
560 662
5611) ramfs pages that have been placed on the lru lists when first allocated. 663 (1) ramfs pages that have been placed on the LRU lists when first allocated.
664
665 (2) SHM_LOCK'd shared memory pages. shmctl(SHM_LOCK) does not attempt to
666 allocate or fault in the pages in the shared memory region. This happens
667 when an application accesses the page the first time after SHM_LOCK'ing
668 the segment.
562 669
5632) SHM_LOCKed shared memory pages. shmctl(SHM_LOCK) does not attempt to 670 (3) mlocked pages that could not be isolated from the LRU and moved to the
564 allocate or fault in the pages in the shared memory region. This happens 671 unevictable list in mlock_vma_page().
565 when an application accesses the page the first time after SHM_LOCKing
566 the segment.
567 672
5683) Mlocked pages that could not be isolated from the lru and moved to the 673 (4) Pages mapped into multiple VM_LOCKED VMAs, but try_to_munlock() couldn't
569 unevictable list in mlock_vma_page(). 674 acquire the VMA's mmap semaphore to test the flags and set PageMlocked.
675 munlock_vma_page() was forced to let the page back on to the normal LRU
676 list for vmscan to handle.
570 677
5713) Pages mapped into multiple VM_LOCKED vmas, but try_to_munlock() couldn't 678shrink_inactive_list() also diverts any unevictable pages that it finds on the
572 acquire the vma's mmap semaphore to test the flags and set PageMlocked. 679inactive lists to the appropriate zone's unevictable list.
573 munlock_vma_page() was forced to let the page back on to the normal
574 LRU list for vmscan to handle.
575 680
576shrink_inactive_list() also culls any unevictable pages that it finds on 681shrink_inactive_list() should only see SHM_LOCK'd pages that became SHM_LOCK'd
577the inactive lists, again diverting them to the appropriate zone's unevictable 682after shrink_active_list() had moved them to the inactive list, or pages mapped
578lru list. shrink_inactive_list() should only see SHM_LOCKed pages that became 683into VM_LOCKED VMAs that munlock_vma_page() couldn't isolate from the LRU to
579SHM_LOCKed after shrink_active_list() had moved them to the inactive list, or 684recheck via try_to_munlock(). shrink_inactive_list() won't notice the latter,
580pages mapped into VM_LOCKED vmas that munlock_vma_page() couldn't isolate from 685but will pass on to shrink_page_list().
581the lru to recheck via try_to_munlock(). shrink_inactive_list() won't notice
582the latter, but will pass on to shrink_page_list().
583 686
584shrink_page_list() again culls obviously unevictable pages that it could 687shrink_page_list() again culls obviously unevictable pages that it could
585encounter for similar reason to shrink_inactive_list(). Pages mapped into 688encounter for similar reason to shrink_inactive_list(). Pages mapped into
586VM_LOCKED vmas but without PG_mlocked set will make it all the way to 689VM_LOCKED VMAs but without PG_mlocked set will make it all the way to
587try_to_unmap(). shrink_page_list() will divert them to the unevictable list 690try_to_unmap(). shrink_page_list() will divert them to the unevictable list
588when try_to_unmap() returns SWAP_MLOCK, as discussed above. 691when try_to_unmap() returns SWAP_MLOCK, as discussed above.
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index 7b4596ac4120..e0203662f9e9 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -158,7 +158,7 @@ Offset Proto Name Meaning
1580202/4 2.00+ header Magic signature "HdrS" 1580202/4 2.00+ header Magic signature "HdrS"
1590206/2 2.00+ version Boot protocol version supported 1590206/2 2.00+ version Boot protocol version supported
1600208/4 2.00+ realmode_swtch Boot loader hook (see below) 1600208/4 2.00+ realmode_swtch Boot loader hook (see below)
161020C/2 2.00+ start_sys The load-low segment (0x1000) (obsolete) 161020C/2 2.00+ start_sys_seg The load-low segment (0x1000) (obsolete)
162020E/2 2.00+ kernel_version Pointer to kernel version string 162020E/2 2.00+ kernel_version Pointer to kernel version string
1630210/1 2.00+ type_of_loader Boot loader identifier 1630210/1 2.00+ type_of_loader Boot loader identifier
1640211/1 2.00+ loadflags Boot protocol option flags 1640211/1 2.00+ loadflags Boot protocol option flags
@@ -170,10 +170,11 @@ Offset Proto Name Meaning
1700224/2 2.01+ heap_end_ptr Free memory after setup end 1700224/2 2.01+ heap_end_ptr Free memory after setup end
1710226/2 N/A pad1 Unused 1710226/2 N/A pad1 Unused
1720228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line 1720228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line
173022C/4 2.03+ initrd_addr_max Highest legal initrd address 173022C/4 2.03+ ramdisk_max Highest legal initrd address
1740230/4 2.05+ kernel_alignment Physical addr alignment required for kernel 1740230/4 2.05+ kernel_alignment Physical addr alignment required for kernel
1750234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not 1750234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not
1760235/3 N/A pad2 Unused 1760235/1 N/A pad2 Unused
1770236/2 N/A pad3 Unused
1770238/4 2.06+ cmdline_size Maximum size of the kernel command line 1780238/4 2.06+ cmdline_size Maximum size of the kernel command line
178023C/4 2.07+ hardware_subarch Hardware subarchitecture 179023C/4 2.07+ hardware_subarch Hardware subarchitecture
1790240/8 2.07+ hardware_subarch_data Subarchitecture-specific data 1800240/8 2.07+ hardware_subarch_data Subarchitecture-specific data
@@ -299,14 +300,14 @@ Protocol: 2.00+
299 e.g. 0x0204 for version 2.04, and 0x0a11 for a hypothetical version 300 e.g. 0x0204 for version 2.04, and 0x0a11 for a hypothetical version
300 10.17. 301 10.17.
301 302
302Field name: readmode_swtch 303Field name: realmode_swtch
303Type: modify (optional) 304Type: modify (optional)
304Offset/size: 0x208/4 305Offset/size: 0x208/4
305Protocol: 2.00+ 306Protocol: 2.00+
306 307
307 Boot loader hook (see ADVANCED BOOT LOADER HOOKS below.) 308 Boot loader hook (see ADVANCED BOOT LOADER HOOKS below.)
308 309
309Field name: start_sys 310Field name: start_sys_seg
310Type: read 311Type: read
311Offset/size: 0x20c/2 312Offset/size: 0x20c/2
312Protocol: 2.00+ 313Protocol: 2.00+
@@ -468,7 +469,7 @@ Protocol: 2.02+
468 zero, the kernel will assume that your boot loader does not support 469 zero, the kernel will assume that your boot loader does not support
469 the 2.02+ protocol. 470 the 2.02+ protocol.
470 471
471Field name: initrd_addr_max 472Field name: ramdisk_max
472Type: read 473Type: read
473Offset/size: 0x22c/4 474Offset/size: 0x22c/4
474Protocol: 2.03+ 475Protocol: 2.03+
@@ -542,7 +543,10 @@ Protocol: 2.08+
542 543
543 The payload may be compressed. The format of both the compressed and 544 The payload may be compressed. The format of both the compressed and
544 uncompressed data should be determined using the standard magic 545 uncompressed data should be determined using the standard magic
545 numbers. Currently only gzip compressed ELF is used. 546 numbers. The currently supported compression formats are gzip
547 (magic numbers 1F 8B or 1F 9E), bzip2 (magic number 42 5A) and LZMA
548 (magic number 5D 00). The uncompressed payload is currently always ELF
549 (magic number 7F 45 4C 46).
546 550
547Field name: payload_length 551Field name: payload_length
548Type: read 552Type: read
diff --git a/Documentation/x86/earlyprintk.txt b/Documentation/x86/earlyprintk.txt
new file mode 100644
index 000000000000..607b1a016064
--- /dev/null
+++ b/Documentation/x86/earlyprintk.txt
@@ -0,0 +1,101 @@
1
2Mini-HOWTO for using the earlyprintk=dbgp boot option with a
3USB2 Debug port key and a debug cable, on x86 systems.
4
5You need two computers, the 'USB debug key' special gadget and
6and two USB cables, connected like this:
7
8 [host/target] <-------> [USB debug key] <-------> [client/console]
9
101. There are three specific hardware requirements:
11
12 a.) Host/target system needs to have USB debug port capability.
13
14 You can check this capability by looking at a 'Debug port' bit in
15 the lspci -vvv output:
16
17 # lspci -vvv
18 ...
19 00:1d.7 USB Controller: Intel Corporation 82801H (ICH8 Family) USB2 EHCI Controller #1 (rev 03) (prog-if 20 [EHCI])
20 Subsystem: Lenovo ThinkPad T61
21 Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx-
22 Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
23 Latency: 0
24 Interrupt: pin D routed to IRQ 19
25 Region 0: Memory at fe227000 (32-bit, non-prefetchable) [size=1K]
26 Capabilities: [50] Power Management version 2
27 Flags: PMEClk- DSI- D1- D2- AuxCurrent=375mA PME(D0+,D1-,D2-,D3hot+,D3cold+)
28 Status: D0 PME-Enable- DSel=0 DScale=0 PME+
29 Capabilities: [58] Debug port: BAR=1 offset=00a0
30 ^^^^^^^^^^^ <==================== [ HERE ]
31 Kernel driver in use: ehci_hcd
32 Kernel modules: ehci-hcd
33 ...
34
35( If your system does not list a debug port capability then you probably
36 wont be able to use the USB debug key. )
37
38 b.) You also need a Netchip USB debug cable/key:
39
40 http://www.plxtech.com/products/NET2000/NET20DC/default.asp
41
42 This is a small blue plastic connector with two USB connections,
43 it draws power from its USB connections.
44
45 c.) Thirdly, you need a second client/console system with a regular USB port.
46
472. Software requirements:
48
49 a.) On the host/target system:
50
51 You need to enable the following kernel config option:
52
53 CONFIG_EARLY_PRINTK_DBGP=y
54
55 And you need to add the boot command line: "earlyprintk=dbgp".
56 (If you are using Grub, append it to the 'kernel' line in
57 /etc/grub.conf)
58
59 NOTE: normally earlyprintk console gets turned off once the
60 regular console is alive - use "earlyprintk=dbgp,keep" to keep
61 this channel open beyond early bootup. This can be useful for
62 debugging crashes under Xorg, etc.
63
64 b.) On the client/console system:
65
66 You should enable the following kernel config option:
67
68 CONFIG_USB_SERIAL_DEBUG=y
69
70 On the next bootup with the modified kernel you should
71 get a /dev/ttyUSBx device(s).
72
73 Now this channel of kernel messages is ready to be used: start
74 your favorite terminal emulator (minicom, etc.) and set
75 it up to use /dev/ttyUSB0 - or use a raw 'cat /dev/ttyUSBx' to
76 see the raw output.
77
78 c.) On Nvidia Southbridge based systems: the kernel will try to probe
79 and find out which port has debug device connected.
80
813. Testing that it works fine:
82
83 You can test the output by using earlyprintk=dbgp,keep and provoking
84 kernel messages on the host/target system. You can provoke a harmless
85 kernel message by for example doing:
86
87 echo h > /proc/sysrq-trigger
88
89 On the host/target system you should see this help line in "dmesg" output:
90
91 SysRq : HELP : loglevel(0-9) reBoot Crashdump terminate-all-tasks(E) memory-full-oom-kill(F) kill-all-tasks(I) saK show-backtrace-all-active-cpus(L) show-memory-usage(M) nice-all-RT-tasks(N) powerOff show-registers(P) show-all-timers(Q) unRaw Sync show-task-states(T) Unmount show-blocked-tasks(W) dump-ftrace-buffer(Z)
92
93 On the client/console system do:
94
95 cat /dev/ttyUSB0
96
97 And you should see the help line above displayed shortly after you've
98 provoked it on the host system.
99
100If it does not work then please ask about it on the linux-kernel@vger.kernel.org
101mailing list or contact the x86 maintainers.
diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets b/Documentation/x86/x86_64/fake-numa-for-cpusets
index 33bb56655991..0f11d9becb0b 100644
--- a/Documentation/x86/x86_64/fake-numa-for-cpusets
+++ b/Documentation/x86/x86_64/fake-numa-for-cpusets
@@ -7,7 +7,8 @@ you can create fake NUMA nodes that represent contiguous chunks of memory and
7assign them to cpusets and their attached tasks. This is a way of limiting the 7assign them to cpusets and their attached tasks. This is a way of limiting the
8amount of system memory that are available to a certain class of tasks. 8amount of system memory that are available to a certain class of tasks.
9 9
10For more information on the features of cpusets, see Documentation/cpusets.txt. 10For more information on the features of cpusets, see
11Documentation/cgroups/cpusets.txt.
11There are a number of different configurations you can use for your needs. For 12There are a number of different configurations you can use for your needs. For
12more information on the numa=fake command line option and its various ways of 13more information on the numa=fake command line option and its various ways of
13configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt. 14configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt.
@@ -32,7 +33,7 @@ A machine may be split as follows with "numa=fake=4*512," as reported by dmesg:
32 On node 3 totalpages: 131072 33 On node 3 totalpages: 131072
33 34
34Now following the instructions for mounting the cpusets filesystem from 35Now following the instructions for mounting the cpusets filesystem from
35Documentation/cpusets.txt, you can assign fake nodes (i.e. contiguous memory 36Documentation/cgroups/cpusets.txt, you can assign fake nodes (i.e. contiguous memory
36address spaces) to individual cpusets: 37address spaces) to individual cpusets:
37 38
38 [root@xroads /]# mkdir exampleset 39 [root@xroads /]# mkdir exampleset