aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/00-INDEX4
-rw-r--r--Documentation/ABI/testing/debugfs-kmemtrace71
-rw-r--r--Documentation/ABI/testing/sysfs-bus-pci70
-rw-r--r--Documentation/ABI/testing/sysfs-class-regulator57
-rw-r--r--Documentation/ABI/testing/sysfs-fs-ext481
-rw-r--r--Documentation/DMA-API.txt106
-rw-r--r--Documentation/DocBook/.gitignore4
-rw-r--r--Documentation/DocBook/kernel-api.tmpl3
-rw-r--r--Documentation/DocBook/procfs_example.c9
-rw-r--r--Documentation/PCI/MSI-HOWTO.txt814
-rw-r--r--Documentation/PCI/pci-iov-howto.txt99
-rw-r--r--Documentation/RCU/listRCU.txt6
-rw-r--r--Documentation/RCU/rcu.txt2
-rw-r--r--Documentation/RCU/rculist_nulls.txt4
-rw-r--r--Documentation/Smack.txt42
-rw-r--r--Documentation/cgroups/00-INDEX18
-rw-r--r--Documentation/cgroups/cgroups.txt36
-rw-r--r--Documentation/cgroups/cpusets.txt12
-rw-r--r--Documentation/cgroups/devices.txt2
-rw-r--r--Documentation/cgroups/memcg_test.txt22
-rw-r--r--Documentation/cgroups/memory.txt2
-rw-r--r--Documentation/devices.txt123
-rw-r--r--Documentation/dvb/get_dvb_firmware85
-rw-r--r--Documentation/fb/00-INDEX2
-rw-r--r--Documentation/fb/cyblafb/bugs13
-rw-r--r--Documentation/fb/cyblafb/credits7
-rw-r--r--Documentation/fb/cyblafb/documentation17
-rw-r--r--Documentation/fb/cyblafb/fb.modes154
-rw-r--r--Documentation/fb/cyblafb/performance79
-rw-r--r--Documentation/fb/cyblafb/todo31
-rw-r--r--Documentation/fb/cyblafb/usage217
-rw-r--r--Documentation/fb/cyblafb/whatsnew29
-rw-r--r--Documentation/fb/cyblafb/whycyblafb85
-rw-r--r--Documentation/feature-removal-schedule.txt83
-rw-r--r--Documentation/filesystems/Locking2
-rw-r--r--Documentation/filesystems/caching/backend-api.txt658
-rw-r--r--Documentation/filesystems/caching/cachefiles.txt501
-rw-r--r--Documentation/filesystems/caching/fscache.txt333
-rw-r--r--Documentation/filesystems/caching/netfs-api.txt778
-rw-r--r--Documentation/filesystems/caching/object.txt313
-rw-r--r--Documentation/filesystems/caching/operations.txt213
-rw-r--r--Documentation/filesystems/exofs.txt176
-rw-r--r--Documentation/filesystems/ext3.txt14
-rw-r--r--Documentation/filesystems/ext4.txt30
-rw-r--r--Documentation/filesystems/knfsd-stats.txt159
-rw-r--r--Documentation/filesystems/nfs41-server.txt161
-rw-r--r--Documentation/filesystems/pohmelfs/design_notes.txt70
-rw-r--r--Documentation/filesystems/pohmelfs/info.txt86
-rw-r--r--Documentation/filesystems/pohmelfs/network_protocol.txt227
-rw-r--r--Documentation/filesystems/proc.txt1118
-rw-r--r--Documentation/filesystems/sysfs-pci.txt10
-rw-r--r--Documentation/filesystems/udf.txt2
-rw-r--r--Documentation/ftrace.txt1134
-rw-r--r--Documentation/gpio.txt23
-rw-r--r--Documentation/hwmon/ds162151
-rw-r--r--Documentation/hwmon/lis3lv02d20
-rw-r--r--Documentation/hwmon/ltc421550
-rw-r--r--Documentation/hwmon/pcf8591 (renamed from Documentation/i2c/chips/pcf8591)0
-rw-r--r--Documentation/hwmon/sysfs-interface22
-rw-r--r--Documentation/hwmon/w83627ehf29
-rw-r--r--Documentation/ia64/kvm.txt2
-rw-r--r--Documentation/ioctl/ioctl-number.txt2
-rw-r--r--Documentation/kernel-parameters.txt526
-rw-r--r--Documentation/laptops/acer-wmi.txt10
-rw-r--r--Documentation/laptops/thinkpad-acpi.txt144
-rw-r--r--Documentation/lguest/lguest.c7
-rw-r--r--Documentation/lockdep-design.txt30
-rw-r--r--Documentation/md.txt37
-rw-r--r--Documentation/misc-devices/isl2900362
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt2
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/dma.txt34
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/esdhc.txt24
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/ssi.txt68
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/upm-nand.txt39
-rw-r--r--Documentation/powerpc/dts-bindings/gpio/led.txt46
-rw-r--r--Documentation/powerpc/dts-bindings/mmc-spi-slot.txt23
-rw-r--r--Documentation/scheduler/sched-rt-group.txt2
-rw-r--r--Documentation/scsi/aacraid.txt15
-rw-r--r--Documentation/slow-work.txt174
-rw-r--r--Documentation/sysctl/00-INDEX2
-rw-r--r--Documentation/sysctl/fs.txt74
-rw-r--r--Documentation/sysctl/kernel.txt53
-rw-r--r--Documentation/sysctl/net.txt175
-rw-r--r--Documentation/sysrq.txt7
-rw-r--r--Documentation/tracepoints.txt21
-rw-r--r--Documentation/video4linux/CARDLIST.bttv6
-rw-r--r--Documentation/video4linux/CARDLIST.cx238854
-rw-r--r--Documentation/video4linux/CARDLIST.cx881
-rw-r--r--Documentation/video4linux/CARDLIST.em28xx9
-rw-r--r--Documentation/video4linux/CARDLIST.saa71342
-rw-r--r--Documentation/video4linux/Zoran3
-rw-r--r--Documentation/video4linux/bttv/Insmod-options10
-rw-r--r--Documentation/video4linux/bttv/README4
-rw-r--r--Documentation/video4linux/cx2341x/README.hm124
-rw-r--r--Documentation/video4linux/gspca.txt4
-rw-r--r--Documentation/video4linux/si470x.txt11
-rw-r--r--Documentation/video4linux/v4l2-framework.txt187
-rw-r--r--Documentation/video4linux/v4lgrab.c4
-rw-r--r--Documentation/video4linux/zr364xx.txt1
-rw-r--r--Documentation/vm/kmemtrace.txt126
-rw-r--r--Documentation/vm/numa_memory_policy.txt3
-rw-r--r--Documentation/vm/page_migration3
-rw-r--r--Documentation/x86/earlyprintk.txt101
-rw-r--r--Documentation/x86/x86_64/fake-numa-for-cpusets5
104 files changed, 7388 insertions, 3248 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 2a39aeba1464..d05737aaa84b 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -86,6 +86,8 @@ cachetlb.txt
86 - describes the cache/TLB flushing interfaces Linux uses. 86 - describes the cache/TLB flushing interfaces Linux uses.
87cdrom/ 87cdrom/
88 - directory with information on the CD-ROM drivers that Linux has. 88 - directory with information on the CD-ROM drivers that Linux has.
89cgroups/
90 - cgroups features, including cpusets and memory controller.
89connector/ 91connector/
90 - docs on the netlink based userspace<->kernel space communication mod. 92 - docs on the netlink based userspace<->kernel space communication mod.
91console/ 93console/
@@ -98,8 +100,6 @@ cpu-load.txt
98 - document describing how CPU load statistics are collected. 100 - document describing how CPU load statistics are collected.
99cpuidle/ 101cpuidle/
100 - info on CPU_IDLE, CPU idle state management subsystem. 102 - info on CPU_IDLE, CPU idle state management subsystem.
101cpusets.txt
102 - documents the cpusets feature; assign CPUs and Mem to a set of tasks.
103cputopology.txt 103cputopology.txt
104 - documentation on how CPU topology info is exported via sysfs. 104 - documentation on how CPU topology info is exported via sysfs.
105cris/ 105cris/
diff --git a/Documentation/ABI/testing/debugfs-kmemtrace b/Documentation/ABI/testing/debugfs-kmemtrace
new file mode 100644
index 000000000000..5e6a92a02d85
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-kmemtrace
@@ -0,0 +1,71 @@
1What: /sys/kernel/debug/kmemtrace/
2Date: July 2008
3Contact: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
4Description:
5
6In kmemtrace-enabled kernels, the following files are created:
7
8/sys/kernel/debug/kmemtrace/
9 cpu<n> (0400) Per-CPU tracing data, see below. (binary)
10 total_overruns (0400) Total number of bytes which were dropped from
11 cpu<n> files because of full buffer condition,
12 non-binary. (text)
13 abi_version (0400) Kernel's kmemtrace ABI version. (text)
14
15Each per-CPU file should be read according to the relay interface. That is,
16the reader should set affinity to that specific CPU and, as currently done by
17the userspace application (though there are other methods), use poll() with
18an infinite timeout before every read(). Otherwise, erroneous data may be
19read. The binary data has the following _core_ format:
20
21 Event ID (1 byte) Unsigned integer, one of:
22 0 - represents an allocation (KMEMTRACE_EVENT_ALLOC)
23 1 - represents a freeing of previously allocated memory
24 (KMEMTRACE_EVENT_FREE)
25 Type ID (1 byte) Unsigned integer, one of:
26 0 - this is a kmalloc() / kfree()
27 1 - this is a kmem_cache_alloc() / kmem_cache_free()
28 2 - this is a __get_free_pages() et al.
29 Event size (2 bytes) Unsigned integer representing the
30 size of this event. Used to extend
31 kmemtrace. Discard the bytes you
32 don't know about.
33 Sequence number (4 bytes) Signed integer used to reorder data
34 logged on SMP machines. Wraparound
35 must be taken into account, although
36 it is unlikely.
37 Caller address (8 bytes) Return address to the caller.
38 Pointer to mem (8 bytes) Pointer to target memory area. Can be
39 NULL, but not all such calls might be
40 recorded.
41
42In case of KMEMTRACE_EVENT_ALLOC events, the next fields follow:
43
44 Requested bytes (8 bytes) Total number of requested bytes,
45 unsigned, must not be zero.
46 Allocated bytes (8 bytes) Total number of actually allocated
47 bytes, unsigned, must not be lower
48 than requested bytes.
49 Requested flags (4 bytes) GFP flags supplied by the caller.
50 Target CPU (4 bytes) Signed integer, valid for event id 1.
51 If equal to -1, target CPU is the same
52 as origin CPU, but the reverse might
53 not be true.
54
55The data is made available in the same endianness the machine has.
56
57Other event ids and type ids may be defined and added. Other fields may be
58added by increasing event size, but see below for details.
59Every modification to the ABI, including new id definitions, is followed
60by bumping the ABI version by one.
61
62Adding new data to the packet (features) is done at the end of the mandatory
63data:
64 Feature size (2 bytes)
65 Feature ID (1 byte)
66 Feature data (Feature size - 3 bytes)
67
68
69Users:
70 kmemtrace-user - git://repo.or.cz/kmemtrace-user.git
71
diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index e638e15a8895..97ad190e13af 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -41,6 +41,49 @@ Description:
41 for the device and attempt to bind to it. For example: 41 for the device and attempt to bind to it. For example:
42 # echo "8086 10f5" > /sys/bus/pci/drivers/foo/new_id 42 # echo "8086 10f5" > /sys/bus/pci/drivers/foo/new_id
43 43
44What: /sys/bus/pci/drivers/.../remove_id
45Date: February 2009
46Contact: Chris Wright <chrisw@sous-sol.org>
47Description:
48 Writing a device ID to this file will remove an ID
49 that was dynamically added via the new_id sysfs entry.
50 The format for the device ID is:
51 VVVV DDDD SVVV SDDD CCCC MMMM. That is Vendor ID, Device
52 ID, Subsystem Vendor ID, Subsystem Device ID, Class,
53 and Class Mask. The Vendor ID and Device ID fields are
54 required, the rest are optional. After successfully
55 removing an ID, the driver will no longer support the
56 device. This is useful to ensure auto probing won't
57 match the driver to the device. For example:
58 # echo "8086 10f5" > /sys/bus/pci/drivers/foo/remove_id
59
60What: /sys/bus/pci/rescan
61Date: January 2009
62Contact: Linux PCI developers <linux-pci@vger.kernel.org>
63Description:
64 Writing a non-zero value to this attribute will
65 force a rescan of all PCI buses in the system, and
66 re-discover previously removed devices.
67 Depends on CONFIG_HOTPLUG.
68
69What: /sys/bus/pci/devices/.../remove
70Date: January 2009
71Contact: Linux PCI developers <linux-pci@vger.kernel.org>
72Description:
73 Writing a non-zero value to this attribute will
74 hot-remove the PCI device and any of its children.
75 Depends on CONFIG_HOTPLUG.
76
77What: /sys/bus/pci/devices/.../rescan
78Date: January 2009
79Contact: Linux PCI developers <linux-pci@vger.kernel.org>
80Description:
81 Writing a non-zero value to this attribute will
82 force a rescan of the device's parent bus and all
83 child buses, and re-discover devices removed earlier
84 from this part of the device tree.
85 Depends on CONFIG_HOTPLUG.
86
44What: /sys/bus/pci/devices/.../vpd 87What: /sys/bus/pci/devices/.../vpd
45Date: February 2008 88Date: February 2008
46Contact: Ben Hutchings <bhutchings@solarflare.com> 89Contact: Ben Hutchings <bhutchings@solarflare.com>
@@ -52,3 +95,30 @@ Description:
52 that some devices may have malformatted data. If the 95 that some devices may have malformatted data. If the
53 underlying VPD has a writable section then the 96 underlying VPD has a writable section then the
54 corresponding section of this file will be writable. 97 corresponding section of this file will be writable.
98
99What: /sys/bus/pci/devices/.../virtfnN
100Date: March 2009
101Contact: Yu Zhao <yu.zhao@intel.com>
102Description:
103 This symbolic link appears when hardware supports the SR-IOV
104 capability and the Physical Function driver has enabled it.
105 The symbolic link points to the PCI device sysfs entry of the
106 Virtual Function whose index is N (0...MaxVFs-1).
107
108What: /sys/bus/pci/devices/.../dep_link
109Date: March 2009
110Contact: Yu Zhao <yu.zhao@intel.com>
111Description:
112 This symbolic link appears when hardware supports the SR-IOV
113 capability and the Physical Function driver has enabled it,
114 and this device has vendor specific dependencies with others.
115 The symbolic link points to the PCI device sysfs entry of
116 Physical Function this device depends on.
117
118What: /sys/bus/pci/devices/.../physfn
119Date: March 2009
120Contact: Yu Zhao <yu.zhao@intel.com>
121Description:
122 This symbolic link appears when a device is a Virtual Function.
123 The symbolic link points to the PCI device sysfs entry of the
124 Physical Function this device associates with.
diff --git a/Documentation/ABI/testing/sysfs-class-regulator b/Documentation/ABI/testing/sysfs-class-regulator
index 873ef1fc1569..e091fa873792 100644
--- a/Documentation/ABI/testing/sysfs-class-regulator
+++ b/Documentation/ABI/testing/sysfs-class-regulator
@@ -4,8 +4,8 @@ KernelVersion: 2.6.26
4Contact: Liam Girdwood <lrg@slimlogic.co.uk> 4Contact: Liam Girdwood <lrg@slimlogic.co.uk>
5Description: 5Description:
6 Some regulator directories will contain a field called 6 Some regulator directories will contain a field called
7 state. This reports the regulator enable status, for 7 state. This reports the regulator enable control, for
8 regulators which can report that value. 8 regulators which can report that input value.
9 9
10 This will be one of the following strings: 10 This will be one of the following strings:
11 11
@@ -14,16 +14,54 @@ Description:
14 'unknown' 14 'unknown'
15 15
16 'enabled' means the regulator output is ON and is supplying 16 'enabled' means the regulator output is ON and is supplying
17 power to the system. 17 power to the system (assuming no error prevents it).
18 18
19 'disabled' means the regulator output is OFF and is not 19 'disabled' means the regulator output is OFF and is not
20 supplying power to the system.. 20 supplying power to the system (unless some non-Linux
21 control has enabled it).
21 22
22 'unknown' means software cannot determine the state, or 23 'unknown' means software cannot determine the state, or
23 the reported state is invalid. 24 the reported state is invalid.
24 25
25 NOTE: this field can be used in conjunction with microvolts 26 NOTE: this field can be used in conjunction with microvolts
26 and microamps to determine regulator output levels. 27 or microamps to determine configured regulator output levels.
28
29
30What: /sys/class/regulator/.../status
31Description:
32 Some regulator directories will contain a field called
33 "status". This reports the current regulator status, for
34 regulators which can report that output value.
35
36 This will be one of the following strings:
37
38 off
39 on
40 error
41 fast
42 normal
43 idle
44 standby
45
46 "off" means the regulator is not supplying power to the
47 system.
48
49 "on" means the regulator is supplying power to the system,
50 and the regulator can't report a detailed operation mode.
51
52 "error" indicates an out-of-regulation status such as being
53 disabled due to thermal shutdown, or voltage being unstable
54 because of problems with the input power supply.
55
56 "fast", "normal", "idle", and "standby" are all detailed
57 regulator operation modes (described elsewhere). They
58 imply "on", but provide more detail.
59
60 Note that regulator status is a function of many inputs,
61 not limited to control inputs from Linux. For example,
62 the actual load presented may trigger "error" status; or
63 a regulator may be enabled by another user, even though
64 Linux did not enable it.
27 65
28 66
29What: /sys/class/regulator/.../type 67What: /sys/class/regulator/.../type
@@ -58,7 +96,7 @@ Description:
58 Some regulator directories will contain a field called 96 Some regulator directories will contain a field called
59 microvolts. This holds the regulator output voltage setting 97 microvolts. This holds the regulator output voltage setting
60 measured in microvolts (i.e. E-6 Volts), for regulators 98 measured in microvolts (i.e. E-6 Volts), for regulators
61 which can report that voltage. 99 which can report the control input for voltage.
62 100
63 NOTE: This value should not be used to determine the regulator 101 NOTE: This value should not be used to determine the regulator
64 output voltage level as this value is the same regardless of 102 output voltage level as this value is the same regardless of
@@ -73,7 +111,7 @@ Description:
73 Some regulator directories will contain a field called 111 Some regulator directories will contain a field called
74 microamps. This holds the regulator output current limit 112 microamps. This holds the regulator output current limit
75 setting measured in microamps (i.e. E-6 Amps), for regulators 113 setting measured in microamps (i.e. E-6 Amps), for regulators
76 which can report that current. 114 which can report the control input for a current limit.
77 115
78 NOTE: This value should not be used to determine the regulator 116 NOTE: This value should not be used to determine the regulator
79 output current level as this value is the same regardless of 117 output current level as this value is the same regardless of
@@ -87,7 +125,7 @@ Contact: Liam Girdwood <lrg@slimlogic.co.uk>
87Description: 125Description:
88 Some regulator directories will contain a field called 126 Some regulator directories will contain a field called
89 opmode. This holds the current regulator operating mode, 127 opmode. This holds the current regulator operating mode,
90 for regulators which can report it. 128 for regulators which can report that control input value.
91 129
92 The opmode value can be one of the following strings: 130 The opmode value can be one of the following strings:
93 131
@@ -101,7 +139,8 @@ Description:
101 139
102 NOTE: This value should not be used to determine the regulator 140 NOTE: This value should not be used to determine the regulator
103 output operating mode as this value is the same regardless of 141 output operating mode as this value is the same regardless of
104 whether the regulator is enabled or disabled. 142 whether the regulator is enabled or disabled. A "status"
143 attribute may be available to determine the actual mode.
105 144
106 145
107What: /sys/class/regulator/.../min_microvolts 146What: /sys/class/regulator/.../min_microvolts
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
new file mode 100644
index 000000000000..4e79074de282
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -0,0 +1,81 @@
1What: /sys/fs/ext4/<disk>/mb_stats
2Date: March 2008
3Contact: "Theodore Ts'o" <tytso@mit.edu>
4Description:
5 Controls whether the multiblock allocator should
6 collect statistics, which are shown during the unmount.
7 1 means to collect statistics, 0 means not to collect
8 statistics
9
10What: /sys/fs/ext4/<disk>/mb_group_prealloc
11Date: March 2008
12Contact: "Theodore Ts'o" <tytso@mit.edu>
13Description:
14 The multiblock allocator will round up allocation
15 requests to a multiple of this tuning parameter if the
16 stripe size is not set in the ext4 superblock
17
18What: /sys/fs/ext4/<disk>/mb_max_to_scan
19Date: March 2008
20Contact: "Theodore Ts'o" <tytso@mit.edu>
21Description:
22 The maximum number of extents the multiblock allocator
23 will search to find the best extent
24
25What: /sys/fs/ext4/<disk>/mb_min_to_scan
26Date: March 2008
27Contact: "Theodore Ts'o" <tytso@mit.edu>
28Description:
29 The minimum number of extents the multiblock allocator
30 will search to find the best extent
31
32What: /sys/fs/ext4/<disk>/mb_order2_req
33Date: March 2008
34Contact: "Theodore Ts'o" <tytso@mit.edu>
35Description:
36 Tuning parameter which controls the minimum size for
37 requests (as a power of 2) where the buddy cache is
38 used
39
40What: /sys/fs/ext4/<disk>/mb_stream_req
41Date: March 2008
42Contact: "Theodore Ts'o" <tytso@mit.edu>
43Description:
44 Files which have fewer blocks than this tunable
45 parameter will have their blocks allocated out of a
46 block group specific preallocation pool, so that small
47 files are packed closely together. Each large file
48 will have its blocks allocated out of its own unique
49 preallocation pool.
50
51What: /sys/fs/ext4/<disk>/inode_readahead
52Date: March 2008
53Contact: "Theodore Ts'o" <tytso@mit.edu>
54Description:
55 Tuning parameter which controls the maximum number of
56 inode table blocks that ext4's inode table readahead
57 algorithm will pre-read into the buffer cache
58
59What: /sys/fs/ext4/<disk>/delayed_allocation_blocks
60Date: March 2008
61Contact: "Theodore Ts'o" <tytso@mit.edu>
62Description:
63 This file is read-only and shows the number of blocks
64 that are dirty in the page cache, but which do not
65 have their location in the filesystem allocated yet.
66
67What: /sys/fs/ext4/<disk>/lifetime_write_kbytes
68Date: March 2008
69Contact: "Theodore Ts'o" <tytso@mit.edu>
70Description:
71 This file is read-only and shows the number of kilobytes
72 of data that have been written to this filesystem since it was
73 created.
74
75What: /sys/fs/ext4/<disk>/session_write_kbytes
76Date: March 2008
77Contact: "Theodore Ts'o" <tytso@mit.edu>
78Description:
79 This file is read-only and shows the number of
80 kilobytes of data that have been written to this
81 filesystem since it was mounted.
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 2a3fcc55e981..d9aa43d78bcc 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -609,3 +609,109 @@ size is the size (and should be a page-sized multiple).
609The return value will be either a pointer to the processor virtual 609The return value will be either a pointer to the processor virtual
610address of the memory, or an error (via PTR_ERR()) if any part of the 610address of the memory, or an error (via PTR_ERR()) if any part of the
611region is occupied. 611region is occupied.
612
613Part III - Debug drivers use of the DMA-API
614-------------------------------------------
615
616The DMA-API as described above has some constraints. DMA addresses must be
617released with the corresponding function with the same size for example. With
618the advent of hardware IOMMUs it becomes more and more important that drivers
619do not violate those constraints. In the worst case such a violation can
620result in data corruption up to destroyed filesystems.
621
622To debug drivers and find bugs in the usage of the DMA-API, checking code can
623be compiled into the kernel which will tell the developer about those
624violations. If your architecture supports it you can select the "Enable
625debugging of DMA-API usage" option in your kernel configuration. Enabling this
626option has a performance impact. Do not enable it in production kernels.
627
628If you boot the resulting kernel, it will contain code which does some bookkeeping
629about what DMA memory was allocated for which device. If this code detects an
630error it prints a warning message with some details into your kernel log. An
631example warning message may look like this:
632
633------------[ cut here ]------------
634WARNING: at /data2/repos/linux-2.6-iommu/lib/dma-debug.c:448
635 check_unmap+0x203/0x490()
636Hardware name:
637forcedeth 0000:00:08.0: DMA-API: device driver frees DMA memory with wrong
638 function [device address=0x00000000640444be] [size=66 bytes] [mapped as
639single] [unmapped as page]
640Modules linked in: nfsd exportfs bridge stp llc r8169
641Pid: 0, comm: swapper Tainted: G W 2.6.28-dmatest-09289-g8bb99c0 #1
642Call Trace:
643 <IRQ> [<ffffffff80240b22>] warn_slowpath+0xf2/0x130
644 [<ffffffff80647b70>] _spin_unlock+0x10/0x30
645 [<ffffffff80537e75>] usb_hcd_link_urb_to_ep+0x75/0xc0
646 [<ffffffff80647c22>] _spin_unlock_irqrestore+0x12/0x40
647 [<ffffffff8055347f>] ohci_urb_enqueue+0x19f/0x7c0
648 [<ffffffff80252f96>] queue_work+0x56/0x60
649 [<ffffffff80237e10>] enqueue_task_fair+0x20/0x50
650 [<ffffffff80539279>] usb_hcd_submit_urb+0x379/0xbc0
651 [<ffffffff803b78c3>] cpumask_next_and+0x23/0x40
652 [<ffffffff80235177>] find_busiest_group+0x207/0x8a0
653 [<ffffffff8064784f>] _spin_lock_irqsave+0x1f/0x50
654 [<ffffffff803c7ea3>] check_unmap+0x203/0x490
655 [<ffffffff803c8259>] debug_dma_unmap_page+0x49/0x50
656 [<ffffffff80485f26>] nv_tx_done_optimized+0xc6/0x2c0
657 [<ffffffff80486c13>] nv_nic_irq_optimized+0x73/0x2b0
658 [<ffffffff8026df84>] handle_IRQ_event+0x34/0x70
659 [<ffffffff8026ffe9>] handle_edge_irq+0xc9/0x150
660 [<ffffffff8020e3ab>] do_IRQ+0xcb/0x1c0
661 [<ffffffff8020c093>] ret_from_intr+0x0/0xa
662 <EOI> <4>---[ end trace f6435a98e2a38c0e ]---
663
664The driver developer can find the driver and the device including a stacktrace
665of the DMA-API call which caused this warning.
666
667Per default only the first error will result in a warning message. All other
668errors will only be silently counted. This limitation exists to prevent the code
669from flooding your kernel log. To support debugging a device driver this can
670be disabled via debugfs. See the debugfs interface documentation below for
671details.
672
673The debugfs directory for the DMA-API debugging code is called dma-api/. In
674this directory the following files can currently be found:
675
676 dma-api/all_errors This file contains a numeric value. If this
677 value is not equal to zero the debugging code
678 will print a warning for every error it finds
679 into the kernel log. Be careful with this
680 option. It can easily flood your logs.
681
682 dma-api/disabled This read-only file contains the character 'Y'
683 if the debugging code is disabled. This can
684 happen when it runs out of memory or if it was
685 disabled at boot time
686
687 dma-api/error_count This file is read-only and shows the total
688 numbers of errors found.
689
690 dma-api/num_errors The number in this file shows how many
691 warnings will be printed to the kernel log
692 before it stops. This number is initialized to
693 one at system boot and can be set by writing into
694 this file
695
696 dma-api/min_free_entries
697 This read-only file can be read to get the
698 minimum number of free dma_debug_entries the
699 allocator has ever seen. If this value goes
700 down to zero the code will disable itself
701 because it is no longer reliable.
702
703 dma-api/num_free_entries
704 The current number of free dma_debug_entries
705 in the allocator.
706
707If you have this code compiled into your kernel it will be enabled by default.
708If you want to boot without the bookkeeping anyway you can provide
709'dma_debug=off' as a boot parameter. This will disable DMA-API debugging.
710Notice that you can not enable it again at runtime. You have to reboot to do
711so.
712
713When the code disables itself at runtime this is most likely because it ran
714out of dma_debug_entries. These entries are preallocated at boot. The number
715of preallocated entries is defined per architecture. If it is too low for you,
716boot with 'dma_debug_entries=<your_desired_number>' to overwrite the
717architectural default.
diff --git a/Documentation/DocBook/.gitignore b/Documentation/DocBook/.gitignore
index c102c02ecf89..c6def352fe39 100644
--- a/Documentation/DocBook/.gitignore
+++ b/Documentation/DocBook/.gitignore
@@ -4,3 +4,7 @@
4*.html 4*.html
5*.9.gz 5*.9.gz
6*.9 6*.9
7*.aux
8*.dvi
9*.log
10*.out
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index bc962cda6504..d6ac5d61820e 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -199,6 +199,7 @@ X!Edrivers/pci/hotplug.c
199--> 199-->
200!Edrivers/pci/probe.c 200!Edrivers/pci/probe.c
201!Edrivers/pci/rom.c 201!Edrivers/pci/rom.c
202!Edrivers/pci/iov.c
202 </sect1> 203 </sect1>
203 <sect1><title>PCI Hotplug Support Library</title> 204 <sect1><title>PCI Hotplug Support Library</title>
204!Edrivers/pci/hotplug/pci_hotplug_core.c 205!Edrivers/pci/hotplug/pci_hotplug_core.c
@@ -258,7 +259,7 @@ X!Earch/x86/kernel/mca_32.c
258!Eblock/blk-tag.c 259!Eblock/blk-tag.c
259!Iblock/blk-tag.c 260!Iblock/blk-tag.c
260!Eblock/blk-integrity.c 261!Eblock/blk-integrity.c
261!Iblock/blktrace.c 262!Ikernel/trace/blktrace.c
262!Iblock/genhd.c 263!Iblock/genhd.c
263!Eblock/genhd.c 264!Eblock/genhd.c
264 </chapter> 265 </chapter>
diff --git a/Documentation/DocBook/procfs_example.c b/Documentation/DocBook/procfs_example.c
index 8c6396e4bf31..a5b11793b1e0 100644
--- a/Documentation/DocBook/procfs_example.c
+++ b/Documentation/DocBook/procfs_example.c
@@ -117,9 +117,6 @@ static int __init init_procfs_example(void)
117 rv = -ENOMEM; 117 rv = -ENOMEM;
118 goto out; 118 goto out;
119 } 119 }
120
121 example_dir->owner = THIS_MODULE;
122
123 /* create jiffies using convenience function */ 120 /* create jiffies using convenience function */
124 jiffies_file = create_proc_read_entry("jiffies", 121 jiffies_file = create_proc_read_entry("jiffies",
125 0444, example_dir, 122 0444, example_dir,
@@ -130,8 +127,6 @@ static int __init init_procfs_example(void)
130 goto no_jiffies; 127 goto no_jiffies;
131 } 128 }
132 129
133 jiffies_file->owner = THIS_MODULE;
134
135 /* create foo and bar files using same callback 130 /* create foo and bar files using same callback
136 * functions 131 * functions
137 */ 132 */
@@ -146,7 +141,6 @@ static int __init init_procfs_example(void)
146 foo_file->data = &foo_data; 141 foo_file->data = &foo_data;
147 foo_file->read_proc = proc_read_foobar; 142 foo_file->read_proc = proc_read_foobar;
148 foo_file->write_proc = proc_write_foobar; 143 foo_file->write_proc = proc_write_foobar;
149 foo_file->owner = THIS_MODULE;
150 144
151 bar_file = create_proc_entry("bar", 0644, example_dir); 145 bar_file = create_proc_entry("bar", 0644, example_dir);
152 if(bar_file == NULL) { 146 if(bar_file == NULL) {
@@ -159,7 +153,6 @@ static int __init init_procfs_example(void)
159 bar_file->data = &bar_data; 153 bar_file->data = &bar_data;
160 bar_file->read_proc = proc_read_foobar; 154 bar_file->read_proc = proc_read_foobar;
161 bar_file->write_proc = proc_write_foobar; 155 bar_file->write_proc = proc_write_foobar;
162 bar_file->owner = THIS_MODULE;
163 156
164 /* create symlink */ 157 /* create symlink */
165 symlink = proc_symlink("jiffies_too", example_dir, 158 symlink = proc_symlink("jiffies_too", example_dir,
@@ -169,8 +162,6 @@ static int __init init_procfs_example(void)
169 goto no_symlink; 162 goto no_symlink;
170 } 163 }
171 164
172 symlink->owner = THIS_MODULE;
173
174 /* everything OK */ 165 /* everything OK */
175 printk(KERN_INFO "%s %s initialised\n", 166 printk(KERN_INFO "%s %s initialised\n",
176 MODULE_NAME, MODULE_VERS); 167 MODULE_NAME, MODULE_VERS);
diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt
index 256defd7e174..dcf7acc720e1 100644
--- a/Documentation/PCI/MSI-HOWTO.txt
+++ b/Documentation/PCI/MSI-HOWTO.txt
@@ -4,506 +4,356 @@
4 Revised Feb 12, 2004 by Martine Silbermann 4 Revised Feb 12, 2004 by Martine Silbermann
5 email: Martine.Silbermann@hp.com 5 email: Martine.Silbermann@hp.com
6 Revised Jun 25, 2004 by Tom L Nguyen 6 Revised Jun 25, 2004 by Tom L Nguyen
7 Revised Jul 9, 2008 by Matthew Wilcox <willy@linux.intel.com>
8 Copyright 2003, 2008 Intel Corporation
7 9
81. About this guide 101. About this guide
9 11
10This guide describes the basics of Message Signaled Interrupts (MSI), 12This guide describes the basics of Message Signaled Interrupts (MSIs),
11the advantages of using MSI over traditional interrupt mechanisms, 13the advantages of using MSI over traditional interrupt mechanisms, how
12and how to enable your driver to use MSI or MSI-X. Also included is 14to change your driver to use MSI or MSI-X and some basic diagnostics to
13a Frequently Asked Questions (FAQ) section. 15try if a device doesn't support MSIs.
14
151.1 Terminology
16
17PCI devices can be single-function or multi-function. In either case,
18when this text talks about enabling or disabling MSI on a "device
19function," it is referring to one specific PCI device and function and
20not to all functions on a PCI device (unless the PCI device has only
21one function).
22
232. Copyright 2003 Intel Corporation
24
253. What is MSI/MSI-X?
26
27Message Signaled Interrupt (MSI), as described in the PCI Local Bus
28Specification Revision 2.3 or later, is an optional feature, and a
29required feature for PCI Express devices. MSI enables a device function
30to request service by sending an Inbound Memory Write on its PCI bus to
31the FSB as a Message Signal Interrupt transaction. Because MSI is
32generated in the form of a Memory Write, all transaction conditions,
33such as a Retry, Master-Abort, Target-Abort or normal completion, are
34supported.
35
36A PCI device that supports MSI must also support pin IRQ assertion
37interrupt mechanism to provide backward compatibility for systems that
38do not support MSI. In systems which support MSI, the bus driver is
39responsible for initializing the message address and message data of
40the device function's MSI/MSI-X capability structure during device
41initial configuration.
42
43An MSI capable device function indicates MSI support by implementing
44the MSI/MSI-X capability structure in its PCI capability list. The
45device function may implement both the MSI capability structure and
46the MSI-X capability structure; however, the bus driver should not
47enable both.
48
49The MSI capability structure contains Message Control register,
50Message Address register and Message Data register. These registers
51provide the bus driver control over MSI. The Message Control register
52indicates the MSI capability supported by the device. The Message
53Address register specifies the target address and the Message Data
54register specifies the characteristics of the message. To request
55service, the device function writes the content of the Message Data
56register to the target address. The device and its software driver
57are prohibited from writing to these registers.
58
59The MSI-X capability structure is an optional extension to MSI. It
60uses an independent and separate capability structure. There are
61some key advantages to implementing the MSI-X capability structure
62over the MSI capability structure as described below.
63
64 - Support a larger maximum number of vectors per function.
65
66 - Provide the ability for system software to configure
67 each vector with an independent message address and message
68 data, specified by a table that resides in Memory Space.
69
70 - MSI and MSI-X both support per-vector masking. Per-vector
71 masking is an optional extension of MSI but a required
72 feature for MSI-X. Per-vector masking provides the kernel the
73 ability to mask/unmask a single MSI while running its
74 interrupt service routine. If per-vector masking is
75 not supported, then the device driver should provide the
76 hardware/software synchronization to ensure that the device
77 generates MSI when the driver wants it to do so.
78
794. Why use MSI?
80
81As a benefit to the simplification of board design, MSI allows board
82designers to remove out-of-band interrupt routing. MSI is another
83step towards a legacy-free environment.
84
85Due to increasing pressure on chipset and processor packages to
86reduce pin count, the need for interrupt pins is expected to
87diminish over time. Devices, due to pin constraints, may implement
88messages to increase performance.
89
90PCI Express endpoints use INTx emulation (in-band messages) instead
91of IRQ pin assertion. Using INTx emulation requires interrupt
92sharing among devices connected to the same node (PCI bridge) while
93MSI is unique (non-shared) and does not require BIOS configuration
94support. As a result, the PCI Express technology requires MSI
95support for better interrupt performance.
96
97Using MSI enables the device functions to support two or more
98vectors, which can be configured to target different CPUs to
99increase scalability.
100
1015. Configuring a driver to use MSI/MSI-X
102
103By default, the kernel will not enable MSI/MSI-X on all devices that
104support this capability. The CONFIG_PCI_MSI kernel option
105must be selected to enable MSI/MSI-X support.
106
1075.1 Including MSI/MSI-X support into the kernel
108
109To allow MSI/MSI-X capable device drivers to selectively enable
110MSI/MSI-X (using pci_enable_msi()/pci_enable_msix() as described
111below), the VECTOR based scheme needs to be enabled by setting
112CONFIG_PCI_MSI during kernel config.
113
114Since the target of the inbound message is the local APIC, providing
115CONFIG_X86_LOCAL_APIC must be enabled as well as CONFIG_PCI_MSI.
116
1175.2 Configuring for MSI support
118
119Due to the non-contiguous fashion in vector assignment of the
120existing Linux kernel, this version does not support multiple
121messages regardless of whether a device function is capable of supporting
122more than one vector. To enable MSI on a device function's MSI
123capability structure requires a device driver to call the function
124pci_enable_msi() explicitly.
125
1265.2.1 API pci_enable_msi
127 16
128int pci_enable_msi(struct pci_dev *dev)
129 17
130With this new API, a device driver that wants to have MSI 182. What are MSIs?
131enabled on its device function must call this API to enable MSI.
132A successful call will initialize the MSI capability structure
133with ONE vector, regardless of whether a device function is
134capable of supporting multiple messages. This vector replaces the
135pre-assigned dev->irq with a new MSI vector. To avoid a conflict
136of the new assigned vector with existing pre-assigned vector requires
137a device driver to call this API before calling request_irq().
138 19
1395.2.2 API pci_disable_msi 20A Message Signaled Interrupt is a write from the device to a special
21address which causes an interrupt to be received by the CPU.
140 22
141void pci_disable_msi(struct pci_dev *dev) 23The MSI capability was first specified in PCI 2.2 and was later enhanced
24in PCI 3.0 to allow each interrupt to be masked individually. The MSI-X
25capability was also introduced with PCI 3.0. It supports more interrupts
26per device than MSI and allows interrupts to be independently configured.
142 27
143This API should always be used to undo the effect of pci_enable_msi() 28Devices may support both MSI and MSI-X, but only one can be enabled at
144when a device driver is unloading. This API restores dev->irq with 29a time.
145the pre-assigned IOAPIC vector and switches a device's interrupt
146mode to PCI pin-irq assertion/INTx emulation mode.
147
148Note that a device driver should always call free_irq() on the MSI vector
149that it has done request_irq() on before calling this API. Failure to do
150so results in a BUG_ON() and a device will be left with MSI enabled and
151leaks its vector.
152
1535.2.3 MSI mode vs. legacy mode diagram
154
155The below diagram shows the events which switch the interrupt
156mode on the MSI-capable device function between MSI mode and
157PIN-IRQ assertion mode.
158
159 ------------ pci_enable_msi ------------------------
160 | | <=============== | |
161 | MSI MODE | | PIN-IRQ ASSERTION MODE |
162 | | ===============> | |
163 ------------ pci_disable_msi ------------------------
164
165
166Figure 1. MSI Mode vs. Legacy Mode
167
168In Figure 1, a device operates by default in legacy mode. Legacy
169in this context means PCI pin-irq assertion or PCI-Express INTx
170emulation. A successful MSI request (using pci_enable_msi()) switches
171a device's interrupt mode to MSI mode. A pre-assigned IOAPIC vector
172stored in dev->irq will be saved by the PCI subsystem and a new
173assigned MSI vector will replace dev->irq.
174
175To return back to its default mode, a device driver should always call
176pci_disable_msi() to undo the effect of pci_enable_msi(). Note that a
177device driver should always call free_irq() on the MSI vector it has
178done request_irq() on before calling pci_disable_msi(). Failure to do
179so results in a BUG_ON() and a device will be left with MSI enabled and
180leaks its vector. Otherwise, the PCI subsystem restores a device's
181dev->irq with a pre-assigned IOAPIC vector and marks the released
182MSI vector as unused.
183
184Once being marked as unused, there is no guarantee that the PCI
185subsystem will reserve this MSI vector for a device. Depending on
186the availability of current PCI vector resources and the number of
187MSI/MSI-X requests from other drivers, this MSI may be re-assigned.
188
189For the case where the PCI subsystem re-assigns this MSI vector to
190another driver, a request to switch back to MSI mode may result
191in being assigned a different MSI vector or a failure if no more
192vectors are available.
193
1945.3 Configuring for MSI-X support
195
196Due to the ability of the system software to configure each vector of
197the MSI-X capability structure with an independent message address
198and message data, the non-contiguous fashion in vector assignment of
199the existing Linux kernel has no impact on supporting multiple
200messages on an MSI-X capable device functions. To enable MSI-X on
201a device function's MSI-X capability structure requires its device
202driver to call the function pci_enable_msix() explicitly.
203
204The function pci_enable_msix(), once invoked, enables either
205all or nothing, depending on the current availability of PCI vector
206resources. If the PCI vector resources are available for the number
207of vectors requested by a device driver, this function will configure
208the MSI-X table of the MSI-X capability structure of a device with
209requested messages. To emphasize this reason, for example, a device
210may be capable for supporting the maximum of 32 vectors while its
211software driver usually may request 4 vectors. It is recommended
212that the device driver should call this function once during the
213initialization phase of the device driver.
214
215Unlike the function pci_enable_msi(), the function pci_enable_msix()
216does not replace the pre-assigned IOAPIC dev->irq with a new MSI
217vector because the PCI subsystem writes the 1:1 vector-to-entry mapping
218into the field vector of each element contained in a second argument.
219Note that the pre-assigned IOAPIC dev->irq is valid only if the device
220operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt at
221using dev->irq by the device driver to request for interrupt service
222may result in unpredictable behavior.
223
224For each MSI-X vector granted, a device driver is responsible for calling
225other functions like request_irq(), enable_irq(), etc. to enable
226this vector with its corresponding interrupt service handler. It is
227a device driver's choice to assign all vectors with the same
228interrupt service handler or each vector with a unique interrupt
229service handler.
230
2315.3.1 Handling MMIO address space of MSI-X Table
232
233The PCI 3.0 specification has implementation notes that MMIO address
234space for a device's MSI-X structure should be isolated so that the
235software system can set different pages for controlling accesses to the
236MSI-X structure. The implementation of MSI support requires the PCI
237subsystem, not a device driver, to maintain full control of the MSI-X
238table/MSI-X PBA (Pending Bit Array) and MMIO address space of the MSI-X
239table/MSI-X PBA. A device driver should not access the MMIO address
240space of the MSI-X table/MSI-X PBA.
241
2425.3.2 API pci_enable_msix
243 30
244int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
245 31
246This API enables a device driver to request the PCI subsystem 323. Why use MSIs?
247to enable MSI-X messages on its hardware device. Depending on 33
248the availability of PCI vectors resources, the PCI subsystem enables 34There are three reasons why using MSIs can give an advantage over
249either all or none of the requested vectors. 35traditional pin-based interrupts.
36
37Pin-based PCI interrupts are often shared amongst several devices.
38To support this, the kernel must call each interrupt handler associated
39with an interrupt, which leads to reduced performance for the system as
40a whole. MSIs are never shared, so this problem cannot arise.
41
42When a device writes data to memory, then raises a pin-based interrupt,
43it is possible that the interrupt may arrive before all the data has
44arrived in memory (this becomes more likely with devices behind PCI-PCI
45bridges). In order to ensure that all the data has arrived in memory,
46the interrupt handler must read a register on the device which raised
47the interrupt. PCI transaction ordering rules require that all the data
48arrives in memory before the value can be returned from the register.
49Using MSIs avoids this problem as the interrupt-generating write cannot
50pass the data writes, so by the time the interrupt is raised, the driver
51knows that all the data has arrived in memory.
52
53PCI devices can only support a single pin-based interrupt per function.
54Often drivers have to query the device to find out what event has
55occurred, slowing down interrupt handling for the common case. With
56MSIs, a device can support more interrupts, allowing each interrupt
57to be specialised to a different purpose. One possible design gives
58infrequent conditions (such as errors) their own interrupt which allows
59the driver to handle the normal interrupt handling path more efficiently.
60Other possible designs include giving one interrupt to each packet queue
61in a network card or each port in a storage controller.
62
63
644. How to use MSIs
65
66PCI devices are initialised to use pin-based interrupts. The device
67driver has to set up the device to use MSI or MSI-X. Not all machines
68support MSIs correctly, and for those machines, the APIs described below
69will simply fail and the device will continue to use pin-based interrupts.
70
714.1 Include kernel support for MSIs
72
73To support MSI or MSI-X, the kernel must be built with the CONFIG_PCI_MSI
74option enabled. This option is only available on some architectures,
75and it may depend on some other options also being set. For example,
76on x86, you must also enable X86_UP_APIC or SMP in order to see the
77CONFIG_PCI_MSI option.
78
794.2 Using MSI
80
81Most of the hard work is done for the driver in the PCI layer. It simply
82has to request that the PCI layer set up the MSI capability for this
83device.
84
854.2.1 pci_enable_msi
86
87int pci_enable_msi(struct pci_dev *dev)
88
89A successful call will allocate ONE interrupt to the device, regardless
90of how many MSIs the device supports. The device will be switched from
91pin-based interrupt mode to MSI mode. The dev->irq number is changed
92to a new number which represents the message signaled interrupt.
93This function should be called before the driver calls request_irq()
94since enabling MSIs disables the pin-based IRQ and the driver will not
95receive interrupts on the old interrupt.
96
974.2.2 pci_enable_msi_block
98
99int pci_enable_msi_block(struct pci_dev *dev, int count)
100
101This variation on the above call allows a device driver to request multiple
102MSIs. The MSI specification only allows interrupts to be allocated in
103powers of two, up to a maximum of 2^5 (32).
104
105If this function returns 0, it has succeeded in allocating at least as many
106interrupts as the driver requested (it may have allocated more in order
107to satisfy the power-of-two requirement). In this case, the function
108enables MSI on this device and updates dev->irq to be the lowest of
109the new interrupts assigned to it. The other interrupts assigned to
110the device are in the range dev->irq to dev->irq + count - 1.
111
112If this function returns a negative number, it indicates an error and
113the driver should not attempt to request any more MSI interrupts for
114this device. If this function returns a positive number, it will be
115less than 'count' and indicate the number of interrupts that could have
116been allocated. In neither case will the irq value have been
117updated, nor will the device have been switched into MSI mode.
118
119The device driver must decide what action to take if
120pci_enable_msi_block() returns a value less than the number asked for.
121Some devices can make use of fewer interrupts than the maximum they
122request; in this case the driver should call pci_enable_msi_block()
123again. Note that it is not guaranteed to succeed, even when the
124'count' has been reduced to the value returned from a previous call to
125pci_enable_msi_block(). This is because there are multiple constraints
126on the number of vectors that can be allocated; pci_enable_msi_block()
127will return as soon as it finds any constraint that doesn't allow the
128call to succeed.
129
1304.2.3 pci_disable_msi
131
132void pci_disable_msi(struct pci_dev *dev)
250 133
251Argument 'dev' points to the device (pci_dev) structure. 134This function should be used to undo the effect of pci_enable_msi() or
135pci_enable_msi_block(). Calling it restores dev->irq to the pin-based
136interrupt number and frees the previously allocated message signaled
137interrupt(s). The interrupt may subsequently be assigned to another
138device, so drivers should not cache the value of dev->irq.
252 139
253Argument 'entries' is a pointer to an array of msix_entry structs. 140A device driver must always call free_irq() on the interrupt(s)
254The number of entries is indicated in argument 'nvec'. 141for which it has called request_irq() before calling this function.
255struct msix_entry is defined in /driver/pci/msi.h: 142Failure to do so will result in a BUG_ON(), the device will be left with
143MSI enabled and will leak its vector.
144
1454.3 Using MSI-X
146
147The MSI-X capability is much more flexible than the MSI capability.
148It supports up to 2048 interrupts, each of which can be controlled
149independently. To support this flexibility, drivers must use an array of
150`struct msix_entry':
256 151
257struct msix_entry { 152struct msix_entry {
258 u16 vector; /* kernel uses to write alloc vector */ 153 u16 vector; /* kernel uses to write alloc vector */
259 u16 entry; /* driver uses to specify entry */ 154 u16 entry; /* driver uses to specify entry */
260}; 155};
261 156
262A device driver is responsible for initializing the field 'entry' of 157This allows for the device to use these interrupts in a sparse fashion;
263each element with a unique entry supported by MSI-X table. Otherwise, 158for example it could use interrupts 3 and 1027 and allocate only a
264-EINVAL will be returned as a result. A successful return of zero 159two-element array. The driver is expected to fill in the 'entry' value
265indicates the PCI subsystem completed initializing each of the requested 160in each element of the array to indicate which entries it wants the kernel
266entries of the MSI-X table with message address and message data. 161to assign interrupts for. It is invalid to fill in two entries with the
267Last but not least, the PCI subsystem will write the 1:1 162same number.
268vector-to-entry mapping into the field 'vector' of each element. A 163
269device driver is responsible for keeping track of allocated MSI-X 1644.3.1 pci_enable_msix
270vectors in its internal data structure. 165
271 166int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
272A return of zero indicates that the number of MSI-X vectors was 167
273successfully allocated. A return of greater than zero indicates 168Calling this function asks the PCI subsystem to allocate 'nvec' MSIs.
274MSI-X vector shortage. Or a return of less than zero indicates 169The 'entries' argument is a pointer to an array of msix_entry structs
275a failure. This failure may be a result of duplicate entries 170which should be at least 'nvec' entries in size. On success, the
276specified in second argument, or a result of no available vector, 171function will return 0 and the device will have been switched into
277or a result of failing to initialize MSI-X table entries. 172MSI-X interrupt mode. The 'vector' elements in each entry will have
278 173been filled in with the interrupt number. The driver should then call
2795.3.3 API pci_disable_msix 174request_irq() for each 'vector' that it decides to use.
175
176If this function returns a negative number, it indicates an error and
177the driver should not attempt to allocate any more MSI-X interrupts for
178this device. If it returns a positive number, it indicates the maximum
179number of interrupt vectors that could have been allocated. See example
180below.
181
182This function, in contrast with pci_enable_msi(), does not adjust
183dev->irq. The device will not generate interrupts for this interrupt
184number once MSI-X is enabled. The device driver is responsible for
185keeping track of the interrupts assigned to the MSI-X vectors so it can
186free them again later.
187
188Device drivers should normally call this function once per device
189during the initialization phase.
190
191It is ideal if drivers can cope with a variable number of MSI-X interrupts;
192there are many reasons why the platform may not be able to provide the
193exact number a driver asks for.
194
195A request loop to achieve that might look like:
196
197static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec)
198{
199 while (nvec >= FOO_DRIVER_MINIMUM_NVEC) {
200 rc = pci_enable_msix(adapter->pdev,
201 adapter->msix_entries, nvec);
202 if (rc > 0)
203 nvec = rc;
204 else
205 return rc;
206 }
207
208 return -ENOSPC;
209}
210
2114.3.2 pci_disable_msix
280 212
281void pci_disable_msix(struct pci_dev *dev) 213void pci_disable_msix(struct pci_dev *dev)
282 214
283This API should always be used to undo the effect of pci_enable_msix() 215This API should be used to undo the effect of pci_enable_msix(). It frees
284when a device driver is unloading. Note that a device driver should 216the previously allocated message signaled interrupts. The interrupts may
285always call free_irq() on all MSI-X vectors it has done request_irq() 217subsequently be assigned to another device, so drivers should not cache
286on before calling this API. Failure to do so results in a BUG_ON() and 218the value of the 'vector' elements over a call to pci_disable_msix().
287a device will be left with MSI-X enabled and leaks its vectors. 219
288 220A device driver must always call free_irq() on the interrupt(s)
2895.3.4 MSI-X mode vs. legacy mode diagram 221for which it has called request_irq() before calling this function.
290 222Failure to do so will result in a BUG_ON(), the device will be left with
291The below diagram shows the events which switch the interrupt 223MSI-X enabled and will leak its vectors.
292mode on the MSI-X capable device function between MSI-X mode and 224
293PIN-IRQ assertion mode (legacy). 2254.3.3 The MSI-X Table
294 226
295 ------------ pci_enable_msix(,,n) ------------------------ 227The MSI-X capability specifies a BAR and offset within that BAR for the
296 | | <=============== | | 228MSI-X Table. This address is mapped by the PCI subsystem, and should not
297 | MSI-X MODE | | PIN-IRQ ASSERTION MODE | 229be accessed directly by the device driver. If the driver wishes to
298 | | ===============> | | 230mask or unmask an interrupt, it should call disable_irq() / enable_irq().
299 ------------ pci_disable_msix ------------------------ 231
300 2324.4 Handling devices implementing both MSI and MSI-X capabilities
301Figure 2. MSI-X Mode vs. Legacy Mode 233
302 234If a device implements both MSI and MSI-X capabilities, it can
303In Figure 2, a device operates by default in legacy mode. A 235run in either MSI mode or MSI-X mode but not both simultaneously.
304successful MSI-X request (using pci_enable_msix()) switches a 236This is a requirement of the PCI spec, and it is enforced by the
305device's interrupt mode to MSI-X mode. A pre-assigned IOAPIC vector 237PCI layer. Calling pci_enable_msi() when MSI-X is already enabled or
306stored in dev->irq will be saved by the PCI subsystem; however, 238pci_enable_msix() when MSI is already enabled will result in an error.
307unlike MSI mode, the PCI subsystem will not replace dev->irq with 239If a device driver wishes to switch between MSI and MSI-X at runtime,
308assigned MSI-X vector because the PCI subsystem already writes the 1:1 240it must first quiesce the device, then switch it back to pin-interrupt
309vector-to-entry mapping into the field 'vector' of each element 241mode, before calling pci_enable_msi() or pci_enable_msix() and resuming
310specified in second argument. 242operation. This is not expected to be a common operation but may be
311 243useful for debugging or testing during development.
312To return back to its default mode, a device driver should always call 244
313pci_disable_msix() to undo the effect of pci_enable_msix(). Note that 2454.5 Considerations when using MSIs
314a device driver should always call free_irq() on all MSI-X vectors it 246
315has done request_irq() on before calling pci_disable_msix(). Failure 2474.5.1 Choosing between MSI-X and MSI
316to do so results in a BUG_ON() and a device will be left with MSI-X 248
317enabled and leaks its vectors. Otherwise, the PCI subsystem switches a 249If your device supports both MSI-X and MSI capabilities, you should use
318device function's interrupt mode from MSI-X mode to legacy mode and 250the MSI-X facilities in preference to the MSI facilities. As mentioned
319marks all allocated MSI-X vectors as unused. 251above, MSI-X supports any number of interrupts between 1 and 2048.
320 252In contrast, MSI is restricted to a maximum of 32 interrupts (and
321Once being marked as unused, there is no guarantee that the PCI 253must be a power of two). In addition, the MSI interrupt vectors must
322subsystem will reserve these MSI-X vectors for a device. Depending on 254be allocated consecutively, so the system may not be able to allocate
323the availability of current PCI vector resources and the number of 255as many vectors for MSI as it could for MSI-X. On some platforms, MSI
324MSI/MSI-X requests from other drivers, these MSI-X vectors may be 256interrupts must all be targeted at the same set of CPUs whereas MSI-X
325re-assigned. 257interrupts can all be targeted at different CPUs.
326 258
327For the case where the PCI subsystem re-assigned these MSI-X vectors 2594.5.2 Spinlocks
328to other drivers, a request to switch back to MSI-X mode may result 260
329being assigned with another set of MSI-X vectors or a failure if no 261Most device drivers have a per-device spinlock which is taken in the
330more vectors are available. 262interrupt handler. With pin-based interrupts or a single MSI, it is not
331 263necessary to disable interrupts (Linux guarantees the same interrupt will
3325.4 Handling function implementing both MSI and MSI-X capabilities 264not be re-entered). If a device uses multiple interrupts, the driver
333 265must disable interrupts while the lock is held. If the device sends
334For the case where a function implements both MSI and MSI-X 266a different interrupt, the driver will deadlock trying to recursively
335capabilities, the PCI subsystem enables a device to run either in MSI 267acquire the spinlock.
336mode or MSI-X mode but not both. A device driver determines whether it 268
337wants MSI or MSI-X enabled on its hardware device. Once a device 269There are two solutions. The first is to take the lock with
338driver requests for MSI, for example, it is prohibited from requesting 270spin_lock_irqsave() or spin_lock_irq() (see
339MSI-X; in other words, a device driver is not permitted to ping-pong 271Documentation/DocBook/kernel-locking). The second is to specify
340between MSI mode and MSI-X mode during a run-time. 272IRQF_DISABLED to request_irq() so that the kernel runs the entire
341 273interrupt routine with interrupts disabled.
3425.5 Hardware requirements for MSI/MSI-X support 274
343 275If your MSI interrupt routine does not hold the lock for the whole time
344MSI/MSI-X support requires support from both system hardware and 276it is running, the first solution may be best. The second solution is
345individual hardware device functions. 277normally preferred as it avoids making two transitions from interrupt
346 278disabled to enabled and back again.
3475.5.1 Required x86 hardware support 279
348 2804.6 How to tell whether MSI/MSI-X is enabled on a device
349Since the target of MSI address is the local APIC CPU, enabling 281
350MSI/MSI-X support in the Linux kernel is dependent on whether existing 282Using 'lspci -v' (as root) may show some devices with "MSI", "Message
351system hardware supports local APIC. Users should verify that their 283Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities
352system supports local APIC operation by testing that it runs when 284has an 'Enable' flag which will be followed with either "+" (enabled)
353CONFIG_X86_LOCAL_APIC=y. 285or "-" (disabled).
354 286
355In SMP environment, CONFIG_X86_LOCAL_APIC is automatically set; 287
356however, in UP environment, users must manually set 2885. MSI quirks
357CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting 289
358CONFIG_PCI_MSI enables the VECTOR based scheme and the option for 290Several PCI chipsets or devices are known not to support MSIs.
359MSI-capable device drivers to selectively enable MSI/MSI-X. 291The PCI stack provides three ways to disable MSIs:
360 292
361Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI/MSI-X 2931. globally
362vector is allocated new during runtime and MSI/MSI-X support does not 2942. on all devices behind a specific bridge
363depend on BIOS support. This key independency enables MSI/MSI-X 2953. on a single device
364support on future IOxAPIC free platforms. 296
365 2975.1. Disabling MSIs globally
3665.5.2 Device hardware support 298
367 299Some host chipsets simply don't support MSIs properly. If we're
368The hardware device function supports MSI by indicating the 300lucky, the manufacturer knows this and has indicated it in the ACPI
369MSI/MSI-X capability structure on its PCI capability list. By 301FADT table. In this case, Linux will automatically disable MSIs.
370default, this capability structure will not be initialized by 302Some boards don't include this information in the table and so we have
371the kernel to enable MSI during the system boot. In other words, 303to detect them ourselves. The complete list of these is found near the
372the device function is running on its default pin assertion mode. 304quirk_disable_all_msi() function in drivers/pci/quirks.c.
373Note that in many cases the hardware supporting MSI have bugs, 305
374which may result in system hangs. The software driver of specific 306If you have a board which has problems with MSIs, you can pass pci=nomsi
375MSI-capable hardware is responsible for deciding whether to call 307on the kernel command line to disable MSIs on all devices. It would be
376pci_enable_msi or not. A return of zero indicates the kernel 308in your best interests to report the problem to linux-pci@vger.kernel.org
377successfully initialized the MSI/MSI-X capability structure of the 309including a full 'lspci -v' so we can add the quirks to the kernel.
378device function. The device function is now running on MSI/MSI-X mode. 310
379 3115.2. Disabling MSIs below a bridge
3805.6 How to tell whether MSI/MSI-X is enabled on device function 312
381 313Some PCI bridges are not able to route MSIs between busses properly.
382At the driver level, a return of zero from the function call of 314In this case, MSIs must be disabled on all devices behind the bridge.
383pci_enable_msi()/pci_enable_msix() indicates to a device driver that 315
384its device function is initialized successfully and ready to run in 316Some bridges allow you to enable MSIs by changing some bits in their
385MSI/MSI-X mode. 317PCI configuration space (especially the Hypertransport chipsets such
386 318as the nVidia nForce and Serverworks HT2000). As with host chipsets,
387At the user level, users can use the command 'cat /proc/interrupts' 319Linux mostly knows about them and automatically enables MSIs if it can.
388to display the vectors allocated for devices and their interrupt 320If you have a bridge which Linux doesn't yet know about, you can enable
389MSI/MSI-X modes ("PCI-MSI"/"PCI-MSI-X"). Below shows MSI mode is 321MSIs in configuration space using whatever method you know works, then
390enabled on a SCSI Adaptec 39320D Ultra320 controller. 322enable MSIs on that bridge by doing:
391 323
392 CPU0 CPU1 324 echo 1 > /sys/bus/pci/devices/$bridge/msi_bus
393 0: 324639 0 IO-APIC-edge timer 325
394 1: 1186 0 IO-APIC-edge i8042 326where $bridge is the PCI address of the bridge you've enabled (eg
395 2: 0 0 XT-PIC cascade 3270000:00:0e.0).
396 12: 2797 0 IO-APIC-edge i8042 328
397 14: 6543 0 IO-APIC-edge ide0 329To disable MSIs, echo 0 instead of 1. Changing this value should be
398 15: 1 0 IO-APIC-edge ide1 330done with caution as it can break interrupt handling for all devices
399169: 0 0 IO-APIC-level uhci-hcd 331below this bridge.
400185: 0 0 IO-APIC-level uhci-hcd 332
401193: 138 10 PCI-MSI aic79xx 333Again, please notify linux-pci@vger.kernel.org of any bridges that need
402201: 30 0 PCI-MSI aic79xx 334special handling.
403225: 30 0 IO-APIC-level aic7xxx 335
404233: 30 0 IO-APIC-level aic7xxx 3365.3. Disabling MSIs on a single device
405NMI: 0 0 337
406LOC: 324553 325068 338Some devices are known to have faulty MSI implementations. Usually this
407ERR: 0 339is handled in the individual device driver but occasionally it's necessary
408MIS: 0 340to handle this with a quirk. Some drivers have an option to disable use
409 341of MSI. While this is a convenient workaround for the driver author,
4106. MSI quirks 342it is not good practice, and should not be emulated.
411 343
412Several PCI chipsets or devices are known to not support MSI. 3445.4. Finding why MSIs are disabled on a device
413The PCI stack provides 3 possible levels of MSI disabling: 345
414* on a single device 346From the above three sections, you can see that there are many reasons
415* on all devices behind a specific bridge 347why MSIs may not be enabled for a given device. Your first step should
416* globally 348be to examine your dmesg carefully to determine whether MSIs are enabled
417 349for your machine. You should also check your .config to be sure you
4186.1. Disabling MSI on a single device 350have enabled CONFIG_PCI_MSI.
419 351
420Under some circumstances it might be required to disable MSI on a 352Then, 'lspci -t' gives the list of bridges above a device. Reading
421single device. This may be achieved by either not calling pci_enable_msi() 353/sys/bus/pci/devices/*/msi_bus will tell you whether MSIs are enabled (1)
422or all, or setting the pci_dev->no_msi flag before (most of the time 354or disabled (0). If 0 is found in any of the msi_bus files belonging
423in a quirk). 355to bridges between the PCI root and the device, MSIs are disabled.
424 356
4256.2. Disabling MSI below a bridge 357It is also worth checking the device driver to see whether it supports MSIs.
426 358For example, it may contain calls to pci_enable_msi(), pci_enable_msix() or
427The vast majority of MSI quirks are required by PCI bridges not 359pci_enable_msi_block().
428being able to route MSI between busses. In this case, MSI have to be
429disabled on all devices behind this bridge. It is achieves by setting
430the PCI_BUS_FLAGS_NO_MSI flag in the pci_bus->bus_flags of the bridge
431subordinate bus. There is no need to set the same flag on bridges that
432are below the broken bridge. When pci_enable_msi() is called to enable
433MSI on a device, pci_msi_supported() takes care of checking the NO_MSI
434flag in all parent busses of the device.
435
436Some bridges actually support dynamic MSI support enabling/disabling
437by changing some bits in their PCI configuration space (especially
438the Hypertransport chipsets such as the nVidia nForce and Serverworks
439HT2000). It may then be required to update the NO_MSI flag on the
440corresponding devices in the sysfs hierarchy. To enable MSI support
441on device "0000:00:0e", do:
442
443 echo 1 > /sys/bus/pci/devices/0000:00:0e/msi_bus
444
445To disable MSI support, echo 0 instead of 1. Note that it should be
446used with caution since changing this value might break interrupts.
447
4486.3. Disabling MSI globally
449
450Some extreme cases may require to disable MSI globally on the system.
451For now, the only known case is a Serverworks PCI-X chipsets (MSI are
452not supported on several busses that are not all connected to the
453chipset in the Linux PCI hierarchy). In the vast majority of other
454cases, disabling only behind a specific bridge is enough.
455
456For debugging purpose, the user may also pass pci=nomsi on the kernel
457command-line to explicitly disable MSI globally. But, once the appro-
458priate quirks are added to the kernel, this option should not be
459required anymore.
460
4616.4. Finding why MSI cannot be enabled on a device
462
463Assuming that MSI are not enabled on a device, you should look at
464dmesg to find messages that quirks may output when disabling MSI
465on some devices, some bridges or even globally.
466Then, lspci -t gives the list of bridges above a device. Reading
467/sys/bus/pci/devices/0000:00:0e/msi_bus will tell you whether MSI
468are enabled (1) or disabled (0). In 0 is found in a single bridge
469msi_bus file above the device, MSI cannot be enabled.
470
4717. FAQ
472
473Q1. Are there any limitations on using the MSI?
474
475A1. If the PCI device supports MSI and conforms to the
476specification and the platform supports the APIC local bus,
477then using MSI should work.
478
479Q2. Will it work on all the Pentium processors (P3, P4, Xeon,
480AMD processors)? In P3 IPI's are transmitted on the APIC local
481bus and in P4 and Xeon they are transmitted on the system
482bus. Are there any implications with this?
483
484A2. MSI support enables a PCI device sending an inbound
485memory write (0xfeexxxxx as target address) on its PCI bus
486directly to the FSB. Since the message address has a
487redirection hint bit cleared, it should work.
488
489Q3. The target address 0xfeexxxxx will be translated by the
490Host Bridge into an interrupt message. Are there any
491limitations on the chipsets such as Intel 8xx, Intel e7xxx,
492or VIA?
493
494A3. If these chipsets support an inbound memory write with
495target address set as 0xfeexxxxx, as conformed to PCI
496specification 2.3 or latest, then it should work.
497
498Q4. From the driver point of view, if the MSI is lost because
499of errors occurring during inbound memory write, then it may
500wait forever. Is there a mechanism for it to recover?
501
502A4. Since the target of the transaction is an inbound memory
503write, all transaction termination conditions (Retry,
504Master-Abort, Target-Abort, or normal completion) are
505supported. A device sending an MSI must abide by all the PCI
506rules and conditions regarding that inbound memory write. So,
507if a retry is signaled it must retry, etc... We believe that
508the recommendation for Abort is also a retry (refer to PCI
509specification 2.3 or latest).
diff --git a/Documentation/PCI/pci-iov-howto.txt b/Documentation/PCI/pci-iov-howto.txt
new file mode 100644
index 000000000000..fc73ef5d65b8
--- /dev/null
+++ b/Documentation/PCI/pci-iov-howto.txt
@@ -0,0 +1,99 @@
1 PCI Express I/O Virtualization Howto
2 Copyright (C) 2009 Intel Corporation
3 Yu Zhao <yu.zhao@intel.com>
4
5
61. Overview
7
81.1 What is SR-IOV
9
10Single Root I/O Virtualization (SR-IOV) is a PCI Express Extended
11capability which makes one physical device appear as multiple virtual
12devices. The physical device is referred to as Physical Function (PF)
13while the virtual devices are referred to as Virtual Functions (VF).
14Allocation of the VF can be dynamically controlled by the PF via
15registers encapsulated in the capability. By default, this feature is
16not enabled and the PF behaves as a traditional PCIe device. Once it's
17turned on, each VF's PCI configuration space can be accessed by its own
18Bus, Device and Function Number (Routing ID). And each VF also has PCI
19Memory Space, which is used to map its register set. The VF device driver
20operates on the register set so it can be functional and appear as a
21real existing PCI device.
22
232. User Guide
24
252.1 How can I enable SR-IOV capability
26
27The device driver (PF driver) will control the enabling and disabling
28of the capability via API provided by SR-IOV core. If the hardware
29has SR-IOV capability, loading its PF driver would enable it and all
30VFs associated with the PF.
31
322.2 How can I use the Virtual Functions
33
34The VFs are treated as hot-plugged PCI devices in the kernel, so they
35should be able to work in the same way as real PCI devices. A VF
36requires a device driver that is the same as a normal PCI device's.
37
383. Developer Guide
39
403.1 SR-IOV API
41
42To enable SR-IOV capability:
43 int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
44 'nr_virtfn' is the number of VFs to be enabled.
45
46To disable SR-IOV capability:
47 void pci_disable_sriov(struct pci_dev *dev);
48
49To notify SR-IOV core of Virtual Function Migration:
50 irqreturn_t pci_sriov_migration(struct pci_dev *dev);
51
523.2 Usage example
53
54The following piece of code illustrates the usage of the SR-IOV API.
55
56static int __devinit dev_probe(struct pci_dev *dev, const struct pci_device_id *id)
57{
58 pci_enable_sriov(dev, NR_VIRTFN);
59
60 ...
61
62 return 0;
63}
64
65static void __devexit dev_remove(struct pci_dev *dev)
66{
67 pci_disable_sriov(dev);
68
69 ...
70}
71
72static int dev_suspend(struct pci_dev *dev, pm_message_t state)
73{
74 ...
75
76 return 0;
77}
78
79static int dev_resume(struct pci_dev *dev)
80{
81 ...
82
83 return 0;
84}
85
86static void dev_shutdown(struct pci_dev *dev)
87{
88 ...
89}
90
91static struct pci_driver dev_driver = {
92 .name = "SR-IOV Physical Function driver",
93 .id_table = dev_id_table,
94 .probe = dev_probe,
95 .remove = __devexit_p(dev_remove),
96 .suspend = dev_suspend,
97 .resume = dev_resume,
98 .shutdown = dev_shutdown,
99};
diff --git a/Documentation/RCU/listRCU.txt b/Documentation/RCU/listRCU.txt
index 1fd175368a87..4349c1487e91 100644
--- a/Documentation/RCU/listRCU.txt
+++ b/Documentation/RCU/listRCU.txt
@@ -118,7 +118,7 @@ Following are the RCU equivalents for these two functions:
118 list_for_each_entry(e, list, list) { 118 list_for_each_entry(e, list, list) {
119 if (!audit_compare_rule(rule, &e->rule)) { 119 if (!audit_compare_rule(rule, &e->rule)) {
120 list_del_rcu(&e->list); 120 list_del_rcu(&e->list);
121 call_rcu(&e->rcu, audit_free_rule, e); 121 call_rcu(&e->rcu, audit_free_rule);
122 return 0; 122 return 0;
123 } 123 }
124 } 124 }
@@ -206,7 +206,7 @@ RCU ("read-copy update") its name. The RCU code is as follows:
206 ne->rule.action = newaction; 206 ne->rule.action = newaction;
207 ne->rule.file_count = newfield_count; 207 ne->rule.file_count = newfield_count;
208 list_replace_rcu(e, ne); 208 list_replace_rcu(e, ne);
209 call_rcu(&e->rcu, audit_free_rule, e); 209 call_rcu(&e->rcu, audit_free_rule);
210 return 0; 210 return 0;
211 } 211 }
212 } 212 }
@@ -283,7 +283,7 @@ flag under the spinlock as follows:
283 list_del_rcu(&e->list); 283 list_del_rcu(&e->list);
284 e->deleted = 1; 284 e->deleted = 1;
285 spin_unlock(&e->lock); 285 spin_unlock(&e->lock);
286 call_rcu(&e->rcu, audit_free_rule, e); 286 call_rcu(&e->rcu, audit_free_rule);
287 return 0; 287 return 0;
288 } 288 }
289 } 289 }
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt
index 95821a29ae41..7aa2002ade77 100644
--- a/Documentation/RCU/rcu.txt
+++ b/Documentation/RCU/rcu.txt
@@ -81,7 +81,7 @@ o I hear that RCU needs work in order to support realtime kernels?
81 This work is largely completed. Realtime-friendly RCU can be 81 This work is largely completed. Realtime-friendly RCU can be
82 enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter. 82 enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter.
83 However, work is in progress for enabling priority boosting of 83 However, work is in progress for enabling priority boosting of
84 preempted RCU read-side critical sections.This is needed if you 84 preempted RCU read-side critical sections. This is needed if you
85 have CPU-bound realtime threads. 85 have CPU-bound realtime threads.
86 86
87o Where can I find more information on RCU? 87o Where can I find more information on RCU?
diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt
index 239f542d48ba..6389dec33459 100644
--- a/Documentation/RCU/rculist_nulls.txt
+++ b/Documentation/RCU/rculist_nulls.txt
@@ -21,7 +21,7 @@ if (obj) {
21 /* 21 /*
22 * Because a writer could delete object, and a writer could 22 * Because a writer could delete object, and a writer could
23 * reuse these object before the RCU grace period, we 23 * reuse these object before the RCU grace period, we
24 * must check key after geting the reference on object 24 * must check key after getting the reference on object
25 */ 25 */
26 if (obj->key != key) { // not the object we expected 26 if (obj->key != key) { // not the object we expected
27 put_ref(obj); 27 put_ref(obj);
@@ -117,7 +117,7 @@ a race (some writer did a delete and/or a move of an object
117to another chain) checking the final 'nulls' value if 117to another chain) checking the final 'nulls' value if
118the lookup met the end of chain. If final 'nulls' value 118the lookup met the end of chain. If final 'nulls' value
119is not the slot number, then we must restart the lookup at 119is not the slot number, then we must restart the lookup at
120the begining. If the object was moved to same chain, 120the beginning. If the object was moved to the same chain,
121then the reader doesnt care : It might eventually 121then the reader doesnt care : It might eventually
122scan the list again without harm. 122scan the list again without harm.
123 123
diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt
index 989c2fcd8111..629c92e99783 100644
--- a/Documentation/Smack.txt
+++ b/Documentation/Smack.txt
@@ -184,14 +184,16 @@ length. Single character labels using special characters, that being anything
184other than a letter or digit, are reserved for use by the Smack development 184other than a letter or digit, are reserved for use by the Smack development
185team. Smack labels are unstructured, case sensitive, and the only operation 185team. Smack labels are unstructured, case sensitive, and the only operation
186ever performed on them is comparison for equality. Smack labels cannot 186ever performed on them is comparison for equality. Smack labels cannot
187contain unprintable characters or the "/" (slash) character. 187contain unprintable characters or the "/" (slash) character. Smack labels
188cannot begin with a '-', which is reserved for special options.
188 189
189There are some predefined labels: 190There are some predefined labels:
190 191
191 _ Pronounced "floor", a single underscore character. 192 _ Pronounced "floor", a single underscore character.
192 ^ Pronounced "hat", a single circumflex character. 193 ^ Pronounced "hat", a single circumflex character.
193 * Pronounced "star", a single asterisk character. 194 * Pronounced "star", a single asterisk character.
194 ? Pronounced "huh", a single question mark character. 195 ? Pronounced "huh", a single question mark character.
196 @ Pronounced "Internet", a single at sign character.
195 197
196Every task on a Smack system is assigned a label. System tasks, such as 198Every task on a Smack system is assigned a label. System tasks, such as
197init(8) and systems daemons, are run with the floor ("_") label. User tasks 199init(8) and systems daemons, are run with the floor ("_") label. User tasks
@@ -412,6 +414,36 @@ sockets.
412 A privileged program may set this to match the label of another 414 A privileged program may set this to match the label of another
413 task with which it hopes to communicate. 415 task with which it hopes to communicate.
414 416
417Smack Netlabel Exceptions
418
419You will often find that your labeled application has to talk to the outside,
420unlabeled world. To do this there's a special file /smack/netlabel where you can
421add some exceptions in the form of :
422@IP1 LABEL1 or
423@IP2/MASK LABEL2
424
425It means that your application will have unlabeled access to @IP1 if it has
426write access on LABEL1, and access to the subnet @IP2/MASK if it has write
427access on LABEL2.
428
429Entries in the /smack/netlabel file are matched by longest mask first, like in
430classless IPv4 routing.
431
432A special label '@' and an option '-CIPSO' can be used there :
433@ means Internet, any application with any label has access to it
434-CIPSO means standard CIPSO networking
435
436If you don't know what CIPSO is and don't plan to use it, you can just do :
437echo 127.0.0.1 -CIPSO > /smack/netlabel
438echo 0.0.0.0/0 @ > /smack/netlabel
439
440If you use CIPSO on your 192.168.0.0/16 local network and also need unlabeled
441Internet access, you can have :
442echo 127.0.0.1 -CIPSO > /smack/netlabel
443echo 192.168.0.0/16 -CIPSO > /smack/netlabel
444echo 0.0.0.0/0 @ > /smack/netlabel
445
446
415Writing Applications for Smack 447Writing Applications for Smack
416 448
417There are three sorts of applications that will run on a Smack system. How an 449There are three sorts of applications that will run on a Smack system. How an
diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX
new file mode 100644
index 000000000000..3f58fa3d6d00
--- /dev/null
+++ b/Documentation/cgroups/00-INDEX
@@ -0,0 +1,18 @@
100-INDEX
2 - this file
3cgroups.txt
4 - Control Groups definition, implementation details, examples and API.
5cpuacct.txt
6 - CPU Accounting Controller; account CPU usage for groups of tasks.
7cpusets.txt
8 - documents the cpusets feature; assign CPUs and Mem to a set of tasks.
9devices.txt
10 - Device Whitelist Controller; description, interface and security.
11freezer-subsystem.txt
12 - checkpointing; rationale to not use signals, interface.
13memcg_test.txt
14 - Memory Resource Controller; implementation details.
15memory.txt
16 - Memory Resource Controller; design, accounting, interface, testing.
17resource_counter.txt
18 - Resource Counter API.
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 93feb8444489..6eb1a97e88ce 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -56,7 +56,7 @@ hierarchy, and a set of subsystems; each subsystem has system-specific
56state attached to each cgroup in the hierarchy. Each hierarchy has 56state attached to each cgroup in the hierarchy. Each hierarchy has
57an instance of the cgroup virtual filesystem associated with it. 57an instance of the cgroup virtual filesystem associated with it.
58 58
59At any one time there may be multiple active hierachies of task 59At any one time there may be multiple active hierarchies of task
60cgroups. Each hierarchy is a partition of all tasks in the system. 60cgroups. Each hierarchy is a partition of all tasks in the system.
61 61
62User level code may create and destroy cgroups by name in an 62User level code may create and destroy cgroups by name in an
@@ -124,10 +124,10 @@ following lines:
124 / \ 124 / \
125 Prof (15%) students (5%) 125 Prof (15%) students (5%)
126 126
127Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go 127Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd go
128into NFS network class. 128into NFS network class.
129 129
130At the same time firefox/lynx will share an appropriate CPU/Memory class 130At the same time Firefox/Lynx will share an appropriate CPU/Memory class
131depending on who launched it (prof/student). 131depending on who launched it (prof/student).
132 132
133With the ability to classify tasks differently for different resources 133With the ability to classify tasks differently for different resources
@@ -325,7 +325,7 @@ and then start a subshell 'sh' in that cgroup:
325Creating, modifying, using the cgroups can be done through the cgroup 325Creating, modifying, using the cgroups can be done through the cgroup
326virtual filesystem. 326virtual filesystem.
327 327
328To mount a cgroup hierarchy will all available subsystems, type: 328To mount a cgroup hierarchy with all available subsystems, type:
329# mount -t cgroup xxx /dev/cgroup 329# mount -t cgroup xxx /dev/cgroup
330 330
331The "xxx" is not interpreted by the cgroup code, but will appear in 331The "xxx" is not interpreted by the cgroup code, but will appear in
@@ -333,12 +333,23 @@ The "xxx" is not interpreted by the cgroup code, but will appear in
333 333
334To mount a cgroup hierarchy with just the cpuset and numtasks 334To mount a cgroup hierarchy with just the cpuset and memory
335subsystems, type: 335subsystems, type:
336# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup 336# mount -t cgroup -o cpuset,memory hier1 /dev/cgroup
337 337
338To change the set of subsystems bound to a mounted hierarchy, just 338To change the set of subsystems bound to a mounted hierarchy, just
339remount with different options: 339remount with different options:
340# mount -o remount,cpuset,ns hier1 /dev/cgroup
340 341
341# mount -o remount,cpuset,ns /dev/cgroup 342Now memory is removed from the hierarchy and ns is added.
343
344Note this will add ns to the hierarchy but won't remove memory or
345cpuset, because the new options are appended to the old ones:
346# mount -o remount,ns /dev/cgroup
347
348To specify a hierarchy's release_agent:
349# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
350 xxx /dev/cgroup
351
352Note that specifying 'release_agent' more than once will return failure.
342 353
343Note that changing the set of subsystems is currently only supported 354Note that changing the set of subsystems is currently only supported
344when the hierarchy consists of a single (root) cgroup. Supporting 355when the hierarchy consists of a single (root) cgroup. Supporting
@@ -349,6 +360,11 @@ Then under /dev/cgroup you can find a tree that corresponds to the
349tree of the cgroups in the system. For instance, /dev/cgroup 360tree of the cgroups in the system. For instance, /dev/cgroup
350is the cgroup that holds the whole system. 361is the cgroup that holds the whole system.
351 362
363If you want to change the value of release_agent:
364# echo "/sbin/new_release_agent" > /dev/cgroup/release_agent
365
366It can also be changed via remount.
367
352If you want to create a new cgroup under /dev/cgroup: 368If you want to create a new cgroup under /dev/cgroup:
353# cd /dev/cgroup 369# cd /dev/cgroup
354# mkdir my_cgroup 370# mkdir my_cgroup
@@ -476,11 +492,13 @@ cgroup->parent is still valid. (Note - can also be called for a
476newly-created cgroup if an error occurs after this subsystem's 492newly-created cgroup if an error occurs after this subsystem's
477create() method has been called for the new cgroup). 493create() method has been called for the new cgroup).
478 494
479void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); 495int pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
480 496
481Called before checking the reference count on each subsystem. This may 497Called before checking the reference count on each subsystem. This may
482be useful for subsystems which have some extra references even if 498be useful for subsystems which have some extra references even if
483there are not tasks in the cgroup. 499there are not tasks in the cgroup. If pre_destroy() returns an error code,
500rmdir() will fail with it. Because of this behavior, pre_destroy() can be
501called multiple times against a cgroup.
484 502
485int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 503int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
486 struct task_struct *task) 504 struct task_struct *task)
@@ -521,7 +539,7 @@ always handled well.
521void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp) 539void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp)
522(cgroup_mutex held by caller) 540(cgroup_mutex held by caller)
523 541
524Called at the end of cgroup_clone() to do any paramater 542Called at the end of cgroup_clone() to do any parameter
525initialization which might be required before a task could attach. For 543initialization which might be required before a task could attach. For
526example in cpusets, no task may attach before 'cpus' and 'mems' are set 544example in cpusets, no task may attach before 'cpus' and 'mems' are set
527up. 545up.
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt
index 0611e9528c7c..f9ca389dddf4 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -131,7 +131,7 @@ Cpusets extends these two mechanisms as follows:
131 - The hierarchy of cpusets can be mounted at /dev/cpuset, for 131 - The hierarchy of cpusets can be mounted at /dev/cpuset, for
132 browsing and manipulation from user space. 132 browsing and manipulation from user space.
133 - A cpuset may be marked exclusive, which ensures that no other 133 - A cpuset may be marked exclusive, which ensures that no other
134 cpuset (except direct ancestors and descendents) may contain 134 cpuset (except direct ancestors and descendants) may contain
135 any overlapping CPUs or Memory Nodes. 135 any overlapping CPUs or Memory Nodes.
136 - You can list all the tasks (by pid) attached to any cpuset. 136 - You can list all the tasks (by pid) attached to any cpuset.
137 137
@@ -226,7 +226,7 @@ nodes with memory--using the cpuset_track_online_nodes() hook.
226-------------------------------- 226--------------------------------
227 227
228If a cpuset is cpu or mem exclusive, no other cpuset, other than 228If a cpuset is cpu or mem exclusive, no other cpuset, other than
229a direct ancestor or descendent, may share any of the same CPUs or 229a direct ancestor or descendant, may share any of the same CPUs or
230Memory Nodes. 230Memory Nodes.
231 231
232A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", 232A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled",
@@ -427,7 +427,7 @@ child cpusets have this flag enabled.
427When doing this, you don't usually want to leave any unpinned tasks in 427When doing this, you don't usually want to leave any unpinned tasks in
428the top cpuset that might use non-trivial amounts of CPU, as such tasks 428the top cpuset that might use non-trivial amounts of CPU, as such tasks
429may be artificially constrained to some subset of CPUs, depending on 429may be artificially constrained to some subset of CPUs, depending on
430the particulars of this flag setting in descendent cpusets. Even if 430the particulars of this flag setting in descendant cpusets. Even if
431such a task could use spare CPU cycles in some other CPUs, the kernel 431such a task could use spare CPU cycles in some other CPUs, the kernel
432scheduler might not consider the possibility of load balancing that 432scheduler might not consider the possibility of load balancing that
433task to that underused CPU. 433task to that underused CPU.
@@ -531,9 +531,9 @@ be idle.
531 531
532Of course it takes some searching cost to find movable tasks and/or 532Of course it takes some searching cost to find movable tasks and/or
533idle CPUs, the scheduler might not search all CPUs in the domain 533idle CPUs, the scheduler might not search all CPUs in the domain
534everytime. In fact, in some architectures, the searching ranges on 534every time. In fact, in some architectures, the searching ranges on
535events are limited in the same socket or node where the CPU locates, 535events are limited in the same socket or node where the CPU locates,
536while the load balance on tick searchs all. 536while the load balance on tick searches all.
537 537
538For example, assume CPU Z is relatively far from CPU X. Even if CPU Z 538For example, assume CPU Z is relatively far from CPU X. Even if CPU Z
539is idle while CPU X and the siblings are busy, scheduler can't migrate 539is idle while CPU X and the siblings are busy, scheduler can't migrate
@@ -601,7 +601,7 @@ its new cpuset, then the task will continue to use whatever subset
601of MPOL_BIND nodes are still allowed in the new cpuset. If the task 601of MPOL_BIND nodes are still allowed in the new cpuset. If the task
602was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed 602was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
603in the new cpuset, then the task will be essentially treated as if it 603in the new cpuset, then the task will be essentially treated as if it
604was MPOL_BIND bound to the new cpuset (even though its numa placement, 604was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
605as queried by get_mempolicy(), doesn't change). If a task is moved 605as queried by get_mempolicy(), doesn't change). If a task is moved
606from one cpuset to another, then the kernel will adjust the tasks 606from one cpuset to another, then the kernel will adjust the tasks
607memory placement, as above, the next time that the kernel attempts 607memory placement, as above, the next time that the kernel attempts
diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroups/devices.txt
index 7cc6e6a60672..57ca4c89fe5c 100644
--- a/Documentation/cgroups/devices.txt
+++ b/Documentation/cgroups/devices.txt
@@ -42,7 +42,7 @@ suffice, but we can decide the best way to adequately restrict
42movement as people get some experience with this. We may just want 42movement as people get some experience with this. We may just want
43to require CAP_SYS_ADMIN, which at least is a separate bit from 43to require CAP_SYS_ADMIN, which at least is a separate bit from
44CAP_MKNOD. We may want to just refuse moving to a cgroup which 44CAP_MKNOD. We may want to just refuse moving to a cgroup which
45isn't a descendent of the current one. Or we may want to use 45isn't a descendant of the current one. Or we may want to use
46CAP_MAC_ADMIN, since we really are trying to lock down root. 46CAP_MAC_ADMIN, since we really are trying to lock down root.
47 47
48CAP_SYS_ADMIN is needed to modify the whitelist or move another 48CAP_SYS_ADMIN is needed to modify the whitelist or move another
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt
index 523a9c16c400..72db89ed0609 100644
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -1,5 +1,5 @@
1Memory Resource Controller(Memcg) Implementation Memo. 1Memory Resource Controller(Memcg) Implementation Memo.
2Last Updated: 2009/1/19 2Last Updated: 2009/1/20
3Base Kernel Version: based on 2.6.29-rc2. 3Base Kernel Version: based on 2.6.29-rc2.
4 4
5Because VM is getting complex (one of the reasons is memcg...), memcg's behavior 5Because VM is getting complex (one of the reasons is memcg...), memcg's behavior
@@ -356,7 +356,25 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
356 (Shell-B) 356 (Shell-B)
357 # move all tasks in /cgroup/test to /cgroup 357 # move all tasks in /cgroup/test to /cgroup
358 # /sbin/swapoff -a 358 # /sbin/swapoff -a
359 # rmdir /test/cgroup 359 # rmdir /cgroup/test
360 # kill malloc task. 360 # kill malloc task.
361 361
362 Of course, tmpfs vs. swapoff test should be tested, too. 362 Of course, tmpfs vs. swapoff test should be tested, too.
363
364 9.8 OOM-Killer
365 Out-of-memory caused by memcg's limit will kill tasks under
366 the memcg. When hierarchy is used, a task under hierarchy
367 will be killed by the kernel.
368 In this case, panic_on_oom shouldn't be invoked and tasks
369 in other groups shouldn't be killed.
370
371 It's not difficult to cause OOM under memcg as follows.
372 Case A) when you can swapoff
373 #swapoff -a
374 #echo 50M > /memory.limit_in_bytes
375 run 51M of malloc
376
377 Case B) when you use mem+swap limitation.
378 #echo 50M > memory.limit_in_bytes
379 #echo 50M > memory.memsw.limit_in_bytes
380 run 51M of malloc
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index e1501964df1e..a98a7fe7aabb 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -302,7 +302,7 @@ will be charged as a new owner of it.
302 unevictable - # of pages cannot be reclaimed.(mlocked etc) 302 unevictable - # of pages cannot be reclaimed.(mlocked etc)
303 303
304 The values below depend on CONFIG_DEBUG_VM. 304 The values below depend on CONFIG_DEBUG_VM.
305 inactive_ratio - VM inernal parameter. (see mm/page_alloc.c) 305 inactive_ratio - VM internal parameter. (see mm/page_alloc.c)
306 recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) 306 recent_rotated_anon - VM internal parameter. (see mm/vmscan.c)
307 recent_rotated_file - VM internal parameter. (see mm/vmscan.c) 307 recent_rotated_file - VM internal parameter. (see mm/vmscan.c)
308 recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) 308 recent_scanned_anon - VM internal parameter. (see mm/vmscan.c)
diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index 62254d4510c6..4d70df63d1d3 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -1,9 +1,9 @@
1 1
2 LINUX ALLOCATED DEVICES (2.6+ version) 2 LINUX ALLOCATED DEVICES (2.6+ version)
3 3
4 Maintained by Torben Mathiasen <device@lanana.org> 4 Maintained by Alan Cox <device@lanana.org>
5 5
6 Last revised: 29 November 2006 6 Last revised: 6th April 2009
7 7
8This list is the Linux Device List, the official registry of allocated 8This list is the Linux Device List, the official registry of allocated
9device numbers and /dev directory nodes for the Linux operating 9device numbers and /dev directory nodes for the Linux operating
@@ -67,6 +67,11 @@ up to date. Due to the number of registrations I have to maintain it
67in "batch mode", so there are likely additional registrations that 67in "batch mode", so there are likely additional registrations that
68haven't been listed yet. 68haven't been listed yet.
69 69
70Fourth, remember that Linux now has extensive support for dynamic allocation
71of device numbering and can use sysfs and udev to handle the naming needs.
72There are still some exceptions in the serial and boot device area. Before
73asking for a device number make sure you actually need one.
74
70Finally, sometimes I have to play "namespace police." Please don't be 75Finally, sometimes I have to play "namespace police." Please don't be
71offended. I often get submissions for /dev names that would be bound 76offended. I often get submissions for /dev names that would be bound
72to cause conflicts down the road. I am trying to avoid getting in a 77to cause conflicts down the road. I am trying to avoid getting in a
@@ -101,7 +106,7 @@ Your cooperation is appreciated.
101 0 = /dev/ram0 First RAM disk 106 0 = /dev/ram0 First RAM disk
102 1 = /dev/ram1 Second RAM disk 107 1 = /dev/ram1 Second RAM disk
103 ... 108 ...
104 250 = /dev/initrd Initial RAM disk {2.6} 109 250 = /dev/initrd Initial RAM disk
105 110
106 Older kernels had /dev/ramdisk (1, 1) here. 111 Older kernels had /dev/ramdisk (1, 1) here.
107 /dev/initrd refers to a RAM disk which was preloaded 112 /dev/initrd refers to a RAM disk which was preloaded
@@ -340,7 +345,7 @@ Your cooperation is appreciated.
340 14 = /dev/touchscreen/ucb1x00 UCB 1x00 touchscreen 345 14 = /dev/touchscreen/ucb1x00 UCB 1x00 touchscreen
341 15 = /dev/touchscreen/mk712 MK712 touchscreen 346 15 = /dev/touchscreen/mk712 MK712 touchscreen
342 128 = /dev/beep Fancy beep device 347 128 = /dev/beep Fancy beep device
343 129 = /dev/modreq Kernel module load request {2.6} 348 129 =
344 130 = /dev/watchdog Watchdog timer port 349 130 = /dev/watchdog Watchdog timer port
345 131 = /dev/temperature Machine internal temperature 350 131 = /dev/temperature Machine internal temperature
346 132 = /dev/hwtrap Hardware fault trap 351 132 = /dev/hwtrap Hardware fault trap
@@ -350,10 +355,10 @@ Your cooperation is appreciated.
350 139 = /dev/openprom SPARC OpenBoot PROM 355 139 = /dev/openprom SPARC OpenBoot PROM
351 140 = /dev/relay8 Berkshire Products Octal relay card 356 140 = /dev/relay8 Berkshire Products Octal relay card
352 141 = /dev/relay16 Berkshire Products ISO-16 relay card 357 141 = /dev/relay16 Berkshire Products ISO-16 relay card
353 142 = /dev/msr x86 model-specific registers {2.6} 358 142 =
354 143 = /dev/pciconf PCI configuration space 359 143 = /dev/pciconf PCI configuration space
355 144 = /dev/nvram Non-volatile configuration RAM 360 144 = /dev/nvram Non-volatile configuration RAM
356 145 = /dev/hfmodem Soundcard shortwave modem control {2.6} 361 145 = /dev/hfmodem Soundcard shortwave modem control
357 146 = /dev/graphics Linux/SGI graphics device 362 146 = /dev/graphics Linux/SGI graphics device
358 147 = /dev/opengl Linux/SGI OpenGL pipe 363 147 = /dev/opengl Linux/SGI OpenGL pipe
359 148 = /dev/gfx Linux/SGI graphics effects device 364 148 = /dev/gfx Linux/SGI graphics effects device
@@ -435,6 +440,9 @@ Your cooperation is appreciated.
435 228 = /dev/hpet HPET driver 440 228 = /dev/hpet HPET driver
436 229 = /dev/fuse Fuse (virtual filesystem in user-space) 441 229 = /dev/fuse Fuse (virtual filesystem in user-space)
437 230 = /dev/midishare MidiShare driver 442 230 = /dev/midishare MidiShare driver
443 231 = /dev/snapshot System memory snapshot device
444 232 = /dev/kvm Kernel-based virtual machine (hardware virtualization extensions)
445 233 = /dev/kmview View-OS A process with a view
438 240-254 Reserved for local use 446 240-254 Reserved for local use
439 255 Reserved for MISC_DYNAMIC_MINOR 447 255 Reserved for MISC_DYNAMIC_MINOR
440 448
@@ -466,10 +474,7 @@ Your cooperation is appreciated.
466 The device names specified are proposed -- if there 474 The device names specified are proposed -- if there
467 are "standard" names for these devices, please let me know. 475 are "standard" names for these devices, please let me know.
468 476
469 12 block MSCDEX CD-ROM callback support {2.6} 477 12 block
470 0 = /dev/dos_cd0 First MSCDEX CD-ROM
471 1 = /dev/dos_cd1 Second MSCDEX CD-ROM
472 ...
473 478
474 13 char Input core 479 13 char Input core
475 0 = /dev/input/js0 First joystick 480 0 = /dev/input/js0 First joystick
@@ -498,7 +503,7 @@ Your cooperation is appreciated.
498 2 = /dev/midi00 First MIDI port 503 2 = /dev/midi00 First MIDI port
499 3 = /dev/dsp Digital audio 504 3 = /dev/dsp Digital audio
500 4 = /dev/audio Sun-compatible digital audio 505 4 = /dev/audio Sun-compatible digital audio
501 6 = /dev/sndstat Sound card status information {2.6} 506 6 =
502 7 = /dev/audioctl SPARC audio control device 507 7 = /dev/audioctl SPARC audio control device
503 8 = /dev/sequencer2 Sequencer -- alternate device 508 8 = /dev/sequencer2 Sequencer -- alternate device
504 16 = /dev/mixer1 Second soundcard mixer control 509 16 = /dev/mixer1 Second soundcard mixer control
@@ -510,14 +515,7 @@ Your cooperation is appreciated.
510 34 = /dev/midi02 Third MIDI port 515 34 = /dev/midi02 Third MIDI port
511 50 = /dev/midi03 Fourth MIDI port 516 50 = /dev/midi03 Fourth MIDI port
512 517
513 14 block BIOS harddrive callback support {2.6} 518 14 block
514 0 = /dev/dos_hda First BIOS harddrive whole disk
515 64 = /dev/dos_hdb Second BIOS harddrive whole disk
516 128 = /dev/dos_hdc Third BIOS harddrive whole disk
517 192 = /dev/dos_hdd Fourth BIOS harddrive whole disk
518
519 Partitions are handled in the same way as IDE disks
520 (see major number 3).
521 519
522 15 char Joystick 520 15 char Joystick
523 0 = /dev/js0 First analog joystick 521 0 = /dev/js0 First analog joystick
@@ -535,14 +533,14 @@ Your cooperation is appreciated.
535 16 block GoldStar CD-ROM 533 16 block GoldStar CD-ROM
536 0 = /dev/gscd GoldStar CD-ROM 534 0 = /dev/gscd GoldStar CD-ROM
537 535
538 17 char Chase serial card 536 17 char OBSOLETE (was Chase serial card)
539 0 = /dev/ttyH0 First Chase port 537 0 = /dev/ttyH0 First Chase port
540 1 = /dev/ttyH1 Second Chase port 538 1 = /dev/ttyH1 Second Chase port
541 ... 539 ...
542 17 block Optics Storage CD-ROM 540 17 block Optics Storage CD-ROM
543 0 = /dev/optcd Optics Storage CD-ROM 541 0 = /dev/optcd Optics Storage CD-ROM
544 542
545 18 char Chase serial card - alternate devices 543 18 char OBSOLETE (was Chase serial card - alternate devices)
546 0 = /dev/cuh0 Callout device for ttyH0 544 0 = /dev/cuh0 Callout device for ttyH0
547 1 = /dev/cuh1 Callout device for ttyH1 545 1 = /dev/cuh1 Callout device for ttyH1
548 ... 546 ...
@@ -644,8 +642,7 @@ Your cooperation is appreciated.
644 2 = /dev/sbpcd2 Panasonic CD-ROM controller 0 unit 2 642 2 = /dev/sbpcd2 Panasonic CD-ROM controller 0 unit 2
645 3 = /dev/sbpcd3 Panasonic CD-ROM controller 0 unit 3 643 3 = /dev/sbpcd3 Panasonic CD-ROM controller 0 unit 3
646 644
647 26 char Quanta WinVision frame grabber {2.6} 645 26 char
648 0 = /dev/wvisfgrab Quanta WinVision frame grabber
649 646
650 26 block Second Matsushita (Panasonic/SoundBlaster) CD-ROM 647 26 block Second Matsushita (Panasonic/SoundBlaster) CD-ROM
651 0 = /dev/sbpcd4 Panasonic CD-ROM controller 1 unit 0 648 0 = /dev/sbpcd4 Panasonic CD-ROM controller 1 unit 0
@@ -872,7 +869,7 @@ Your cooperation is appreciated.
872 and "user level packet I/O." This board is also 869 and "user level packet I/O." This board is also
873 accessible as a standard networking "eth" device. 870 accessible as a standard networking "eth" device.
874 871
875 38 block Reserved for Linux/AP+ 872 38 block OBSOLETE (was Linux/AP+)
876 873
877 39 char ML-16P experimental I/O board 874 39 char ML-16P experimental I/O board
878 0 = /dev/ml16pa-a0 First card, first analog channel 875 0 = /dev/ml16pa-a0 First card, first analog channel
@@ -892,29 +889,16 @@ Your cooperation is appreciated.
892 50 = /dev/ml16pb-c1 Second card, second counter/timer 889 50 = /dev/ml16pb-c1 Second card, second counter/timer
893 51 = /dev/ml16pb-c2 Second card, third counter/timer 890 51 = /dev/ml16pb-c2 Second card, third counter/timer
894 ... 891 ...
895 39 block Reserved for Linux/AP+ 892 39 block
896 893
897 40 char Matrox Meteor frame grabber {2.6} 894 40 char
898 0 = /dev/mmetfgrab Matrox Meteor frame grabber
899 895
900 40 block Syquest EZ135 parallel port removable drive 896 40 block
901 0 = /dev/eza Parallel EZ135 drive, whole disk
902
903 This device is obsolete and will be removed in a
904 future version of Linux. It has been replaced with
905 the parallel port IDE disk driver at major number 45.
906 Partitions are handled in the same way as IDE disks
907 (see major number 3).
908 897
909 41 char Yet Another Micro Monitor 898 41 char Yet Another Micro Monitor
910 0 = /dev/yamm Yet Another Micro Monitor 899 0 = /dev/yamm Yet Another Micro Monitor
911 900
912 41 block MicroSolutions BackPack parallel port CD-ROM 901 41 block
913 0 = /dev/bpcd BackPack CD-ROM
914
915 This device is obsolete and will be removed in a
916 future version of Linux. It has been replaced with
917 the parallel port ATAPI CD-ROM driver at major number 46.
918 902
919 42 char Demo/sample use 903 42 char Demo/sample use
920 904
@@ -1681,13 +1665,7 @@ Your cooperation is appreciated.
1681 disks (see major number 3) except that the limit on 1665 disks (see major number 3) except that the limit on
1682 partitions is 15. 1666 partitions is 15.
1683 1667
1684 93 char IBM Smart Capture Card frame grabber {2.6} 1668 93 char
1685 0 = /dev/iscc0 First Smart Capture Card
1686 1 = /dev/iscc1 Second Smart Capture Card
1687 ...
1688 128 = /dev/isccctl0 First Smart Capture Card control
1689 129 = /dev/isccctl1 Second Smart Capture Card control
1690 ...
1691 1669
1692 93 block NAND Flash Translation Layer filesystem 1670 93 block NAND Flash Translation Layer filesystem
1693 0 = /dev/nftla First NFTL layer 1671 0 = /dev/nftla First NFTL layer
@@ -1695,10 +1673,7 @@ Your cooperation is appreciated.
1695 ... 1673 ...
1696 240 = /dev/nftlp 16th NFTL layer 1674 240 = /dev/nftlp 16th NFTL layer
1697 1675
1698 94 char miroVIDEO DC10/30 capture/playback device {2.6} 1676 94 char
1699 0 = /dev/dcxx0 First capture card
1700 1 = /dev/dcxx1 Second capture card
1701 ...
1702 1677
1703 94 block IBM S/390 DASD block storage 1678 94 block IBM S/390 DASD block storage
1704 0 = /dev/dasda First DASD device, major 1679 0 = /dev/dasda First DASD device, major
@@ -1791,11 +1766,7 @@ Your cooperation is appreciated.
1791 ... 1766 ...
1792 15 = /dev/amiraid/ar?p15 15th partition 1767 15 = /dev/amiraid/ar?p15 15th partition
1793 1768
1794102 char Philips SAA5249 Teletext signal decoder {2.6} 1769102 char
1795 0 = /dev/tlk0 First Teletext decoder
1796 1 = /dev/tlk1 Second Teletext decoder
1797 2 = /dev/tlk2 Third Teletext decoder
1798 3 = /dev/tlk3 Fourth Teletext decoder
1799 1770
1800102 block Compressed block device 1771102 block Compressed block device
1801 0 = /dev/cbd/a First compressed block device, whole device 1772 0 = /dev/cbd/a First compressed block device, whole device
@@ -1916,10 +1887,7 @@ Your cooperation is appreciated.
1916 DAC960 (see major number 48) except that the limit on 1887 DAC960 (see major number 48) except that the limit on
1917 partitions is 15. 1888 partitions is 15.
1918 1889
1919111 char Philips SAA7146-based audio/video card {2.6} 1890111 char
1920 0 = /dev/av0 First A/V card
1921 1 = /dev/av1 Second A/V card
1922 ...
1923 1891
1924111 block Compaq Next Generation Drive Array, eighth controller 1892111 block Compaq Next Generation Drive Array, eighth controller
1925 0 = /dev/cciss/c7d0 First logical drive, whole disk 1893 0 = /dev/cciss/c7d0 First logical drive, whole disk
@@ -2079,8 +2047,8 @@ Your cooperation is appreciated.
2079 ... 2047 ...
2080 2048
2081119 char VMware virtual network control 2049119 char VMware virtual network control
2082 0 = /dev/vmnet0 1st virtual network 2050 0 = /dev/vnet0 1st virtual network
2083 1 = /dev/vmnet1 2nd virtual network 2051 1 = /dev/vnet1 2nd virtual network
2084 ... 2052 ...
2085 2053
2086120-127 char LOCAL/EXPERIMENTAL USE 2054120-127 char LOCAL/EXPERIMENTAL USE
@@ -2450,7 +2418,7 @@ Your cooperation is appreciated.
2450 2 = /dev/raw/raw2 Second raw I/O device 2418 2 = /dev/raw/raw2 Second raw I/O device
2451 ... 2419 ...
2452 2420
2453163 char UNASSIGNED (was Radio Tech BIM-XXX-RS232 radio modem - see 51) 2421163 char
2454 2422
2455164 char Chase Research AT/PCI-Fast serial card 2423164 char Chase Research AT/PCI-Fast serial card
2456 0 = /dev/ttyCH0 AT/PCI-Fast board 0, port 0 2424 0 = /dev/ttyCH0 AT/PCI-Fast board 0, port 0
@@ -2542,6 +2510,12 @@ Your cooperation is appreciated.
2542 1 = /dev/clanvi1 Second cLAN adapter 2510 1 = /dev/clanvi1 Second cLAN adapter
2543 ... 2511 ...
2544 2512
2513179 block MMC block devices
2514 0 = /dev/mmcblk0 First SD/MMC card
2515 1 = /dev/mmcblk0p1 First partition on first MMC card
2516 8 = /dev/mmcblk1 Second SD/MMC card
2517 ...
2518
2545179 char CCube DVXChip-based PCI products 2519179 char CCube DVXChip-based PCI products
2546 0 = /dev/dvxirq0 First DVX device 2520 0 = /dev/dvxirq0 First DVX device
2547 1 = /dev/dvxirq1 Second DVX device 2521 1 = /dev/dvxirq1 Second DVX device
@@ -2560,6 +2534,9 @@ Your cooperation is appreciated.
2560 96 = /dev/usb/hiddev0 1st USB HID device 2534 96 = /dev/usb/hiddev0 1st USB HID device
2561 ... 2535 ...
2562 111 = /dev/usb/hiddev15 16th USB HID device 2536 111 = /dev/usb/hiddev15 16th USB HID device
2537 112 = /dev/usb/auer0 1st auerswald ISDN device
2538 ...
2539 127 = /dev/usb/auer15 16th auerswald ISDN device
2563 128 = /dev/usb/brlvgr0 First Braille Voyager device 2540 128 = /dev/usb/brlvgr0 First Braille Voyager device
2564 ... 2541 ...
2565 131 = /dev/usb/brlvgr3 Fourth Braille Voyager device 2542 131 = /dev/usb/brlvgr3 Fourth Braille Voyager device
@@ -2810,6 +2787,16 @@ Your cooperation is appreciated.
2810 ... 2787 ...
2811 190 = /dev/ttyUL3 Xilinx uartlite - port 3 2788 190 = /dev/ttyUL3 Xilinx uartlite - port 3
2812 191 = /dev/xvc0 Xen virtual console - port 0 2789 191 = /dev/xvc0 Xen virtual console - port 0
2790 192 = /dev/ttyPZ0 pmac_zilog - port 0
2791 ...
2792 195 = /dev/ttyPZ3 pmac_zilog - port 3
2793 196 = /dev/ttyTX0 TX39/49 serial port 0
2794 ...
2795 204 = /dev/ttyTX7 TX39/49 serial port 7
2796 205 = /dev/ttySC0 SC26xx serial port 0
2797 206 = /dev/ttySC1 SC26xx serial port 1
2798 207 = /dev/ttySC2 SC26xx serial port 2
2799 208 = /dev/ttySC3 SC26xx serial port 3
2813 2800
2814205 char Low-density serial ports (alternate device) 2801205 char Low-density serial ports (alternate device)
2815 0 = /dev/culu0 Callout device for ttyLU0 2802 0 = /dev/culu0 Callout device for ttyLU0
@@ -3145,6 +3132,14 @@ Your cooperation is appreciated.
3145 1 = /dev/blockrom1 Second ROM card's translation layer interface 3132 1 = /dev/blockrom1 Second ROM card's translation layer interface
3146 ... 3133 ...
3147 3134
3135259 block Block Extended Major
3136 Used dynamically to hold additional partition minor
3137 numbers and allow large numbers of partitions per device
3138
3139259 char FPGA configuration interfaces
3140 0 = /dev/icap0 First Xilinx internal configuration
3141 1 = /dev/icap1 Second Xilinx internal configuration
3142
3148260 char OSD (Object-based-device) SCSI Device 3143260 char OSD (Object-based-device) SCSI Device
3149 0 = /dev/osd0 First OSD Device 3144 0 = /dev/osd0 First OSD Device
3150 1 = /dev/osd1 Second OSD Device 3145 1 = /dev/osd1 Second OSD Device
diff --git a/Documentation/dvb/get_dvb_firmware b/Documentation/dvb/get_dvb_firmware
index f2e908d7f90d..2f21ecd4c205 100644
--- a/Documentation/dvb/get_dvb_firmware
+++ b/Documentation/dvb/get_dvb_firmware
@@ -25,7 +25,7 @@ use IO::Handle;
25 "tda10046lifeview", "av7110", "dec2000t", "dec2540t", 25 "tda10046lifeview", "av7110", "dec2000t", "dec2540t",
26 "dec3000s", "vp7041", "dibusb", "nxt2002", "nxt2004", 26 "dec3000s", "vp7041", "dibusb", "nxt2002", "nxt2004",
27 "or51211", "or51132_qam", "or51132_vsb", "bluebird", 27 "or51211", "or51132_qam", "or51132_vsb", "bluebird",
28 "opera1"); 28 "opera1", "cx231xx", "cx18", "cx23885", "pvrusb2" );
29 29
30# Check args 30# Check args
31syntax() if (scalar(@ARGV) != 1); 31syntax() if (scalar(@ARGV) != 1);
@@ -37,8 +37,8 @@ for ($i=0; $i < scalar(@components); $i++) {
37 $outfile = eval($cid); 37 $outfile = eval($cid);
38 die $@ if $@; 38 die $@ if $@;
39 print STDERR <<EOF; 39 print STDERR <<EOF;
40Firmware $outfile extracted successfully. 40Firmware(s) $outfile extracted successfully.
41Now copy it to either /usr/lib/hotplug/firmware or /lib/firmware 41Now copy it(they) to either /usr/lib/hotplug/firmware or /lib/firmware
42(depending on configuration of firmware hotplug). 42(depending on configuration of firmware hotplug).
43EOF 43EOF
44 exit(0); 44 exit(0);
@@ -345,6 +345,85 @@ sub or51211 {
345 $fwfile; 345 $fwfile;
346} 346}
347 347
348sub cx231xx {
349 my $fwfile = "v4l-cx231xx-avcore-01.fw";
350 my $url = "http://linuxtv.org/downloads/firmware/$fwfile";
351 my $hash = "7d3bb956dc9df0eafded2b56ba57cc42";
352
353 checkstandard();
354
355 wgetfile($fwfile, $url);
356 verify($fwfile, $hash);
357
358 $fwfile;
359}
360
361sub cx18 {
362 my $url = "http://linuxtv.org/downloads/firmware/";
363
364 my %files = (
365 'v4l-cx23418-apu.fw' => '588f081b562f5c653a3db1ad8f65939a',
366 'v4l-cx23418-cpu.fw' => 'b6c7ed64bc44b1a6e0840adaeac39d79',
367 'v4l-cx23418-dig.fw' => '95bc688d3e7599fd5800161e9971cc55',
368 );
369
370 checkstandard();
371
372 my $allfiles;
373 foreach my $fwfile (keys %files) {
374 wgetfile($fwfile, "$url/$fwfile");
375 verify($fwfile, $files{$fwfile});
376 $allfiles .= " $fwfile";
377 }
378
379 $allfiles =~ s/^\s//;
380
381 $allfiles;
382}
383
384sub cx23885 {
385 my $url = "http://linuxtv.org/downloads/firmware/";
386
387 my %files = (
388 'v4l-cx23885-avcore-01.fw' => 'a9f8f5d901a7fb42f552e1ee6384f3bb',
389 'v4l-cx23885-enc.fw' => 'a9f8f5d901a7fb42f552e1ee6384f3bb',
390 );
391
392 checkstandard();
393
394 my $allfiles;
395 foreach my $fwfile (keys %files) {
396 wgetfile($fwfile, "$url/$fwfile");
397 verify($fwfile, $files{$fwfile});
398 $allfiles .= " $fwfile";
399 }
400
401 $allfiles =~ s/^\s//;
402
403 $allfiles;
404}
405
406sub pvrusb2 {
407 my $url = "http://linuxtv.org/downloads/firmware/";
408
409 my %files = (
410 'v4l-cx25840.fw' => 'dadb79e9904fc8af96e8111d9cb59320',
411 );
412
413 checkstandard();
414
415 my $allfiles;
416 foreach my $fwfile (keys %files) {
417 wgetfile($fwfile, "$url/$fwfile");
418 verify($fwfile, $files{$fwfile});
419 $allfiles .= " $fwfile";
420 }
421
422 $allfiles =~ s/^\s//;
423
424 $allfiles;
425}
426
348sub or51132_qam { 427sub or51132_qam {
349 my $fwfile = "dvb-fe-or51132-qam.fw"; 428 my $fwfile = "dvb-fe-or51132-qam.fw";
350 my $url = "http://linuxtv.org/downloads/firmware/$fwfile"; 429 my $url = "http://linuxtv.org/downloads/firmware/$fwfile";
diff --git a/Documentation/fb/00-INDEX b/Documentation/fb/00-INDEX
index caabbd395e61..a618fd99c9f0 100644
--- a/Documentation/fb/00-INDEX
+++ b/Documentation/fb/00-INDEX
@@ -11,8 +11,6 @@ aty128fb.txt
11 - info on the ATI Rage128 frame buffer driver. 11 - info on the ATI Rage128 frame buffer driver.
12cirrusfb.txt 12cirrusfb.txt
13 - info on the driver for Cirrus Logic chipsets. 13 - info on the driver for Cirrus Logic chipsets.
14cyblafb/
15 - directory with documentation files related to the cyblafb driver.
16deferred_io.txt 14deferred_io.txt
17 - an introduction to deferred IO. 15 - an introduction to deferred IO.
18fbcon.txt 16fbcon.txt
diff --git a/Documentation/fb/cyblafb/bugs b/Documentation/fb/cyblafb/bugs
deleted file mode 100644
index 9443a6d72cdd..000000000000
--- a/Documentation/fb/cyblafb/bugs
+++ /dev/null
@@ -1,13 +0,0 @@
1Bugs
2====
3
4I currently don't know of any bug. Please do send reports to:
5 - linux-fbdev-devel@lists.sourceforge.net
6 - Knut_Petersen@t-online.de.
7
8
9Untested features
10=================
11
12All LCD stuff is untested. If it worked in tridentfb, it should work in
13cyblafb. Please test and report the results to Knut_Petersen@t-online.de.
diff --git a/Documentation/fb/cyblafb/credits b/Documentation/fb/cyblafb/credits
deleted file mode 100644
index 0eb3b443dc2b..000000000000
--- a/Documentation/fb/cyblafb/credits
+++ /dev/null
@@ -1,7 +0,0 @@
1Thanks to
2=========
3 * Alan Hourihane, for writing the X trident driver
4 * Jani Monoses, for writing the tridentfb driver
5 * Antonino A. Daplas, for review of the first published
6 version of cyblafb and some code
7 * Jochen Hein, for testing and a helpful bug report
diff --git a/Documentation/fb/cyblafb/documentation b/Documentation/fb/cyblafb/documentation
deleted file mode 100644
index bb1aac048425..000000000000
--- a/Documentation/fb/cyblafb/documentation
+++ /dev/null
@@ -1,17 +0,0 @@
1Available Documentation
2=======================
3
4Apollo PLE 133 Chipset VT8601A North Bridge Datasheet, Rev. 1.82, October 22,
52001, available from VIA:
6
7 http://www.viavpsd.com/product/6/15/DS8601A182.pdf
8
9The datasheet is incomplete, some registers that need to be programmed are not
10explained at all and important bits are listed as "reserved". But you really
11need the datasheet to understand the code. "p. xxx" comments refer to page
12numbers of this document.
13
14XFree/XOrg drivers are available and of good quality, looking at the code
15there is a good idea if the datasheet does not provide enough information
16or if the datasheet seems to be wrong.
17
diff --git a/Documentation/fb/cyblafb/fb.modes b/Documentation/fb/cyblafb/fb.modes
deleted file mode 100644
index fe0e5223ba86..000000000000
--- a/Documentation/fb/cyblafb/fb.modes
+++ /dev/null
@@ -1,154 +0,0 @@
1#
2# Sample fb.modes file
3#
4# Provides an incomplete list of working modes for
5# the cyberblade/i1 graphics core.
6#
7# The value 4294967256 is used instead of -40. Of course, -40 is not
8# a really reasonable value, but chip design does not always follow
9# logic. Believe me, it's ok, and it's the way the BIOS does it.
10#
11# fbset requires 4294967256 in fb.modes and -40 as an argument to
12# the -t parameter. That's also not too reasonable, and it might change
13# in the future or might even be different for your current version.
14#
15
16mode "640x480-50"
17 geometry 640 480 2048 4096 8
18 timings 47619 4294967256 24 17 0 216 3
19endmode
20
21mode "640x480-60"
22 geometry 640 480 2048 4096 8
23 timings 39682 4294967256 24 17 0 216 3
24endmode
25
26mode "640x480-70"
27 geometry 640 480 2048 4096 8
28 timings 34013 4294967256 24 17 0 216 3
29endmode
30
31mode "640x480-72"
32 geometry 640 480 2048 4096 8
33 timings 33068 4294967256 24 17 0 216 3
34endmode
35
36mode "640x480-75"
37 geometry 640 480 2048 4096 8
38 timings 31746 4294967256 24 17 0 216 3
39endmode
40
41mode "640x480-80"
42 geometry 640 480 2048 4096 8
43 timings 29761 4294967256 24 17 0 216 3
44endmode
45
46mode "640x480-85"
47 geometry 640 480 2048 4096 8
48 timings 28011 4294967256 24 17 0 216 3
49endmode
50
51mode "800x600-50"
52 geometry 800 600 2048 4096 8
53 timings 30303 96 24 14 0 136 11
54endmode
55
56mode "800x600-60"
57 geometry 800 600 2048 4096 8
58 timings 25252 96 24 14 0 136 11
59endmode
60
61mode "800x600-70"
62 geometry 800 600 2048 4096 8
63 timings 21645 96 24 14 0 136 11
64endmode
65
66mode "800x600-72"
67 geometry 800 600 2048 4096 8
68 timings 21043 96 24 14 0 136 11
69endmode
70
71mode "800x600-75"
72 geometry 800 600 2048 4096 8
73 timings 20202 96 24 14 0 136 11
74endmode
75
76mode "800x600-80"
77 geometry 800 600 2048 4096 8
78 timings 18939 96 24 14 0 136 11
79endmode
80
81mode "800x600-85"
82 geometry 800 600 2048 4096 8
83 timings 17825 96 24 14 0 136 11
84endmode
85
86mode "1024x768-50"
87 geometry 1024 768 2048 4096 8
88 timings 19054 144 24 29 0 120 3
89endmode
90
91mode "1024x768-60"
92 geometry 1024 768 2048 4096 8
93 timings 15880 144 24 29 0 120 3
94endmode
95
96mode "1024x768-70"
97 geometry 1024 768 2048 4096 8
98 timings 13610 144 24 29 0 120 3
99endmode
100
101mode "1024x768-72"
102 geometry 1024 768 2048 4096 8
103 timings 13232 144 24 29 0 120 3
104endmode
105
106mode "1024x768-75"
107 geometry 1024 768 2048 4096 8
108 timings 12703 144 24 29 0 120 3
109endmode
110
111mode "1024x768-80"
112 geometry 1024 768 2048 4096 8
113 timings 11910 144 24 29 0 120 3
114endmode
115
116mode "1024x768-85"
117 geometry 1024 768 2048 4096 8
118 timings 11209 144 24 29 0 120 3
119endmode
120
121mode "1280x1024-50"
122 geometry 1280 1024 2048 4096 8
123 timings 11114 232 16 39 0 160 3
124endmode
125
126mode "1280x1024-60"
127 geometry 1280 1024 2048 4096 8
128 timings 9262 232 16 39 0 160 3
129endmode
130
131mode "1280x1024-70"
132 geometry 1280 1024 2048 4096 8
133 timings 7939 232 16 39 0 160 3
134endmode
135
136mode "1280x1024-72"
137 geometry 1280 1024 2048 4096 8
138 timings 7719 232 16 39 0 160 3
139endmode
140
141mode "1280x1024-75"
142 geometry 1280 1024 2048 4096 8
143 timings 7410 232 16 39 0 160 3
144endmode
145
146mode "1280x1024-80"
147 geometry 1280 1024 2048 4096 8
148 timings 6946 232 16 39 0 160 3
149endmode
150
151mode "1280x1024-85"
152 geometry 1280 1024 2048 4096 8
153 timings 6538 232 16 39 0 160 3
154endmode
diff --git a/Documentation/fb/cyblafb/performance b/Documentation/fb/cyblafb/performance
deleted file mode 100644
index 8d15d5dfc6b3..000000000000
--- a/Documentation/fb/cyblafb/performance
+++ /dev/null
@@ -1,79 +0,0 @@
1Speed
2=====
3
4CyBlaFB is much faster than tridentfb and vesafb. Compare the performance data
5for mode 1280x1024-[8,16,32]@61 Hz.
6
7Test 1: Cat a file with 2000 lines of 0 characters.
8Test 2: Cat a file with 2000 lines of 80 characters.
9Test 3: Cat a file with 2000 lines of 160 characters.
10
11All values show system time use in seconds, kernel 2.6.12 was used for
12the measurements. 2.6.13 is a bit slower, 2.6.14 hopefully will include a
13patch that speeds up kernel bitblitting a lot ( > 20%).
14
15+-----------+-----------------------------------------------------+
16| | not accelerated |
17| TRIDENTFB +-----------------+-----------------+-----------------+
18| of 2.6.12 | 8 bpp | 16 bpp | 32 bpp |
19| | noypan | ypan | noypan | ypan | noypan | ypan |
20+-----------+--------+--------+--------+--------+--------+--------+
21| Test 1 | 4.31 | 4.33 | 6.05 | 12.81 | ---- | ---- |
22| Test 2 | 67.94 | 5.44 | 123.16 | 14.79 | ---- | ---- |
23| Test 3 | 131.36 | 6.55 | 240.12 | 16.76 | ---- | ---- |
24+-----------+--------+--------+--------+--------+--------+--------+
25| Comments | | | completely bro- |
26| | | | ken, monitor |
27| | | | switches off |
28+-----------+-----------------+-----------------+-----------------+
29
30
31+-----------+-----------------------------------------------------+
32| | accelerated |
33| TRIDENTFB +-----------------+-----------------+-----------------+
34| of 2.6.12 | 8 bpp | 16 bpp | 32 bpp |
35| | noypan | ypan | noypan | ypan | noypan | ypan |
36+-----------+--------+--------+--------+--------+--------+--------+
37| Test 1 | ---- | ---- | 20.62 | 1.22 | ---- | ---- |
38| Test 2 | ---- | ---- | 22.61 | 3.19 | ---- | ---- |
39| Test 3 | ---- | ---- | 24.59 | 5.16 | ---- | ---- |
40+-----------+--------+--------+--------+--------+--------+--------+
41| Comments | broken, writing | broken, ok only | completely bro- |
42| | to wrong places | if bgcolor is | ken, monitor |
43| | on screen + bug | black, bug in | switches off |
44| | in fillrect() | fillrect() | |
45+-----------+-----------------+-----------------+-----------------+
46
47
48+-----------+-----------------------------------------------------+
49| | not accelerated |
50| VESAFB +-----------------+-----------------+-----------------+
51| of 2.6.12 | 8 bpp | 16 bpp | 32 bpp |
52| | noypan | ypan | noypan | ypan | noypan | ypan |
53+-----------+--------+--------+--------+--------+--------+--------+
54| Test 1 | 4.26 | 3.76 | 5.99 | 7.23 | ---- | ---- |
55| Test 2 | 65.65 | 4.89 | 120.88 | 9.08 | ---- | ---- |
56| Test 3 | 126.91 | 5.94 | 235.77 | 11.03 | ---- | ---- |
57+-----------+--------+--------+--------+--------+--------+--------+
58| Comments | vga=0x307 | vga=0x31a | vga=0x31b not |
59| | fh=80kHz | fh=80kHz | supported by |
60| | fv=75kHz | fv=75kHz | video BIOS and |
61| | | | hardware |
62+-----------+-----------------+-----------------+-----------------+
63
64
65+-----------+-----------------------------------------------------+
66| | accelerated |
67| CYBLAFB +-----------------+-----------------+-----------------+
68| | 8 bpp | 16 bpp | 32 bpp |
69| | noypan | ypan | noypan | ypan | noypan | ypan |
70+-----------+--------+--------+--------+--------+--------+--------+
71| Test 1 | 8.02 | 0.23 | 19.04 | 0.61 | 57.12 | 2.74 |
72| Test 2 | 8.38 | 0.55 | 19.39 | 0.92 | 57.54 | 3.13 |
73| Test 3 | 8.73 | 0.86 | 19.74 | 1.24 | 57.95 | 3.51 |
74+-----------+--------+--------+--------+--------+--------+--------+
75| Comments | | | |
76| | | | |
77| | | | |
78| | | | |
79+-----------+-----------------+-----------------+-----------------+
diff --git a/Documentation/fb/cyblafb/todo b/Documentation/fb/cyblafb/todo
deleted file mode 100644
index c5f6d0eae545..000000000000
--- a/Documentation/fb/cyblafb/todo
+++ /dev/null
@@ -1,31 +0,0 @@
1TODO / Missing features
2=======================
3
4Verify LCD stuff "stretch" and "center" options are
5 completely untested ... this code needs to be
6 verified. As I don't have access to such
7 hardware, please contact me if you are
8 willing to run some tests.
9
10Interlaced video modes The reason that interlaced
11 modes are disabled is that I do not know
12 the meaning of the vertical interlace
13 parameter. Also the datasheet mentions a
14 bit d8 of a horizontal interlace parameter,
15 but nowhere the lower 8 bits. Please help
16 if you can.
17
18low-res double scan modes Who needs it?
19
20accelerated color blitting Who needs it? The console driver does use color
21 blitting for nothing but drawing the penguin,
22 everything else is done using color expanding
23 blitting of 1bpp character bitmaps.
24
25ioctls Who needs it?
26
27TV-out Will be done later. Use "vga= " at boot time
28 to set a suitable video mode.
29
30??? Feel free to contact me if you have any
31 feature requests
diff --git a/Documentation/fb/cyblafb/usage b/Documentation/fb/cyblafb/usage
deleted file mode 100644
index a39bb3d402a2..000000000000
--- a/Documentation/fb/cyblafb/usage
+++ /dev/null
@@ -1,217 +0,0 @@
1CyBlaFB is a framebuffer driver for the Cyberblade/i1 graphics core integrated
2into the VIA Apollo PLE133 (aka vt8601) south bridge. It is developed and
3tested using a VIA EPIA 5000 board.
4
5Cyblafb - compiled into the kernel or as a module?
6==================================================
7
8You might compile cyblafb either as a module or compile it permanently into the
9kernel.
10
11Unless you have a real reason to do so you should not compile both vesafb and
12cyblafb permanently into the kernel. It's possible and it helps during the
13development cycle, but it's useless and will at least block some otherwise
14useful memory for ordinary users.
15
16Selecting Modes
17===============
18
19 Startup Mode
20 ============
21
22 First of all, you might use the "vga=???" boot parameter as it is
23 documented in vesafb.txt and svga.txt. Cyblafb will detect the video
24 mode selected and will use the geometry and timings found by
25 inspecting the hardware registers.
26
27 video=cyblafb vga=0x317
28
29 Alternatively you might use a combination of the mode, ref and bpp
30 parameters. If you compiled the driver into the kernel, add something
31 like this to the kernel command line:
32
33 video=cyblafb:1280x1024,bpp=16,ref=50 ...
34
35 If you compiled the driver as a module, the same mode would be
36 selected by the following command:
37
38 modprobe cyblafb mode=1280x1024 bpp=16 ref=50 ...
39
40 None of the modes possible to select as startup modes are affected by
41 the problems described at the end of the next subsection.
42
43 For all startup modes cyblafb chooses a virtual x resolution of 2048,
44 the only exception is mode 1280x1024 in combination with 32 bpp. This
45 allows ywrap scrolling for all those modes if rotation is 0 or 2, and
46 also fast scrolling if rotation is 1 or 3. The default virtual y reso-
47 lution is 4096 for bpp == 8, 2048 for bpp==16 and 1024 for bpp == 32,
48 again with the only exception of 1280x1024 at 32 bpp.
49
50 Please do set your video memory size to 8 Mb in the Bios setup. Other
51 values will work, but performance is decreased for a lot of modes.
52
53 Mode changes using fbset
54 ========================
55
56 You might use fbset to change the video mode, see "man fbset". Cyblafb
57 generally does assume that you know what you are doing. But it does
58 some checks, especially those that are needed to prevent you from
59 damaging your hardware.
60
61 - only 8, 16, 24 and 32 bpp video modes are accepted
62 - interlaced video modes are not accepted
63 - double scan video modes are not accepted
64 - if a flat panel is found, cyblafb does not allow you
65 to program a resolution higher than the physical
66 resolution of the flat panel monitor
67 - cyblafb does not allow vclk to exceed 230 MHz. As 32 bpp
68 and (currently) 24 bit modes use a doubled vclk internally,
69 the dotclock limit as seen by fbset is 115 MHz for those
70 modes and 230 MHz for 8 and 16 bpp modes.
71 - cyblafb will allow you to select very high resolutions as
72 long as the hardware can be programmed to these modes. The
73 documented limit 1600x1200 is not enforced, but don't expect
74 perfect signal quality.
75
76 Any request that violates the rules given above will be either changed
77 to something the hardware supports or an error value will be returned.
78
79 If you program a virtual y resolution higher than the hardware limit,
80 cyblafb will silently decrease that value to the highest possible
81 value. The same is true for a virtual x resolution that is not
82 supported by the hardware. Cyblafb tries to adapt vyres first because
83 vxres decides if ywrap scrolling is possible or not.
84
85 Attempts to disable acceleration are ignored, I believe that this is
86 safe.
87
88 Some video modes that should work do not work as expected. If you use
89 the standard fb.modes, fbset 640x480-60 will program that mode, but
90 you will see a vertical area, about two characters wide, with only
91 much darker characters than the other characters on the screen.
92 Cyblafb does allow that mode to be set, as it does not violate the
93 official specifications. It would need a lot of code to reliably sort
94 out all invalid modes, playing around with the margin values will
95 give a valid mode quickly. And if cyblafb would detect such an invalid
96 mode, should it silently alter the requested values or should it
97 report an error? Both options have some pros and cons. As stated
98 above, none of the startup modes are affected, and if you set
99 verbosity to 1 or higher, cyblafb will print the fbset command that
100 would be needed to program that mode using fbset.
101
102
103Other Parameters
104================
105
106
107crt don't autodetect, assume monitor connected to
108 standard VGA connector
109
110fp don't autodetect, assume flat panel display
111 connected to flat panel monitor interface
112
113nativex inform driver about native x resolution of
114 flat panel monitor connected to special
115 interface (should be autodetected)
116
117stretch stretch image to adapt low resolution modes to
118 higher resolutions of flat panel monitors
119 connected to special interface
120
121center center image to adapt low resolution modes to
122 higher resolutions of flat panel monitors
123 connected to special interface
124
125memsize use if autodetected memsize is wrong ...
126 should never be necessary
127
128nopcirr disable PCI read retry
129nopciwr disable PCI write retry
130nopcirb disable PCI read bursts
131nopciwb disable PCI write bursts
132
133bpp bpp for specified modes
134 valid values: 8 || 16 || 24 || 32
135
136ref refresh rate for specified mode
137 valid values: 50 <= ref <= 85
138
139mode 640x480 or 800x600 or 1024x768 or 1280x1024
140 if not specified, the startup mode will be detected
141 and used, so you might also use the vga=??? parameter
142 described in vesafb.txt. If you do not specify a mode,
143 bpp and ref parameters are ignored.
144
145verbosity 0 is the default, increase to at least 2 for every
146 bug report!
147
148Development hints
149=================
150
151It's much faster to compile a module and to load the new version after
152unloading the old module than to compile a new kernel and to reboot. So if you
153try to work on cyblafb, it might be a good idea to use cyblafb as a module.
154In real life, fast often means dangerous, and that's also the case here. If
155you introduce a serious bug when cyblafb is compiled into the kernel, the
156kernel will lock or oops with a high probability before the file system is
157mounted, and the danger for your data is low. If you load a broken own version
158of cyblafb on a running system, the danger for the integrity of the file
159system is much higher as you might need a hard reset afterwards. Decide
160yourself.
161
162Module unloading, the vfb method
163================================
164
165If you want to unload/reload cyblafb using the virtual framebuffer, you need
166to enable vfb support in the kernel first. After that, load the modules as
167shown below:
168
169 modprobe vfb vfb_enable=1
170 modprobe fbcon
171 modprobe cyblafb
172 fbset -fb /dev/fb1 1280x1024-60 -vyres 2662
173 con2fb /dev/fb1 /dev/tty1
174 ...
175
176If you now made some changes to cyblafb and want to reload it, you might do it
177as shown below:
178
179 con2fb /dev/fb0 /dev/tty1
180 ...
181 rmmod cyblafb
182 modprobe cyblafb
183 con2fb /dev/fb1 /dev/tty1
184 ...
185
186Of course, you might choose another mode, and most certainly you also want to
187map some other /dev/tty* to the real framebuffer device. You might also choose
188to compile fbcon as a kernel module or place it permanently in the kernel.
189
190I do not know of any way to unload fbcon, and fbcon will prevent the
191framebuffer device loaded first from unloading. [If there is a way, then
192please add a description here!]
193
194Module unloading, the vesafb method
195===================================
196
197Configure the kernel:
198
199 <*> Support for frame buffer devices
200 [*] VESA VGA graphics support
201 <M> Cyberblade/i1 support
202
203Add e.g. "video=vesafb:ypan vga=0x307" to the kernel parameters. The ypan
204parameter is important, choose any vga parameter you like as long as it is
205a graphics mode.
206
207After booting, load cyblafb without any mode and bpp parameter and assign
208cyblafb to individual ttys using con2fb, e.g.:
209
210 modprobe cyblafb
211 con2fb /dev/fb1 /dev/tty1
212
213Unloading cyblafb works without problems after you assign vesafb to all
214ttys again, e.g.:
215
216 con2fb /dev/fb0 /dev/tty1
217 rmmod cyblafb
diff --git a/Documentation/fb/cyblafb/whatsnew b/Documentation/fb/cyblafb/whatsnew
deleted file mode 100644
index 76c07a26e044..000000000000
--- a/Documentation/fb/cyblafb/whatsnew
+++ /dev/null
@@ -1,29 +0,0 @@
10.62
2====
3
4 - the vesafb parameter has been removed as I decided to allow the
5 feature without any special parameter.
6
7 - Cyblafb does not use the vga style of panning any longer, now the
8 "right view" register in the graphics engine IO space is used. Without
9 that change it was impossible to use all available memory, and without
10 access to all available memory it is impossible to ywrap.
11
12 - The imageblit function now uses hardware acceleration for all font
13 widths. Hardware blitting across pixel column 2048 is broken in the
14 cyberblade/i1 graphics core, but we work around that hardware bug.
15
16 - modes with vxres != xres are supported now.
17
18 - ywrap scrolling is supported now and the default. This is a big
19 performance gain.
20
21 - default video modes use vyres > yres and vxres > xres to allow
22 almost optimal scrolling speed for normal and rotated screens
23
24 - some features mainly useful for debugging the upper layers of the
25 framebuffer system have been added, have a look at the code
26
27 - fixed: Oops after unloading cyblafb when reading /proc/io*
28
29 - we work around some bugs of the higher framebuffer layers.
diff --git a/Documentation/fb/cyblafb/whycyblafb b/Documentation/fb/cyblafb/whycyblafb
deleted file mode 100644
index a123bc11e698..000000000000
--- a/Documentation/fb/cyblafb/whycyblafb
+++ /dev/null
@@ -1,85 +0,0 @@
1I tried the following framebuffer drivers:
2
3 - TRIDENTFB is full of bugs. Acceleration is broken for Blade3D
4 graphics cores like the cyberblade/i1. It claims to support a great
5 number of devices, but documentation for most of these devices is
6 unfortunately not available. There is _no_ reason to use tridentfb
7 for cyberblade/i1 + CRT users. VESAFB is faster, and the one
8 advantage, mode switching, is broken in tridentfb.
9
10 - VESAFB is used by many distributions as a standard. Vesafb does
11 not support mode switching. VESAFB is a bit faster than the working
12 configurations of TRIDENTFB, but it is still too slow, even if you
13 use ypan.
14
15 - EPIAFB (you'll find it on sourceforge) supports the Cyberblade/i1
16 graphics core, but it still has serious bugs and development seems
17 to have stopped. This is the one driver with TV-out support. If you
18 do need this feature, try epiafb.
19
20None of these drivers was a real option for me.
21
22I believe that it is unreasonable to change code that claims to support 20
23devices if I only have more or less sufficient documentation for exactly one
24of these. The risk of breaking device foo while fixing device bar is too high.
25
26So I decided to start CyBlaFB as a stripped down tridentfb.
27
28All code specific to other Trident chips has been removed. After that there
29were a lot of cosmetic changes to increase the readability of the code. All
30register names were changed to those mnemonics used in the datasheet. Function
31and macro names were changed if they hindered easy understanding of the code.
32
33After that I debugged the code and implemented some new features. I'll try to
34give a little summary of the main changes:
35
36 - calculation of vertical and horizontal timings was fixed
37
38 - video signal quality has been improved dramatically
39
40 - acceleration:
41
42 - fillrect and copyarea were fixed and reenabled
43
44 - color expanding imageblit was newly implemented, color
45 imageblit (only used to draw the penguin) still uses the
46 generic code.
47
48 - init of the acceleration engine was improved and moved to a
49 place where it really works ...
50
51 - sync function has a timeout now and tries to reset and
52 reinit the accel engine if necessary
53
54 - fewer slow copyarea calls when doing ypan scrolling by using
55 undocumented bit d21 of screen start address stored in
56 CR2B[5]. BIOS does use it also, so this should be safe.
57
58 - cyblafb rejects any attempt to set modes that would cause vclk
59 values above reasonable 230 MHz. 32bit modes use a clock
60 multiplicator of 2, so fbset does show the correct values for
61 pixclock but not for vclk in this case. The fbset limit is 115 MHz
62 for 32 bpp modes.
63
64 - cyblafb rejects modes known to be broken or unimplemented (all
65 interlaced modes, all doublescan modes for now)
66
67 - cyblafb now works independent of the video mode in effect at startup
68 time (tridentfb does not init all needed registers to reasonable
69 values)
70
71 - switching between video modes does work reliably now
72
73 - the first video mode now is the one selected on startup using the
74 vga=???? mechanism or any of
75 - 640x480, 800x600, 1024x768, 1280x1024
76 - 8, 16, 24 or 32 bpp
77 - refresh between 50 Hz and 85 Hz, 1 Hz steps (1280x1024-32
78 is limited to 63Hz)
79
80 - pci retry and pci burst mode are settable (try to disable if you
81 experience latency problems)
82
83 - built as a module cyblafb might be unloaded and reloaded using
84 the vfb module and con2vt or might be used together with vesafb
85
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 7907586c6e08..7e2af10e8264 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -64,10 +64,10 @@ Who: Pavel Machek <pavel@suse.cz>
64 64
65--------------------------- 65---------------------------
66 66
67What: Video4Linux API 1 ioctls and video_decoder.h from Video devices. 67What: Video4Linux API 1 ioctls from Video devices.
68When: December 2008 68When: July 2009
69Files: include/linux/video_decoder.h include/linux/videodev.h 69Files: include/linux/videodev.h
70Check: include/linux/video_decoder.h include/linux/videodev.h 70Check: include/linux/videodev.h
71Why: V4L1 API was replaced by V4L2 API during migration from 2.4 to 2.6 71Why: V4L1 API was replaced by V4L2 API during migration from 2.4 to 2.6
72 series. The old API has lots of drawbacks and doesn't provide enough 72 series. The old API has lots of drawbacks and doesn't provide enough
73 means to work with all video and audio standards. The newer API is 73 means to work with all video and audio standards. The newer API is
@@ -255,6 +255,16 @@ Who: Jan Engelhardt <jengelh@computergmbh.de>
255 255
256--------------------------- 256---------------------------
257 257
258What: GPIO autorequest on gpio_direction_{input,output}() in gpiolib
259When: February 2010
260Why: All callers should use explicit gpio_request()/gpio_free().
261 The autorequest mechanism in gpiolib was provided mostly as a
262 migration aid for legacy GPIO interfaces (for SOC based GPIOs).
263 Those users have now largely migrated. Platforms implementing
264 the GPIO interfaces without using gpiolib will see no changes.
265Who: David Brownell <dbrownell@users.sourceforge.net>
266---------------------------
267
258What: b43 support for firmware revision < 410 268What: b43 support for firmware revision < 410
259When: The schedule was July 2008, but it was decided that we are going to keep the 269When: The schedule was July 2008, but it was decided that we are going to keep the
260 code as long as there are no major maintenance headaches. 270 code as long as there are no major maintenance headaches.
@@ -273,13 +283,6 @@ Who: Glauber Costa <gcosta@redhat.com>
273 283
274--------------------------- 284---------------------------
275 285
276What: remove HID compat support
277When: 2.6.29
278Why: needed only as a temporary solution until distros fix themselves up
279Who: Jiri Slaby <jirislaby@gmail.com>
280
281---------------------------
282
283What: print_fn_descriptor_symbol() 286What: print_fn_descriptor_symbol()
284When: October 2009 287When: October 2009
285Why: The %pF vsprintf format provides the same functionality in a 288Why: The %pF vsprintf format provides the same functionality in a
@@ -311,6 +314,18 @@ Who: Vlad Yasevich <vladislav.yasevich@hp.com>
311 314
312--------------------------- 315---------------------------
313 316
317What: Ability for non root users to shm_get hugetlb pages based on mlock
318 resource limits
319When: 2.6.31
320Why: Non root users need to be part of /proc/sys/vm/hugetlb_shm_group or
321 have CAP_IPC_LOCK to be able to allocate shm segments backed by
322 huge pages. The mlock based rlimit check to allow shm hugetlb is
323 inconsistent with mmap based allocations. Hence it is being
324 deprecated.
325Who: Ravikiran Thirumalai <kiran@scalex86.org>
326
327---------------------------
328
314What: CONFIG_THERMAL_HWMON 329What: CONFIG_THERMAL_HWMON
315When: January 2009 330When: January 2009
316Why: This option was introduced just to allow older lm-sensors userspace 331Why: This option was introduced just to allow older lm-sensors userspace
@@ -339,7 +354,8 @@ Who: Krzysztof Piotr Oledzki <ole@ans.pl>
339 354
340--------------------------- 355---------------------------
341 356
342What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client() 357What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client(),
358 i2c_adapter->client_register(), i2c_adapter->client_unregister()
343When: 2.6.30 359When: 2.6.30
344Check: i2c_attach_client i2c_detach_client 360Check: i2c_attach_client i2c_detach_client
345Why: Deprecated by the new (standard) device driver binding model. Use 361Why: Deprecated by the new (standard) device driver binding model. Use
@@ -356,17 +372,6 @@ Who: Hans de Goede <hdegoede@redhat.com>
356 372
357--------------------------- 373---------------------------
358 374
359What: SELinux "compat_net" functionality
360When: 2.6.30 at the earliest
361Why: In 2.6.18 the Secmark concept was introduced to replace the "compat_net"
362 network access control functionality of SELinux. Secmark offers both
363 better performance and greater flexibility than the "compat_net"
364 mechanism. Now that the major Linux distributions have moved to
365 Secmark, it is time to deprecate the older mechanism and start the
366 process of removing the old code.
367Who: Paul Moore <paul.moore@hp.com>
368---------------------------
369
370What: sysfs ui for changing p4-clockmod parameters 375What: sysfs ui for changing p4-clockmod parameters
371When: September 2009 376When: September 2009
372Why: See commits 129f8ae9b1b5be94517da76009ea956e89104ce8 and 377Why: See commits 129f8ae9b1b5be94517da76009ea956e89104ce8 and
@@ -391,3 +396,35 @@ Why: The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t)
391 have been kept around for migration reasons. After more than two years 396 have been kept around for migration reasons. After more than two years
392 it's time to remove them finally 397 it's time to remove them finally
393Who: Thomas Gleixner <tglx@linutronix.de> 398Who: Thomas Gleixner <tglx@linutronix.de>
399
400---------------------------
401
402What: fakephp and associated sysfs files in /sys/bus/pci/slots/
403When: 2011
404Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to
405 represent a machine's physical PCI slots. The change in semantics
406 had userspace implications, as the hotplug core no longer allowed
407 drivers to create multiple sysfs files per physical slot (required
408 for multi-function devices, e.g.). fakephp was seen as a developer's
409 tool only, and its interface changed. Too late, we learned that
410 there were some users of the fakephp interface.
411
412 In 2.6.30, the original fakephp interface was restored. At the same
413 time, the PCI core gained the ability that fakephp provided, namely
414 function-level hot-remove and hot-add.
415
416 Since the PCI core now provides the same functionality, exposed in:
417
418 /sys/bus/pci/rescan
419 /sys/bus/pci/devices/.../remove
420 /sys/bus/pci/devices/.../rescan
421
422 there is no functional reason to maintain fakephp as well.
423
424 We will keep the existing module so that 'modprobe fakephp' will
425 present the old /sys/bus/pci/slots/... interface for compatibility,
426 but users are urged to migrate their applications to the API above.
427
428 After a reasonable transition period, we will remove the legacy
429 fakephp interface.
430Who: Alex Chiang <achiang@hp.com>
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 4e78ce677843..76efe5b71d7d 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -505,7 +505,7 @@ prototypes:
505 void (*open)(struct vm_area_struct*); 505 void (*open)(struct vm_area_struct*);
506 void (*close)(struct vm_area_struct*); 506 void (*close)(struct vm_area_struct*);
507 int (*fault)(struct vm_area_struct*, struct vm_fault *); 507 int (*fault)(struct vm_area_struct*, struct vm_fault *);
508 int (*page_mkwrite)(struct vm_area_struct *, struct page *); 508 int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *);
509 int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); 509 int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
510 510
511locking rules: 511locking rules:
diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.txt
new file mode 100644
index 000000000000..382d52cdaf2d
--- /dev/null
+++ b/Documentation/filesystems/caching/backend-api.txt
@@ -0,0 +1,658 @@
1 ==========================
2 FS-CACHE CACHE BACKEND API
3 ==========================
4
5The FS-Cache system provides an API by which actual caches can be supplied to
6FS-Cache for it to then serve out to network filesystems and other interested
7parties.
8
9This API is declared in <linux/fscache-cache.h>.
10
11
12====================================
13INITIALISING AND REGISTERING A CACHE
14====================================
15
16To start off, a cache definition must be initialised and registered for each
17cache the backend wants to make available. For instance, CacheFS does this in
18the fill_super() operation on mounting.
19
20The cache definition (struct fscache_cache) should be initialised by calling:
21
22 void fscache_init_cache(struct fscache_cache *cache,
23 struct fscache_cache_ops *ops,
24 const char *idfmt,
25 ...);
26
27Where:
28
29 (*) "cache" is a pointer to the cache definition;
30
31 (*) "ops" is a pointer to the table of operations that the backend supports on
32 this cache; and
33
34 (*) "idfmt" is a format and printf-style arguments for constructing a label
35 for the cache.
36
37
38The cache should then be registered with FS-Cache by passing a pointer to the
39previously initialised cache definition to:
40
41 int fscache_add_cache(struct fscache_cache *cache,
42 struct fscache_object *fsdef,
43 const char *tagname);
44
45Two extra arguments should also be supplied:
46
47 (*) "fsdef" which should point to the object representation for the FS-Cache
48 master index in this cache. Netfs primary index entries will be created
49 here. FS-Cache keeps the caller's reference to the index object if
50 successful and will release it upon withdrawal of the cache.
51
52 (*) "tagname" which, if given, should be a text string naming this cache. If
53 this is NULL, the identifier will be used instead. For CacheFS, the
54 identifier is set to name the underlying block device and the tag can be
55 supplied by mount.
56
57This function may return -ENOMEM if it ran out of memory or -EEXIST if the tag
58is already in use. 0 will be returned on success.
59
60
61=====================
62UNREGISTERING A CACHE
63=====================
64
65A cache can be withdrawn from the system by calling this function with a
66pointer to the cache definition:
67
68 void fscache_withdraw_cache(struct fscache_cache *cache);
69
70In CacheFS's case, this is called by put_super().
71
72
73========
74SECURITY
75========
76
77The cache methods are executed in one of two contexts:
78
79 (1) that of the userspace process that issued the netfs operation that caused
80 the cache method to be invoked, or
81
82 (2) that of one of the processes in the FS-Cache thread pool.
83
84In either case, this may not be an appropriate context in which to access the
85cache.
86
87The calling process's fsuid, fsgid and SELinux security identities may need to
88be masqueraded for the duration of the cache driver's access to the cache.
89This is left to the cache to handle; FS-Cache makes no effort in this regard.
90
91
92===================================
93CONTROL AND STATISTICS PRESENTATION
94===================================
95
96The cache may present data to the outside world through FS-Cache's interfaces
97in sysfs and procfs - the former for control and the latter for statistics.
98
99A sysfs directory called /sys/fs/fscache/<cachetag>/ is created if CONFIG_SYSFS
100is enabled. This is accessible through the kobject struct fscache_cache::kobj
101and is for use by the cache as it sees fit.
102
103
104========================
105RELEVANT DATA STRUCTURES
106========================
107
108 (*) Index/Data file FS-Cache representation cookie:
109
110 struct fscache_cookie {
111 struct fscache_object_def *def;
112 struct fscache_netfs *netfs;
113 void *netfs_data;
114 ...
115 };
116
117 The fields that might be of use to the backend describe the object
118 definition, the netfs definition and the netfs's data for this cookie.
119 The object definition contains functions supplied by the netfs for loading
120 and matching index entries; these are required to provide some of the
121 cache operations.
122
123
124 (*) In-cache object representation:
125
126 struct fscache_object {
127 int debug_id;
128 enum {
129 FSCACHE_OBJECT_RECYCLING,
130 ...
131 } state;
132 spinlock_t lock;
133 struct fscache_cache *cache;
134 struct fscache_cookie *cookie;
135 ...
136 };
137
138 Structures of this type should be allocated by the cache backend and
139 passed to FS-Cache when requested by the appropriate cache operation. In
140 the case of CacheFS, they're embedded in CacheFS's internal object
141 structures.
142
143 The debug_id is a simple integer that can be used in debugging messages
144 that refer to a particular object. In such a case it should be printed
145 using "OBJ%x" to be consistent with FS-Cache.
146
147 Each object contains a pointer to the cookie that represents the object it
148 is backing. An object should be retired when put_object() is called if it is
149 in state FSCACHE_OBJECT_RECYCLING. The fscache_object struct should be
150 initialised by calling fscache_object_init(object).
151
152
153 (*) FS-Cache operation record:
154
155 struct fscache_operation {
156 atomic_t usage;
157 struct fscache_object *object;
158 unsigned long flags;
159 #define FSCACHE_OP_EXCLUSIVE
160 void (*processor)(struct fscache_operation *op);
161 void (*release)(struct fscache_operation *op);
162 ...
163 };
164
165 FS-Cache has a pool of threads that it uses to give CPU time to the
166 various asynchronous operations that need to be done as part of driving
167 the cache. These are represented by the above structure. The processor
168 method is called to give the op CPU time, and the release method to get
169 rid of it when its usage count reaches 0.
170
171 An operation can be made exclusive upon an object by setting the
172 appropriate flag before enqueuing it with fscache_enqueue_operation(). If
173 an operation needs more processing time, it should be enqueued again.
174
175
176 (*) FS-Cache retrieval operation record:
177
178 struct fscache_retrieval {
179 struct fscache_operation op;
180 struct address_space *mapping;
181 struct list_head *to_do;
182 ...
183 };
184
185 A structure of this type is allocated by FS-Cache to record retrieval and
186 allocation requests made by the netfs. This struct is then passed to the
187 backend to do the operation. The backend may get extra refs to it by
188 calling fscache_get_retrieval() and refs may be discarded by calling
189 fscache_put_retrieval().
190
191 A retrieval operation can be used by the backend to do retrieval work. To
192 do this, the retrieval->op.processor method pointer should be set
193 appropriately by the backend and fscache_enqueue_retrieval() called to
194 submit it to the thread pool. CacheFiles, for example, uses this to queue
195 page examination when it detects PG_lock being cleared.
196
197 The to_do field is an empty list available for the cache backend to use as
198 it sees fit.
199
200
201 (*) FS-Cache storage operation record:
202
203 struct fscache_storage {
204 struct fscache_operation op;
205 pgoff_t store_limit;
206 ...
207 };
208
209 A structure of this type is allocated by FS-Cache to record outstanding
210 writes to be made. FS-Cache itself enqueues this operation and invokes
211 the write_page() method on the object at appropriate times to effect
212 storage.
213
214
215================
216CACHE OPERATIONS
217================
218
219The cache backend provides FS-Cache with a table of operations that can be
220performed on the denizens of the cache. These are held in a structure of type:
221
222 struct fscache_cache_ops
223
224 (*) Name of cache provider [mandatory]:
225
226 const char *name
227
228 This isn't strictly an operation, but should be pointed at a string naming
229 the backend.
230
231
232 (*) Allocate a new object [mandatory]:
233
234 struct fscache_object *(*alloc_object)(struct fscache_cache *cache,
235 struct fscache_cookie *cookie)
236
237 This method is used to allocate a cache object representation to back a
238 cookie in a particular cache. fscache_object_init() should be called on
239 the object to initialise it prior to returning.
240
241 This function may also be used to parse the index key to be used for
242 multiple lookup calls to turn it into a more convenient form. FS-Cache
243 will call the lookup_complete() method to allow the cache to release the
244 form once lookup is complete or aborted.
245
246
247 (*) Look up and create object [mandatory]:
248
249 void (*lookup_object)(struct fscache_object *object)
250
251 This method is used to look up an object, given that the object is already
252 allocated and attached to the cookie. This should instantiate that object
253 in the cache if it can.
254
255 The method should call fscache_object_lookup_negative() as soon as
256 possible if it determines the object doesn't exist in the cache. If the
257 object is found to exist and the netfs indicates that it is valid then
258 fscache_obtained_object() should be called once the object is in a
259 position to have data stored in it. Similarly, fscache_obtained_object()
260 should also be called once a non-present object has been created.
261
262 If a lookup error occurs, fscache_object_lookup_error() should be called
263 to abort the lookup of that object.
264
265
266 (*) Release lookup data [mandatory]:
267
268 void (*lookup_complete)(struct fscache_object *object)
269
270 This method is called to ask the cache to release any resources it was
271 using to perform a lookup.
272
273
274 (*) Increment object refcount [mandatory]:
275
276 struct fscache_object *(*grab_object)(struct fscache_object *object)
277
278 This method is called to increment the reference count on an object. It
279 may fail (for instance if the cache is being withdrawn) by returning NULL.
280 It should return the object pointer if successful.
281
282
283 (*) Lock/Unlock object [mandatory]:
284
285 void (*lock_object)(struct fscache_object *object)
286 void (*unlock_object)(struct fscache_object *object)
287
288 These methods are used to exclusively lock an object. It must be possible
289 to schedule with the lock held, so a spinlock isn't sufficient.
290
291
292 (*) Pin/Unpin object [optional]:
293
294 int (*pin_object)(struct fscache_object *object)
295 void (*unpin_object)(struct fscache_object *object)
296
297 These methods are used to pin an object into the cache. Once pinned an
298 object cannot be reclaimed to make space. Return -ENOSPC if there's not
299 enough space in the cache to permit this.
300
301
302 (*) Update object [mandatory]:
303
304 int (*update_object)(struct fscache_object *object)
305
306 This is called to update the index entry for the specified object. The
307 new information should be in object->cookie->netfs_data. This can be
308 obtained by calling object->cookie->def->get_aux()/get_attr().
309
310
311 (*) Discard object [mandatory]:
312
313 void (*drop_object)(struct fscache_object *object)
314
315 This method is called to indicate that an object has been unbound from its
316 cookie, and that the cache should release the object's resources and
317 retire it if it's in state FSCACHE_OBJECT_RECYCLING.
318
319 This method should not attempt to release any references held by the
320 caller. The caller will invoke the put_object() method as appropriate.
321
322
323 (*) Release object reference [mandatory]:
324
325 void (*put_object)(struct fscache_object *object)
326
327 This method is used to discard a reference to an object. The object may
328 be freed when all the references to it are released.
329
330
331 (*) Synchronise a cache [mandatory]:
332
333 void (*sync)(struct fscache_cache *cache)
334
335 This is called to ask the backend to synchronise a cache with its backing
336 device.
337
338
339 (*) Dissociate a cache [mandatory]:
340
341 void (*dissociate_pages)(struct fscache_cache *cache)
342
343 This is called to ask a cache to perform any page dissociations as part of
344 cache withdrawal.
345
346
347 (*) Notification that the attributes on a netfs file changed [mandatory]:
348
349 int (*attr_changed)(struct fscache_object *object);
350
351 This is called to indicate to the cache that certain attributes on a netfs
352 file have changed (for example the maximum size a file may reach). The
353 cache can read these from the netfs by calling the cookie's get_attr()
354 method.
355
356 The cache may use the file size information to reserve space on the cache.
357 It should also call fscache_set_store_limit() to indicate to FS-Cache the
358 highest byte it's willing to store for an object.
359
360 This method may return -ve if an error occurred or the cache object cannot
361 be expanded. In such a case, the object will be withdrawn from service.
362
363 This operation is run asynchronously from FS-Cache's thread pool, and
364 storage and retrieval operations from the netfs are excluded during the
365 execution of this operation.
366
367
368 (*) Reserve cache space for an object's data [optional]:
369
370 int (*reserve_space)(struct fscache_object *object, loff_t size);
371
372 This is called to request that cache space be reserved to hold the data
373 for an object and the metadata used to track it. Zero size should be
374 taken as a request to cancel a reservation.
375
376 This should return 0 if successful, -ENOSPC if there isn't enough space
377 available, or -ENOMEM or -EIO on other errors.
378
379 The reservation may exceed the current size of the object, thus permitting
380 future expansion. If the amount of space consumed by an object would
381 exceed the reservation, it's permitted to refuse requests to allocate
382 pages, but not required. An object may be pruned down to its reservation
383 size if larger than that already.
384
385
386 (*) Request page be read from cache [mandatory]:
387
388 int (*read_or_alloc_page)(struct fscache_retrieval *op,
389 struct page *page,
390 gfp_t gfp)
391
392 This is called to attempt to read a netfs page from the cache, or to
393 reserve a backing block if not. FS-Cache will have done as much checking
394 as it can before calling, but most of the work belongs to the backend.
395
396 If there's no page in the cache, then -ENODATA should be returned if the
397 backend managed to reserve a backing block; -ENOBUFS or -ENOMEM if it
398 didn't.
399
400 If there is suitable data in the cache, then a read operation should be
401 queued and 0 returned. When the read finishes, fscache_end_io() should be
402 called.
403
404 fscache_mark_pages_cached() should be called for the page if any cache
405 metadata is retained. This will indicate to the netfs that the page needs
406 explicit uncaching. This operation takes a pagevec, thus allowing several
407 pages to be marked at once.
408
409 The retrieval record pointed to by op should be retained for each page
410 queued and released when I/O on the page has been formally ended.
411 fscache_get/put_retrieval() are available for this purpose.
412
413 The retrieval record may be used to get CPU time via the FS-Cache thread
414 pool. If this is desired, the op->op.processor should be set to point to
415 the appropriate processing routine, and fscache_enqueue_retrieval() should
416 be called at an appropriate point to request CPU time. For instance, the
417 retrieval routine could be enqueued upon the completion of a disk read.
418 The to_do field in the retrieval record is provided to aid in this.
419
420 If an I/O error occurs, fscache_io_error() should be called and -ENOBUFS
421 returned if possible or fscache_end_io() called with a suitable error
422 code.
423
424
425 (*) Request pages be read from cache [mandatory]:
426
427 int (*read_or_alloc_pages)(struct fscache_retrieval *op,
428 struct list_head *pages,
429 unsigned *nr_pages,
430 gfp_t gfp)
431
432 This is like the read_or_alloc_page() method, except it is handed a list
433 of pages instead of one page. Any pages on which a read operation is
434 started must be added to the page cache for the specified mapping and also
435 to the LRU. Such pages must also be removed from the pages list and
436 *nr_pages decremented per page.
437
438 If there was an error such as -ENOMEM, then that should be returned; else
439 if one or more pages couldn't be read or allocated, then -ENOBUFS should
440 be returned; else if one or more pages couldn't be read, then -ENODATA
441 should be returned. If all the pages are dispatched then 0 should be
442 returned.
443
444
445 (*) Request page be allocated in the cache [mandatory]:
446
447 int (*allocate_page)(struct fscache_retrieval *op,
448 struct page *page,
449 gfp_t gfp)
450
451 This is like the read_or_alloc_page() method, except that it shouldn't
452 read from the cache, even if there's data there that could be retrieved.
453 It should, however, set up any internal metadata required such that
454 the write_page() method can write to the cache.
455
456 If there's no backing block available, then -ENOBUFS should be returned
457 (or -ENOMEM if there were other problems). If a block is successfully
458 allocated, then the netfs page should be marked and 0 returned.
459
460
461 (*) Request pages be allocated in the cache [mandatory]:
462
463 int (*allocate_pages)(struct fscache_retrieval *op,
464 struct list_head *pages,
465 unsigned *nr_pages,
466 gfp_t gfp)
467
468 This is a multiple page version of the allocate_page() method. pages and
469 nr_pages should be treated as for the read_or_alloc_pages() method.
470
471
472 (*) Request page be written to cache [mandatory]:
473
474 int (*write_page)(struct fscache_storage *op,
475 struct page *page);
476
477 This is called to write from a page on which there was a previously
478 successful read_or_alloc_page() call or similar. FS-Cache filters out
479 pages that don't have mappings.
480
481 This method is called asynchronously from the FS-Cache thread pool. It is
482 not required to actually store anything, provided -ENODATA is then
483 returned to the next read of this page.
484
485 If an error occurred, then a negative error code should be returned,
486 otherwise zero should be returned. FS-Cache will take appropriate action
487 in response to an error, such as withdrawing this object.
488
489 If this method returns success then FS-Cache will inform the netfs
490 appropriately.
491
492
493 (*) Discard retained per-page metadata [mandatory]:
494
495 void (*uncache_page)(struct fscache_object *object, struct page *page)
496
497 This is called when a netfs page is being evicted from the pagecache. The
498 cache backend should tear down any internal representation or tracking it
499 maintains for this page.
500
501
502==================
503FS-CACHE UTILITIES
504==================
505
506FS-Cache provides some utilities that a cache backend may make use of:
507
508 (*) Note occurrence of an I/O error in a cache:
509
510 void fscache_io_error(struct fscache_cache *cache)
511
512 This tells FS-Cache that an I/O error occurred in the cache. After this
513 has been called, only resource dissociation operations (object and page
514 release) will be passed from the netfs to the cache backend for the
515 specified cache.
516
517 This does not actually withdraw the cache. That must be done separately.
518
519
520 (*) Invoke the retrieval I/O completion function:
521
522 void fscache_end_io(struct fscache_retrieval *op, struct page *page,
523 int error);
524
525 This is called to note the end of an attempt to retrieve a page. The
526 error value should be 0 if successful and an error otherwise.
527
528
529 (*) Set highest store limit:
530
531 void fscache_set_store_limit(struct fscache_object *object,
532 loff_t i_size);
533
534 This sets the limit FS-Cache imposes on the highest byte it's willing to
535 try and store for a netfs. Any page over this limit is automatically
536 rejected by fscache_read_or_alloc_page() and co with -ENOBUFS.
537
538
539 (*) Mark pages as being cached:
540
541 void fscache_mark_pages_cached(struct fscache_retrieval *op,
542 struct pagevec *pagevec);
543
544 This marks a set of pages as being cached. After this has been called,
545 the netfs must call fscache_uncache_page() to unmark the pages.
546
547
548 (*) Perform coherency check on an object:
549
550 enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
551 const void *data,
552 uint16_t datalen);
553
554 This asks the netfs to perform a coherency check on an object that has
555 just been looked up. The cookie attached to the object will determine the
556 netfs to use. data and datalen should specify where the auxiliary data
557 retrieved from the cache can be found.
558
559 One of three values will be returned:
560
561 (*) FSCACHE_CHECKAUX_OKAY
562
563 The coherency data indicates the object is valid as is.
564
565 (*) FSCACHE_CHECKAUX_NEEDS_UPDATE
566
567 The coherency data needs updating, but otherwise the object is
568 valid.
569
570 (*) FSCACHE_CHECKAUX_OBSOLETE
571
572 The coherency data indicates that the object is obsolete and should
573 be discarded.
574
575
576 (*) Initialise a freshly allocated object:
577
578 void fscache_object_init(struct fscache_object *object);
579
580 This initialises all the fields in an object representation.
581
582
583 (*) Indicate the destruction of an object:
584
585 void fscache_object_destroyed(struct fscache_cache *cache);
586
587 This must be called to inform FS-Cache that an object that belonged to a
588 cache has been destroyed and deallocated. This will allow continuation
589 of the cache withdrawal process when it is stopped pending destruction of
590 all the objects.
591
592
593 (*) Indicate negative lookup on an object:
594
595 void fscache_object_lookup_negative(struct fscache_object *object);
596
597 This is called to indicate to FS-Cache that a lookup process for an object
598 found a negative result.
599
600 This changes the state of an object to permit reads pending on lookup
601 completion to go off and start fetching data from the netfs server as it's
602 known at this point that there can't be any data in the cache.
603
604 This may be called multiple times on an object. Only the first call is
605 significant - all subsequent calls are ignored.
606
607
608 (*) Indicate an object has been obtained:
609
610 void fscache_obtained_object(struct fscache_object *object);
611
612 This is called to indicate to FS-Cache that a lookup process for an object
613 produced a positive result, or that an object was created. This should
614 only be called once for any particular object.
615
616 This changes the state of an object to indicate:
617
618 (1) if no call to fscache_object_lookup_negative() has been made on
619 this object, that there may be data available, and that reads can
620 now go and look for it; and
621
622 (2) that writes may now proceed against this object.
623
624
625 (*) Indicate that object lookup failed:
626
627 void fscache_object_lookup_error(struct fscache_object *object);
628
629 This marks an object as having encountered a fatal error (usually EIO)
630 and causes it to move into a state whereby it will be withdrawn as soon
631 as possible.
632
633
634 (*) Get and release references on a retrieval record:
635
636 void fscache_get_retrieval(struct fscache_retrieval *op);
637 void fscache_put_retrieval(struct fscache_retrieval *op);
638
639 These two functions are used to retain a retrieval record whilst doing
640 asynchronous data retrieval and block allocation.
641
642
643 (*) Enqueue a retrieval record for processing:
644
645 void fscache_enqueue_retrieval(struct fscache_retrieval *op);
646
647 This enqueues a retrieval record for processing by the FS-Cache thread
648 pool. One of the threads in the pool will invoke the retrieval record's
649 op->op.processor callback function. This function may be called from
650 within the callback function.
651
652
653 (*) List of object state names:
654
655 const char *fscache_object_states[];
656
657 For debugging purposes, this may be used to turn the state that an object
658 is in into a text string for display purposes.
diff --git a/Documentation/filesystems/caching/cachefiles.txt b/Documentation/filesystems/caching/cachefiles.txt
new file mode 100644
index 000000000000..c78a49b7bba6
--- /dev/null
+++ b/Documentation/filesystems/caching/cachefiles.txt
@@ -0,0 +1,501 @@
1 ===============================================
2 CacheFiles: CACHE ON ALREADY MOUNTED FILESYSTEM
3 ===============================================
4
5Contents:
6
7 (*) Overview.
8
9 (*) Requirements.
10
11 (*) Configuration.
12
13 (*) Starting the cache.
14
15 (*) Things to avoid.
16
17 (*) Cache culling.
18
19 (*) Cache structure.
20
21 (*) Security model and SELinux.
22
23 (*) A note on security.
24
25 (*) Statistical information.
26
27 (*) Debugging.
28
29
30========
31OVERVIEW
32========
33
34CacheFiles is a caching backend that's meant to use as a cache a directory on
35an already mounted filesystem of a local type (such as Ext3).
36
37CacheFiles uses a userspace daemon to do some of the cache management - such as
38reaping stale nodes and culling. This is called cachefilesd and lives in
39/sbin.
40
41The filesystem and data integrity of the cache are only as good as those of the
42filesystem providing the backing services. Note that CacheFiles does not
43attempt to journal anything since the journalling interfaces of the various
44filesystems are very specific in nature.
45
46CacheFiles creates a misc character device - "/dev/cachefiles" - that is used
47to communicate with the daemon. Only one thing may have this open at once,
48and whilst it is open, a cache is at least partially in existence. The daemon
49opens this and sends commands down it to control the cache.
50
51CacheFiles is currently limited to a single cache.
52
53CacheFiles attempts to maintain at least a certain percentage of free space on
54the filesystem, shrinking the cache by culling the objects it contains to make
55space if necessary - see the "Cache Culling" section. This means it can be
56placed on the same medium as a live set of data, and will expand to make use of
57spare space and automatically contract when the set of data requires more
58space.
59
60
61============
62REQUIREMENTS
63============
64
65The use of CacheFiles and its daemon requires the following features to be
66available in the system and in the cache filesystem:
67
68 - dnotify.
69
70 - extended attributes (xattrs).
71
72 - openat() and friends.
73
74 - bmap() support on files in the filesystem (FIBMAP ioctl).
75
76 - The use of bmap() to detect a partial page at the end of the file.
77
78It is strongly recommended that the "dir_index" option is enabled on Ext3
79filesystems being used as a cache.
80
81
82=============
83CONFIGURATION
84=============
85
86The cache is configured by a script in /etc/cachefilesd.conf. These commands
87set up the cache ready for use. The following script commands are available:
88
89 (*) brun <N>%
90 (*) bcull <N>%
91 (*) bstop <N>%
92 (*) frun <N>%
93 (*) fcull <N>%
94 (*) fstop <N>%
95
96 Configure the culling limits. Optional. See the section on culling.
97 The defaults are 7% (run), 5% (cull) and 1% (stop) respectively.
98
99 The commands beginning with a 'b' are file space (block) limits, those
100 beginning with an 'f' are file count limits.
101
102 (*) dir <path>
103
104 Specify the directory containing the root of the cache. Mandatory.
105
106 (*) tag <name>
107
108 Specify a tag to FS-Cache to use in distinguishing multiple caches.
109 Optional. The default is "CacheFiles".
110
111 (*) debug <mask>
112
113 Specify a numeric bitmask to control debugging in the kernel module.
114 Optional. The default is zero (all off). The following values can be
115 OR'd into the mask to collect various information:
116
117 1 Turn on trace of function entry (_enter() macros)
118 2 Turn on trace of function exit (_leave() macros)
119 4 Turn on trace of internal debug points (_debug())
120
121 This mask can also be set through sysfs, eg:
122
123 echo 5 >/sys/module/cachefiles/parameters/debug
124
125
126==================
127STARTING THE CACHE
128==================
129
130The cache is started by running the daemon. The daemon opens the cache device,
131configures the cache and tells it to begin caching. At that point the cache
132binds to fscache and the cache becomes live.
133
134The daemon is run as follows:
135
136 /sbin/cachefilesd [-d]* [-s] [-n] [-f <configfile>]
137
138The flags are:
139
140 (*) -d
141
142 Increase the debugging level. This can be specified multiple times and
143 is cumulative with itself.
144
145 (*) -s
146
147 Send messages to stderr instead of syslog.
148
149 (*) -n
150
151 Don't daemonise and go into background.
152
153 (*) -f <configfile>
154
155 Use an alternative configuration file rather than the default one.
156
157
158===============
159THINGS TO AVOID
160===============
161
162Do not mount other things within the cache as this will cause problems. The
163kernel module contains its own very cut-down path walking facility that ignores
164mountpoints, but the daemon can't avoid them.
165
166Do not create, rename or unlink files and directories in the cache whilst the
167cache is active, as this may cause the state to become uncertain.
168
169Renaming files in the cache might make objects appear to be other objects (the
170filename is part of the lookup key).
171
172Do not change or remove the extended attributes attached to cache files by the
173cache as this will cause the cache state management to get confused.
174
175Do not create files or directories in the cache, lest the cache get confused or
176serve incorrect data.
177
178Do not chmod files in the cache. The module creates things with minimal
179permissions to prevent random users being able to access them directly.
180
181
182=============
183CACHE CULLING
184=============
185
186The cache may need culling occasionally to make space. This involves
187discarding objects from the cache that have been used less recently than
188anything else. Culling is based on the access time of data objects. Empty
189directories are culled if not in use.
190
191Cache culling is done on the basis of the percentage of blocks and the
192percentage of files available in the underlying filesystem. There are six
193"limits":
194
195 (*) brun
196 (*) frun
197
198 If the amount of free space and the number of available files in the cache
199 rises above both these limits, then culling is turned off.
200
201 (*) bcull
202 (*) fcull
203
204 If the amount of available space or the number of available files in the
205 cache falls below either of these limits, then culling is started.
206
207 (*) bstop
208 (*) fstop
209
210 If the amount of available space or the number of available files in the
211 cache falls below either of these limits, then no further allocation of
212 disk space or files is permitted until culling has raised things above
213 these limits again.
214
215These must be configured thusly:
216
217 0 <= bstop < bcull < brun < 100
218 0 <= fstop < fcull < frun < 100
219
220Note that these are percentages of available space and available files, and do
221_not_ appear as 100 minus the percentage displayed by the "df" program.
222
223The userspace daemon scans the cache to build up a table of cullable objects.
224These are then culled in least recently used order. A new scan of the cache is
225started as soon as space is made in the table. Objects will be skipped if
226their atimes have changed or if the kernel module says it is still using them.
227
228
229===============
230CACHE STRUCTURE
231===============
232
233The CacheFiles module will create two directories in the directory it was
234given:
235
236 (*) cache/
237
238 (*) graveyard/
239
240The active cache objects all reside in the first directory. The CacheFiles
241kernel module moves any retired or culled objects that it can't simply unlink
242to the graveyard from which the daemon will actually delete them.
243
244The daemon uses dnotify to monitor the graveyard directory, and will delete
245anything that appears therein.
246
247
248The module represents index objects as directories with the filename "I..." or
249"J...". Note that the "cache/" directory is itself a special index.
250
251Data objects are represented as files if they have no children, or directories
252if they do. Their filenames all begin "D..." or "E...". If represented as a
253directory, data objects will have a file in the directory called "data" that
254actually holds the data.
255
256Special objects are similar to data objects, except their filenames begin
257"S..." or "T...".
258
259
260If an object has children, then it will be represented as a directory.
261Immediately in the representative directory are a collection of directories
262named for hash values of the child object keys with an '@' prepended. Into
263this directory, if possible, will be placed the representations of the child
264objects:
265
266 INDEX INDEX INDEX DATA FILES
267 ========= ========== ================================= ================
268 cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400
269 cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...DB1ry
270 cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...N22ry
271 cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...FP1ry
272
273
274If the key is so long that it exceeds NAME_MAX with the decorations added on to
275it, then it will be cut into pieces, the first few of which will be used to
276make a nest of directories, and the last one of which will be the objects
277inside the last directory. The names of the intermediate directories will have
278'+' prepended:
279
280 J1223/@23/+xy...z/+kl...m/Epqr
281
282
283Note that keys are raw data, and not only may they exceed NAME_MAX in size,
284they may also contain things like '/' and NUL characters, and so they may not
285be suitable for turning directly into a filename.
286
287To handle this, CacheFiles will use a suitably printable filename directly and
288"base-64" encode ones that aren't directly suitable. The two versions of
289object filenames indicate the encoding:
290
291 OBJECT TYPE PRINTABLE ENCODED
292 =============== =============== ===============
293 Index "I..." "J..."
294 Data "D..." "E..."
295 Special "S..." "T..."
296
297Intermediate directories are always "@" or "+" as appropriate.
298
299
300Each object in the cache has an extended attribute label that holds the object
301type ID (required to distinguish special objects) and the auxiliary data from
302the netfs. The latter is used to detect stale objects in the cache and update
303or retire them.
304
305
306Note that CacheFiles will erase from the cache any file it doesn't recognise or
307any file of an incorrect type (such as a FIFO file or a device file).
308
309
310==========================
311SECURITY MODEL AND SELINUX
312==========================
313
314CacheFiles is implemented to deal properly with the LSM security features of
315the Linux kernel and the SELinux facility.
316
317One of the problems that CacheFiles faces is that it is generally acting on
318behalf of a process, and running in that process's context, and that includes a
319security context that is not appropriate for accessing the cache - either
320because the files in the cache are inaccessible to that process, or because if
321the process creates a file in the cache, that file may be inaccessible to other
322processes.
323
324The way CacheFiles works is to temporarily change the security context (fsuid,
325fsgid and actor security label) that the process acts as - without changing the
326security context of the process when it is the target of an operation performed by
327some other process (so signalling and suchlike still work correctly).
328
329
330When the CacheFiles module is asked to bind to its cache, it:
331
332 (1) Finds the security label attached to the root cache directory and uses
333 that as the security label with which it will create files. By default,
334 this is:
335
336 cachefiles_var_t
337
338 (2) Finds the security label of the process which issued the bind request
339 (presumed to be the cachefilesd daemon), which by default will be:
340
341 cachefilesd_t
342
343 and asks LSM to supply a security ID as which it should act given the
344 daemon's label. By default, this will be:
345
346 cachefiles_kernel_t
347
348 SELinux transitions the daemon's security ID to the module's security ID
349 based on a rule of this form in the policy.
350
351 type_transition <daemon's-ID> kernel_t : process <module's-ID>;
352
353 For instance:
354
355 type_transition cachefilesd_t kernel_t : process cachefiles_kernel_t;
356
357
358The module's security ID gives it permission to create, move and remove files
359and directories in the cache, to find and access directories and files in the
360cache, to set and access extended attributes on cache objects, and to read and
361write files in the cache.
362
363The daemon's security ID gives it only a very restricted set of permissions: it
364may scan directories, stat files and erase files and directories. It may
365not read or write files in the cache, and so it is precluded from accessing the
366data cached therein; nor is it permitted to create new files in the cache.
367
368
369There are policy source files available in:
370
371 http://people.redhat.com/~dhowells/fscache/cachefilesd-0.8.tar.bz2
372
373and later versions. In that tarball, see the files:
374
375 cachefilesd.te
376 cachefilesd.fc
377 cachefilesd.if
378
379They are built and installed directly by the RPM.
380
381If a non-RPM based system is being used, then copy the above files to their own
382directory and run:
383
384 make -f /usr/share/selinux/devel/Makefile
385 semodule -i cachefilesd.pp
386
387You will need checkpolicy and selinux-policy-devel installed prior to the
388build.
389
390
391By default, the cache is located in /var/fscache, but if it is desirable that
392it should be elsewhere, then either the above policy files must be altered, or
393an auxiliary policy must be installed to label the alternate location of the
394cache.
395
396For instructions on how to add an auxiliary policy to enable the cache to be
397located elsewhere when SELinux is in enforcing mode, please see:
398
399 /usr/share/doc/cachefilesd-*/move-cache.txt
400
401when the cachefilesd rpm is installed; alternatively, the document can be found
402in the sources.
403
404
405==================
406A NOTE ON SECURITY
407==================
408
409CacheFiles makes use of the split security in the task_struct. It allocates
410its own task_security structure, and redirects current->act_as to point to it
411when it acts on behalf of another process, in that process's context.
412
413The reason it does this is that it calls vfs_mkdir() and suchlike rather than
414bypassing security and calling inode ops directly. Therefore the VFS and LSM
415may deny the CacheFiles access to the cache data because under some
416circumstances the caching code is running in the security context of whatever
417process issued the original syscall on the netfs.
418
419Furthermore, should CacheFiles create a file or directory, the security
420parameters with which that object is created (UID, GID, security label) would be
421derived from that process that issued the system call, thus potentially
422preventing other processes from accessing the cache - including CacheFiles's
423cache management daemon (cachefilesd).
424
425What is required is to temporarily override the security of the process that
426issued the system call. We can't, however, just do an in-place change of the
427security data as that affects the process as an object, not just as a subject.
428This means it may lose signals or ptrace events for example, and affects what
429the process looks like in /proc.
430
431So CacheFiles makes use of a logical split in the security between the
432objective security (task->sec) and the subjective security (task->act_as). The
433objective security holds the intrinsic security properties of a process and is
434never overridden. This is what appears in /proc, and is what is used when a
435process is the target of an operation by some other process (SIGKILL for
436example).
437
438The subjective security holds the active security properties of a process, and
439may be overridden. This is not seen externally, and is used when a process
440acts upon another object, for example SIGKILLing another process or opening a
441file.
442
443LSM hooks exist that allow SELinux (or Smack or whatever) to reject a request
444for CacheFiles to run in a context of a specific security label, or to create
445files and directories with another security label.
446
447
448=======================
449STATISTICAL INFORMATION
450=======================
451
452If CacheFiles is compiled with the following option enabled:
453
454 CONFIG_CACHEFILES_HISTOGRAM=y
455
456then it will gather certain statistics and display them through a proc file.
457
458 (*) /proc/fs/cachefiles/histogram
459
460 cat /proc/fs/cachefiles/histogram
461 JIFS SECS LOOKUPS MKDIRS CREATES
462 ===== ===== ========= ========= =========
463
464 This shows the breakdown of the number of times each amount of time
465 between 0 jiffies and HZ-1 jiffies a variety of tasks took to run. The
466 columns are as follows:
467
468 COLUMN TIME MEASUREMENT
469 ======= =======================================================
470 LOOKUPS Length of time to perform a lookup on the backing fs
471 MKDIRS Length of time to perform a mkdir on the backing fs
472 CREATES Length of time to perform a create on the backing fs
473
474 Each row shows the number of events that took a particular range of times.
475 Each step is 1 jiffy in size. The JIFS column indicates the particular
476 jiffy range covered, and the SECS field the equivalent number of seconds.
477
478
479=========
480DEBUGGING
481=========
482
483If CONFIG_CACHEFILES_DEBUG is enabled, the CacheFiles facility can have runtime
484debugging enabled by adjusting the value in:
485
486 /sys/module/cachefiles/parameters/debug
487
488This is a bitmask of debugging streams to enable:
489
490 BIT VALUE STREAM POINT
491 ======= ======= =============================== =======================
492 0 1 General Function entry trace
493 1 2 Function exit trace
494 2 4 General
495
496The appropriate set of values should be OR'd together and the result written to
497the control file. For example:
498
499 echo $((1|4)) >/sys/module/cachefiles/parameters/debug
500
501will turn on function entry tracing and general debugging.
diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt
new file mode 100644
index 000000000000..9e94b9491d89
--- /dev/null
+++ b/Documentation/filesystems/caching/fscache.txt
@@ -0,0 +1,333 @@
1 ==========================
2 General Filesystem Caching
3 ==========================
4
5========
6OVERVIEW
7========
8
9This facility is a general purpose cache for network filesystems, though it
10could be used for caching other things such as ISO9660 filesystems too.
11
12FS-Cache mediates between cache backends (such as CacheFS) and network
13filesystems:
14
15 +---------+
16 | | +--------------+
17 | NFS |--+ | |
18 | | | +-->| CacheFS |
19 +---------+ | +----------+ | | /dev/hda5 |
20 | | | | +--------------+
21 +---------+ +-->| | |
22 | | | |--+
23 | AFS |----->| FS-Cache |
24 | | | |--+
25 +---------+ +-->| | |
26 | | | | +--------------+
27 +---------+ | +----------+ | | |
28 | | | +-->| CacheFiles |
29 | ISOFS |--+ | /var/cache |
30 | | +--------------+
31 +---------+
32
33Or to look at it another way, FS-Cache is a module that provides a caching
34facility to a network filesystem such that the cache is transparent to the
35user:
36
37 +---------+
38 | |
39 | Server |
40 | |
41 +---------+
42 | NETWORK
43 ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
44 |
45 | +----------+
46 V | |
47 +---------+ | |
48 | | | |
49 | NFS |----->| FS-Cache |
50 | | | |--+
51 +---------+ | | | +--------------+ +--------------+
52 | | | | | | | |
53 V +----------+ +-->| CacheFiles |-->| Ext3 |
54 +---------+ | /var/cache | | /dev/sda6 |
55 | | +--------------+ +--------------+
56 | VFS | ^ ^
57 | | | |
58 +---------+ +--------------+ |
59 | KERNEL SPACE | |
60 ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|~~~~~~|~~~~
61 | USER SPACE | |
62 V | |
63 +---------+ +--------------+
64 | | | |
65 | Process | | cachefilesd |
66 | | | |
67 +---------+ +--------------+
68
69
70FS-Cache does not follow the idea of completely loading every netfs file
71opened in its entirety into a cache before permitting it to be accessed and
72then serving the pages out of that cache rather than the netfs inode because:
73
74 (1) It must be practical to operate without a cache.
75
76 (2) The size of any accessible file must not be limited to the size of the
77 cache.
78
79 (3) The combined size of all opened files (this includes mapped libraries)
80 must not be limited to the size of the cache.
81
82 (4) The user should not be forced to download an entire file just to do a
83 one-off access of a small portion of it (such as might be done with the
84 "file" program).
85
86It instead serves the cache out in PAGE_SIZE chunks as and when requested by
87the netfs('s) using it.
88
89
90FS-Cache provides the following facilities:
91
92 (1) More than one cache can be used at once. Caches can be selected
93 explicitly by use of tags.
94
95 (2) Caches can be added / removed at any time.
96
97 (3) The netfs is provided with an interface that allows either party to
98 withdraw caching facilities from a file (required for (2)).
99
100 (4) The interface to the netfs returns as few errors as possible, preferring
101 rather to let the netfs remain oblivious.
102
103 (5) Cookies are used to represent indices, files and other objects to the
104 netfs. The simplest cookie is just a NULL pointer - indicating nothing
105 cached there.
106
107 (6) The netfs is allowed to propose - dynamically - any index hierarchy it
108 desires, though it must be aware that the index search function is
109 recursive, stack space is limited, and indices can only be children of
110 indices.
111
112 (7) Data I/O is done direct to and from the netfs's pages. The netfs
113 indicates that page A is at index B of the data-file represented by cookie
114 C, and that it should be read or written. The cache backend may or may
115 not start I/O on that page, but if it does, a netfs callback will be
116 invoked to indicate completion. The I/O may be either synchronous or
117 asynchronous.
118
119 (8) Cookies can be "retired" upon release. At this point FS-Cache will mark
120 them as obsolete and the index hierarchy rooted at that point will get
121 recycled.
122
123 (9) The netfs provides a "match" function for index searches. In addition to
124 saying whether a match was made or not, this can also specify that an
125 entry should be updated or deleted.
126
127(10) As much as possible is done asynchronously.
128
129
130FS-Cache maintains a virtual indexing tree in which all indices, files, objects
131and pages are kept. Bits of this tree may actually reside in one or more
132caches.
133
134 FSDEF
135 |
136 +------------------------------------+
137 | |
138 NFS AFS
139 | |
140 +--------------------------+ +-----------+
141 | | | |
142 homedir mirror afs.org redhat.com
143 | | |
144 +------------+ +---------------+ +----------+
145 | | | | | |
146 00001 00002 00007 00125 vol00001 vol00002
147 | | | | |
148 +---+---+ +-----+ +---+ +------+------+ +-----+----+
149 | | | | | | | | | | | | |
150PG0 PG1 PG2 PG0 XATTR PG0 PG1 DIRENT DIRENT DIRENT R/W R/O Bak
151 | |
152 PG0 +-------+
153 | |
154 00001 00003
155 |
156 +---+---+
157 | | |
158 PG0 PG1 PG2
159
160In the example above, you can see two netfs's being backed: NFS and AFS. These
161have different index hierarchies:
162
163 (*) The NFS primary index contains per-server indices. Each server index is
164 indexed by NFS file handles to get data file objects. Each data file
165 object can have an array of pages, but may also have further child
166 objects, such as extended attributes and directory entries. Extended
167 attribute objects themselves have page-array contents.
168
169 (*) The AFS primary index contains per-cell indices. Each cell index contains
170 per-logical-volume indices. Each volume index contains up to three
171 indices for the read-write, read-only and backup mirrors of those volumes.
172 Each of these contains vnode data file objects, each of which contains an
173 array of pages.
174
175The very top index is the FS-Cache master index in which individual netfs's
176have entries.
177
178Any index object may reside in more than one cache, provided it only has index
179children. Any index with non-index object children will be assumed to only
180reside in one cache.
181
182
183The netfs API to FS-Cache can be found in:
184
185 Documentation/filesystems/caching/netfs-api.txt
186
187The cache backend API to FS-Cache can be found in:
188
189 Documentation/filesystems/caching/backend-api.txt
190
191A description of the internal representations and object state machine can be
192found in:
193
194 Documentation/filesystems/caching/object.txt
195
196
197=======================
198STATISTICAL INFORMATION
199=======================
200
201If FS-Cache is compiled with the following options enabled:
202
203 CONFIG_FSCACHE_STATS=y
204 CONFIG_FSCACHE_HISTOGRAM=y
205
206then it will gather certain statistics and display them through a number of
207proc files.
208
209 (*) /proc/fs/fscache/stats
210
211 This shows counts of a number of events that can happen in FS-Cache:
212
213 CLASS EVENT MEANING
214 ======= ======= =======================================================
215 Cookies idx=N Number of index cookies allocated
216 dat=N Number of data storage cookies allocated
217 spc=N Number of special cookies allocated
218 Objects alc=N Number of objects allocated
219 nal=N Number of object allocation failures
220 avl=N Number of objects that reached the available state
221 ded=N Number of objects that reached the dead state
222 ChkAux non=N Number of objects that didn't have a coherency check
223 ok=N Number of objects that passed a coherency check
224 upd=N Number of objects that needed a coherency data update
225 obs=N Number of objects that were declared obsolete
226 Pages mrk=N Number of pages marked as being cached
227 unc=N Number of uncache page requests seen
228 Acquire n=N Number of acquire cookie requests seen
229 nul=N Number of acq reqs given a NULL parent
230 noc=N Number of acq reqs rejected due to no cache available
231 ok=N Number of acq reqs succeeded
232 nbf=N Number of acq reqs rejected due to error
233 oom=N Number of acq reqs failed on ENOMEM
234 Lookups n=N Number of lookup calls made on cache backends
235 neg=N Number of negative lookups made
236 pos=N Number of positive lookups made
237 crt=N Number of objects created by lookup
238 Updates n=N Number of update cookie requests seen
239 nul=N Number of upd reqs given a NULL parent
240 run=N Number of upd reqs granted CPU time
241 Relinqs n=N Number of relinquish cookie requests seen
242 nul=N Number of rlq reqs given a NULL parent
243 wcr=N Number of rlq reqs waited on completion of creation
244 AttrChg n=N Number of attribute changed requests seen
245 ok=N Number of attr changed requests queued
246 nbf=N Number of attr changed rejected -ENOBUFS
247 oom=N Number of attr changed failed -ENOMEM
248 run=N Number of attr changed ops given CPU time
249 Allocs n=N Number of allocation requests seen
250 ok=N Number of successful alloc reqs
251 wt=N Number of alloc reqs that waited on lookup completion
252 nbf=N Number of alloc reqs rejected -ENOBUFS
253 ops=N Number of alloc reqs submitted
254 owt=N Number of alloc reqs waited for CPU time
255 Retrvls n=N Number of retrieval (read) requests seen
256 ok=N Number of successful retr reqs
257 wt=N Number of retr reqs that waited on lookup completion
258 nod=N Number of retr reqs returned -ENODATA
259 nbf=N Number of retr reqs rejected -ENOBUFS
260 int=N Number of retr reqs aborted -ERESTARTSYS
261 oom=N Number of retr reqs failed -ENOMEM
262 ops=N Number of retr reqs submitted
263 owt=N Number of retr reqs waited for CPU time
264 Stores n=N Number of storage (write) requests seen
265 ok=N Number of successful store reqs
266 agn=N Number of store reqs on a page already pending storage
267 nbf=N Number of store reqs rejected -ENOBUFS
268 oom=N Number of store reqs failed -ENOMEM
269 ops=N Number of store reqs submitted
270 run=N Number of store reqs granted CPU time
271 Ops pend=N Number of times async ops added to pending queues
272 run=N Number of times async ops given CPU time
273 enq=N Number of times async ops queued for processing
274 dfr=N Number of async ops queued for deferred release
275 rel=N Number of async ops released
276 gc=N Number of deferred-release async ops garbage collected
277
278
279 (*) /proc/fs/fscache/histogram
280
281 cat /proc/fs/fscache/histogram
282 JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS
283 ===== ===== ========= ========= ========= ========= =========
284
285 This shows the breakdown of the number of times each amount of time
286 between 0 jiffies and HZ-1 jiffies a variety of tasks took to run. The
287 columns are as follows:
288
289 COLUMN TIME MEASUREMENT
290 ======= =======================================================
291 OBJ INST Length of time to instantiate an object
292 OP RUNS Length of time a call to process an operation took
293 OBJ RUNS Length of time a call to process an object event took
294 RETRV DLY Time between requesting a read and lookup completing
295 RETRIEVLS Time between beginning and end of a retrieval
296
297 Each row shows the number of events that took a particular range of times.
298 Each step is 1 jiffy in size. The JIFS column indicates the particular
299 jiffy range covered, and the SECS field the equivalent number of seconds.
300
301
302=========
303DEBUGGING
304=========
305
306If CONFIG_FSCACHE_DEBUG is enabled, the FS-Cache facility can have runtime
307debugging enabled by adjusting the value in:
308
309 /sys/module/fscache/parameters/debug
310
311This is a bitmask of debugging streams to enable:
312
313 BIT VALUE STREAM POINT
314 ======= ======= =============================== =======================
315 0 1 Cache management Function entry trace
316 1 2 Function exit trace
317 2 4 General
318 3 8 Cookie management Function entry trace
319 4 16 Function exit trace
320 5 32 General
321 6 64 Page handling Function entry trace
322 7 128 Function exit trace
323 8 256 General
324 9 512 Operation management Function entry trace
325 10 1024 Function exit trace
326 11 2048 General
327
328The appropriate set of values should be OR'd together and the result written to
329the control file. For example:
330
331 echo $((1|8|64)) >/sys/module/fscache/parameters/debug
332
333will turn on all function entry debugging.
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
new file mode 100644
index 000000000000..4db125b3a5c6
--- /dev/null
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -0,0 +1,778 @@
1 ===============================
2 FS-CACHE NETWORK FILESYSTEM API
3 ===============================
4
5There's an API by which a network filesystem can make use of the FS-Cache
6facilities. This is based around a number of principles:
7
8 (1) Caches can store a number of different object types. There are two main
9 object types: indices and files. The first is a special type used by
10 FS-Cache to make finding objects faster and to make retiring of groups of
11 objects easier.
12
13 (2) Every index, file or other object is represented by a cookie. This cookie
14 may or may not have anything associated with it, but the netfs doesn't
15 need to care.
16
17 (3) Barring the top-level index (one entry per cached netfs), the index
18 hierarchy for each netfs is structured according to the whim of the netfs.
19
20This API is declared in <linux/fscache.h>.
21
22This document contains the following sections:
23
24 (1) Network filesystem definition
25 (2) Index definition
26 (3) Object definition
27 (4) Network filesystem (un)registration
28 (5) Cache tag lookup
29 (6) Index registration
30 (7) Data file registration
31 (8) Miscellaneous object registration
32 (9) Setting the data file size
33 (10) Page alloc/read/write
34 (11) Page uncaching
35 (12) Index and data file update
36 (13) Miscellaneous cookie operations
37 (14) Cookie unregistration
38 (15) Index and data file invalidation
39 (16) FS-Cache specific page flags.
40
41
42=============================
43NETWORK FILESYSTEM DEFINITION
44=============================
45
46FS-Cache needs a description of the network filesystem. This is specified
47using a record of the following structure:
48
49 struct fscache_netfs {
50 uint32_t version;
51 const char *name;
52 struct fscache_cookie *primary_index;
53 ...
54 };
55
56The first two fields should be filled in before registration, and the third
57will be filled in by the registration function; any other fields should just be
58ignored and are for internal use only.
59
60The fields are:
61
62 (1) The name of the netfs (used as the key in the toplevel index).
63
64 (2) The version of the netfs (if the name matches but the version doesn't, the
65 entire in-cache hierarchy for this netfs will be scrapped and begun
66 afresh).
67
68 (3) The cookie representing the primary index will be allocated according to
69 another parameter passed into the registration function.
70
71For example, kAFS (linux/fs/afs/) uses the following definitions to describe
72itself:
73
74 struct fscache_netfs afs_cache_netfs = {
75 .version = 0,
76 .name = "afs",
77 };
78
79
80================
81INDEX DEFINITION
82================
83
84Indices are used for two purposes:
85
86 (1) To aid the finding of a file based on a series of keys (such as AFS's
87 "cell", "volume ID", "vnode ID").
88
89 (2) To make it easier to discard a subset of all the files cached based around
90 a particular key - for instance to mirror the removal of an AFS volume.
91
92However, since it's unlikely that any two netfs's are going to want to define
93their index hierarchies in quite the same way, FS-Cache tries to impose as few
94restraints as possible on how an index is structured and where it is placed in
95the tree. The netfs can even mix indices and data files at the same level, but
96it's not recommended.
97
98Each index entry consists of a key of indeterminate length plus some auxiliary
99data, also of indeterminate length.
100
101There are some limits on indices:
102
103 (1) Any index containing non-index objects should be restricted to a single
104 cache. Any such objects created within an index will be created in the
105 first cache only. The cache in which an index is created can be
106 controlled by cache tags (see below).
107
108 (2) The entry data must be atomically journallable, so it is limited to about
109 400 bytes at present. At least 400 bytes will be available.
110
111 (3) The depth of the index tree should be judged with care as the search
112 function is recursive. Too many layers will run the kernel out of stack.
113
114
115=================
116OBJECT DEFINITION
117=================
118
119To define an object, a structure of the following type should be filled out:
120
121 struct fscache_cookie_def
122 {
123 uint8_t name[16];
124 uint8_t type;
125
126 struct fscache_cache_tag *(*select_cache)(
127 const void *parent_netfs_data,
128 const void *cookie_netfs_data);
129
130 uint16_t (*get_key)(const void *cookie_netfs_data,
131 void *buffer,
132 uint16_t bufmax);
133
134 void (*get_attr)(const void *cookie_netfs_data,
135 uint64_t *size);
136
137 uint16_t (*get_aux)(const void *cookie_netfs_data,
138 void *buffer,
139 uint16_t bufmax);
140
141 enum fscache_checkaux (*check_aux)(void *cookie_netfs_data,
142 const void *data,
143 uint16_t datalen);
144
145 void (*get_context)(void *cookie_netfs_data, void *context);
146
147 void (*put_context)(void *cookie_netfs_data, void *context);
148
149 void (*mark_pages_cached)(void *cookie_netfs_data,
150 struct address_space *mapping,
151 struct pagevec *cached_pvec);
152
153 void (*now_uncached)(void *cookie_netfs_data);
154 };
155
156This has the following fields:
157
158 (1) The type of the object [mandatory].
159
160 This is one of the following values:
161
162 (*) FSCACHE_COOKIE_TYPE_INDEX
163
164 This defines an index, which is a special FS-Cache type.
165
166 (*) FSCACHE_COOKIE_TYPE_DATAFILE
167
168 This defines an ordinary data file.
169
170 (*) Any other value between 2 and 255
171
172 This defines an extraordinary object such as an XATTR.
173
174 (2) The name of the object type (NUL terminated unless all 16 chars are used)
175 [optional].
176
177 (3) A function to select the cache in which to store an index [optional].
178
179 This function is invoked when an index needs to be instantiated in a cache
180 during the instantiation of a non-index object. Only the immediate index
181 parent for the non-index object will be queried. Any indices above that
182 in the hierarchy may be stored in multiple caches. This function does not
183 need to be supplied for any non-index object or any index that will only
184 have index children.
185
186 If this function is not supplied or if it returns NULL then the first
187 cache in the parent's list will be chosen, or failing that, the first
188 cache in the master list.
189
190 (4) A function to retrieve an object's key from the netfs [mandatory].
191
192 This function will be called with the netfs data that was passed to the
193 cookie acquisition function and the maximum length of key data that it may
194 provide. It should write the required key data into the given buffer and
195 return the quantity it wrote.
196
197 (5) A function to retrieve attribute data from the netfs [optional].
198
199 This function will be called with the netfs data that was passed to the
200 cookie acquisition function. It should return the size of the file if
201 this is a data file. The size may be used to govern how much cache must
202 be reserved for this file in the cache.
203
204 If the function is absent, a file size of 0 is assumed.
205
206 (6) A function to retrieve auxiliary data from the netfs [optional].
207
208 This function will be called with the netfs data that was passed to the
209 cookie acquisition function and the maximum length of auxiliary data that
210 it may provide. It should write the auxiliary data into the given buffer
211 and return the quantity it wrote.
212
213 If this function is absent, the auxiliary data length will be set to 0.
214
215 The length of the auxiliary data buffer may be dependent on the key
216 length. A netfs mustn't rely on being able to provide more than 400 bytes
217 for both.
218
219 (7) A function to check the auxiliary data [optional].
220
221 This function will be called to check that a match found in the cache for
222 this object is valid. For instance with AFS it could check the auxiliary
223 data against the data version number returned by the server to determine
224 whether the index entry in a cache is still valid.
225
226 If this function is absent, it will be assumed that matching objects in a
227 cache are always valid.
228
229 If present, the function should return one of the following values:
230
231 (*) FSCACHE_CHECKAUX_OKAY - the entry is okay as is
232 (*) FSCACHE_CHECKAUX_NEEDS_UPDATE - the entry requires update
233 (*) FSCACHE_CHECKAUX_OBSOLETE - the entry should be deleted
234
235 This function can also be used to extract data from the auxiliary data in
236 the cache and copy it into the netfs's structures.
237
238 (8) A pair of functions to manage contexts for the completion callback
239 [optional].
240
241 The cache read/write functions are passed a context which is then passed
242 to the I/O completion callback function. To ensure this context remains
243 valid until after the I/O completion is called, two functions may be
244 provided: one to get an extra reference on the context, and one to drop a
245 reference to it.
246
247 If the context is not used or is a type of object that won't go out of
248 scope, then these functions are not required. These functions are not
249 required for indices as indices may not contain data. These functions may
250 be called in interrupt context and so may not sleep.
251
252 (9) A function to mark a page as retaining cache metadata [optional].
253
254 This is called by the cache to indicate that it is retaining in-memory
255 information for this page and that the netfs should uncache the page when
256 it has finished. This does not indicate whether there's data on the disk
257 or not. Note that several pages at once may be presented for marking.
258
259 The PG_fscache bit is set on the pages before this function would be
260 called, so the function need not be provided if this is sufficient.
261
262 This function is not required for indices as they're not permitted data.
263
264(10) A function to unmark all the pages retaining cache metadata [mandatory].
265
266 This is called by FS-Cache to indicate that a backing store is being
267 unbound from a cookie and that all the marks on the pages should be
268 cleared to prevent confusion. Note that the cache will have torn down all
269 its tracking information so that the pages don't need to be explicitly
270 uncached.
271
272 This function is not required for indices as they're not permitted data.
273
274
275===================================
276NETWORK FILESYSTEM (UN)REGISTRATION
277===================================
278
279The first step is to declare the network filesystem to the cache. This also
280involves specifying the layout of the primary index (for AFS, this would be the
281"cell" level).
282
283The registration function is:
284
285 int fscache_register_netfs(struct fscache_netfs *netfs);
286
287It just takes a pointer to the netfs definition. It returns 0 or an error as
288appropriate.
289
290For kAFS, registration is done as follows:
291
292 ret = fscache_register_netfs(&afs_cache_netfs);
293
294The last step is, of course, unregistration:
295
296 void fscache_unregister_netfs(struct fscache_netfs *netfs);
297
298
299================
300CACHE TAG LOOKUP
301================
302
303FS-Cache permits the use of more than one cache. To permit particular index
304subtrees to be bound to particular caches, the second step is to look up cache
305representation tags. This step is optional; it can be left entirely up to
306FS-Cache as to which cache should be used. The problem with doing that is that
307FS-Cache will always pick the first cache that was registered.
308
309To get the representation for a named tag:
310
311 struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name);
312
313This takes a text string as the name and returns a representation of a tag. It
314will never return an error. It may return a dummy tag, however, if it runs out
315of memory; this will inhibit caching with this tag.
316
317Any representation so obtained must be released by passing it to this function:
318
319 void fscache_release_cache_tag(struct fscache_cache_tag *tag);
320
321The tag will be retrieved by FS-Cache when it calls the object definition
322operation select_cache().
323
324
325==================
326INDEX REGISTRATION
327==================
328
329The third step is to inform FS-Cache about part of an index hierarchy that can
330be used to locate files. This is done by requesting a cookie for each index in
331the path to the file:
332
333 struct fscache_cookie *
334 fscache_acquire_cookie(struct fscache_cookie *parent,
335 const struct fscache_cookie_def *def,
336 void *netfs_data);
337
338This function creates an index entry in the index represented by parent,
339filling in the index entry by calling the operations pointed to by def.
340
341Note that this function never returns an error - all errors are handled
342internally. It may, however, return NULL to indicate no cookie. It is quite
343acceptable to pass this token back to this function as the parent to another
344acquisition (or even to the relinquish cookie, read page and write page
345functions - see below).
346
347Note also that no indices are actually created in a cache until a non-index
348object needs to be created somewhere down the hierarchy. Furthermore, an index
349may be created in several different caches independently at different times.
350This is all handled transparently, and the netfs doesn't see any of it.
351
352For example, with AFS, a cell would be added to the primary index. This index
353entry would have a dependent inode containing a volume location index for the
354volume mappings within this cell:
355
356 cell->cache =
357 fscache_acquire_cookie(afs_cache_netfs.primary_index,
358 &afs_cell_cache_index_def,
359 cell);
360
361Then when a volume location was accessed, it would be entered into the cell's
362index and an inode would be allocated that acts as a volume type and hash chain
363combination:
364
365 vlocation->cache =
366 fscache_acquire_cookie(cell->cache,
367 &afs_vlocation_cache_index_def,
368 vlocation);
369
370And then a particular flavour of volume (R/O for example) could be added to
371that index, creating another index for vnodes (AFS inode equivalents):
372
373 volume->cache =
374 fscache_acquire_cookie(vlocation->cache,
375 &afs_volume_cache_index_def,
376 volume);
377
378
379======================
380DATA FILE REGISTRATION
381======================
382
383The fourth step is to request a data file be created in the cache. This is
384identical to index cookie acquisition. The only difference is that the type in
385the object definition should be something other than index type.
386
387 vnode->cache =
388 fscache_acquire_cookie(volume->cache,
389 &afs_vnode_cache_object_def,
390 vnode);
391
392
393=================================
394MISCELLANEOUS OBJECT REGISTRATION
395=================================
396
397An optional step is to request an object of miscellaneous type be created in
398the cache. This is almost identical to index cookie acquisition. The only
399difference is that the type in the object definition should be something other
400than index type. Whilst the parent object could be an index, it's more likely
401it would be some other type of object such as a data file.
402
403 xattr->cache =
404 fscache_acquire_cookie(vnode->cache,
405 &afs_xattr_cache_object_def,
406 xattr);
407
408Miscellaneous objects might be used to store extended attributes or directory
409entries for example.
410
411
412==========================
413SETTING THE DATA FILE SIZE
414==========================
415
416The fifth step is to set the physical attributes of the file, such as its size.
417This doesn't automatically reserve any space in the cache, but permits the
418cache to adjust its metadata for data tracking appropriately:
419
420 int fscache_attr_changed(struct fscache_cookie *cookie);
421
422The cache will return -ENOBUFS if there is no backing cache or if there is no
423space to allocate any extra metadata required in the cache. The attributes
424will be accessed with the get_attr() cookie definition operation.
425
426Note that attempts to read or write data pages in the cache over this size may
427be rebuffed with -ENOBUFS.
428
429This operation schedules an attribute adjustment to happen asynchronously at
430some point in the future, and as such, it may happen after the function returns
431to the caller. The attribute adjustment excludes read and write operations.
432
433
434=====================
435PAGE READ/ALLOC/WRITE
436=====================
437
438And the sixth step is to store and retrieve pages in the cache. There are
439three functions that are used to do this.
440
441Note:
442
443 (1) A page should not be re-read or re-allocated without uncaching it first.
444
445 (2) A read or allocated page must be uncached when the netfs page is released
446 from the pagecache.
447
448 (3) A page should only be written to the cache if previously read or allocated.
449
450This permits the cache to maintain its page tracking in proper order.
451
452
453PAGE READ
454---------
455
456Firstly, the netfs should ask FS-Cache to examine the caches and read the
457contents cached for a particular page of a particular file if present, or else
458allocate space to store the contents if not:
459
460 typedef
461 void (*fscache_rw_complete_t)(struct page *page,
462 void *context,
463 int error);
464
465 int fscache_read_or_alloc_page(struct fscache_cookie *cookie,
466 struct page *page,
467 fscache_rw_complete_t end_io_func,
468 void *context,
469 gfp_t gfp);
470
471The cookie argument must specify a cookie for an object that isn't an index,
472the page specified will have the data loaded into it (and is also used to
473specify the page number), and the gfp argument is used to control how any
474memory allocations made are satisfied.
475
476If the cookie indicates the inode is not cached:
477
478 (1) The function will return -ENOBUFS.
479
480Else if there's a copy of the page resident in the cache:
481
482 (1) The mark_pages_cached() cookie operation will be called on that page.
483
484 (2) The function will submit a request to read the data from the cache's
485 backing device directly into the page specified.
486
487 (3) The function will return 0.
488
489 (4) When the read is complete, end_io_func() will be invoked with:
490
491 (*) The netfs data supplied when the cookie was created.
492
493 (*) The page descriptor.
494
495 (*) The context argument passed to the above function. This will be
496 maintained with the get_context/put_context functions mentioned above.
497
498 (*) An argument that's 0 on success or negative for an error code.
499
500 If an error occurs, it should be assumed that the page contains no usable
501 data.
502
503 end_io_func() will be called in process context if the read results in
504 an error, but it might be called in interrupt context if the read is
505 successful.
506
507Otherwise, if there's not a copy available in cache, but the cache may be able
508to store the page:
509
510 (1) The mark_pages_cached() cookie operation will be called on that page.
511
512 (2) A block may be reserved in the cache and attached to the object at the
513 appropriate place.
514
515 (3) The function will return -ENODATA.
516
517This function may also return -ENOMEM or -EINTR, in which case it won't have
518read any data from the cache.
519
520
521PAGE ALLOCATE
522-------------
523
524Alternatively, if there's not expected to be any data in the cache for a page
525because the file has been extended, a block can simply be allocated instead:
526
527 int fscache_alloc_page(struct fscache_cookie *cookie,
528 struct page *page,
529 gfp_t gfp);
530
531This is similar to the fscache_read_or_alloc_page() function, except that it
532never reads from the cache. It will return 0 if a block has been allocated,
533rather than -ENODATA as the other would. One or the other must be performed
534before writing to the cache.
535
536The mark_pages_cached() cookie operation will be called on the page if
537successful.
538
539
540PAGE WRITE
541----------
542
543Secondly, if the netfs changes the contents of the page (either due to an
544initial download or if a user performs a write), then the page should be
545written back to the cache:
546
547 int fscache_write_page(struct fscache_cookie *cookie,
548 struct page *page,
549 gfp_t gfp);
550
551The cookie argument must specify a data file cookie, the page specified should
552contain the data to be written (and is also used to specify the page number),
553and the gfp argument is used to control how any memory allocations made are
554satisfied.
555
556The page must have first been read or allocated successfully and must not have
557been uncached before writing is performed.
558
559If the cookie indicates the inode is not cached then:
560
561 (1) The function will return -ENOBUFS.
562
563Else if space can be allocated in the cache to hold this page:
564
565 (1) PG_fscache_write will be set on the page.
566
567 (2) The function will submit a request to write the data to cache's backing
568 device directly from the page specified.
569
570 (3) The function will return 0.
571
572 (4) When the write is complete PG_fscache_write is cleared on the page and
573 anyone waiting for that bit will be woken up.
574
575Else if there's no space available in the cache, -ENOBUFS will be returned. It
576is also possible for the PG_fscache_write bit to be cleared when no write took
577place if unforeseen circumstances arose (such as a disk error).
578
579Writing takes place asynchronously.
580
581
582MULTIPLE PAGE READ
583------------------
584
585A facility is provided to read several pages at once, as requested by the
586readpages() address space operation:
587
588 int fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
589 struct address_space *mapping,
590 struct list_head *pages,
591 int *nr_pages,
592 fscache_rw_complete_t end_io_func,
593 void *context,
594 gfp_t gfp);
595
596This works in a similar way to fscache_read_or_alloc_page(), except:
597
598 (1) Any page it can retrieve data for is removed from pages and nr_pages and
599 dispatched for reading to the disk. Reads of adjacent pages on disk may
600 be merged for greater efficiency.
601
602 (2) The mark_pages_cached() cookie operation will be called on several pages
603 at once if they're being read or allocated.
604
605 (3) If there was a general error, then that error will be returned.
606
607 Else if some pages couldn't be allocated or read, then -ENOBUFS will be
608 returned.
609
610 Else if some pages couldn't be read but were allocated, then -ENODATA will
611 be returned.
612
613 Otherwise, if all pages had reads dispatched, then 0 will be returned, the
614 list will be empty and *nr_pages will be 0.
615
616 (4) end_io_func will be called once for each page being read as the reads
617 complete. It will be called in process context if error != 0, but it may
618 be called in interrupt context if there is no error.
619
620Note that a return of -ENODATA, -ENOBUFS or any other error does not preclude
621some of the pages being read and some being allocated. Those pages will have
622been marked appropriately and will need uncaching.
623
624
625==============
626PAGE UNCACHING
627==============
628
629To uncache a page, this function should be called:
630
631 void fscache_uncache_page(struct fscache_cookie *cookie,
632 struct page *page);
633
634This function permits the cache to release any in-memory representation it
635might be holding for this netfs page. This function must be called once for
636each page on which the read or write page functions above have been called to
637make sure the cache's in-memory tracking information gets torn down.
638
639Note that pages can't be explicitly deleted from a data file. The whole
640data file must be retired (see the relinquish cookie function below).
641
642Furthermore, note that this does not cancel the asynchronous read or write
643operation started by the read/alloc and write functions, so the page
644invalidation and release functions must use:
645
646 bool fscache_check_page_write(struct fscache_cookie *cookie,
647 struct page *page);
648
649to see if a page is being written to the cache, and:
650
651 void fscache_wait_on_page_write(struct fscache_cookie *cookie,
652 struct page *page);
653
654to wait for it to finish if it is.
655
656
657==========================
658INDEX AND DATA FILE UPDATE
659==========================
660
661To request an update of the index data for an index or other object, the
662following function should be called:
663
664 void fscache_update_cookie(struct fscache_cookie *cookie);
665
666This function will refer back to the netfs_data pointer stored in the cookie by
667the acquisition function to obtain the data to write into each revised index
668entry. The update method in the parent index definition will be called to
669transfer the data.
670
671Note that partial updates may happen automatically at other times, such as when
672data blocks are added to a data file object.
673
674
675===============================
676MISCELLANEOUS COOKIE OPERATIONS
677===============================
678
679There are a number of operations that can be used to control cookies:
680
681 (*) Cookie pinning:
682
683 int fscache_pin_cookie(struct fscache_cookie *cookie);
684 void fscache_unpin_cookie(struct fscache_cookie *cookie);
685
686 These operations permit data cookies to be pinned into the cache and to
687 have the pinning removed. They are not permitted on index cookies.
688
689 The pinning function will return 0 if successful, -ENOBUFS if the cookie
690 isn't backed by a cache, -EOPNOTSUPP if the cache doesn't support pinning,
691 -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or
692 -EIO if there's any other problem.
693
694 (*) Data space reservation:
695
696 int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size);
697
698 This permits a netfs to request cache space be reserved to store up to the
699 given amount of a file. It is permitted to ask for more than the current
700 size of the file to allow for future file expansion.
701
702 If size is given as zero then the reservation will be cancelled.
703
704 The function will return 0 if successful, -ENOBUFS if the cookie isn't
705 backed by a cache, -EOPNOTSUPP if the cache doesn't support reservations,
706 -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or
707 -EIO if there's any other problem.
708
709 Note that this doesn't pin an object in a cache; it can still be culled to
710 make space if it's not in use.
711
712
713=====================
714COOKIE UNREGISTRATION
715=====================
716
717To get rid of a cookie, this function should be called.
718
719 void fscache_relinquish_cookie(struct fscache_cookie *cookie,
720 int retire);
721
722If retire is non-zero, then the object will be marked for recycling, and all
723copies of it will be removed from all active caches in which it is present.
724Not only that but all child objects will also be retired.
725
726If retire is zero, then the object may be available again when next the
727acquisition function is called. Retirement here will overrule the pinning on a
728cookie.
729
730One very important note - relinquish must NOT be called for a cookie unless all
731the cookies for "child" indices, objects and pages have been relinquished
732first.
733
734
735================================
736INDEX AND DATA FILE INVALIDATION
737================================
738
739There is no direct way to invalidate an index subtree or a data file. To do
740this, the caller should relinquish and retire the cookie they have, and then
741acquire a new one.
742
743
744===========================
745FS-CACHE SPECIFIC PAGE FLAG
746===========================
747
748FS-Cache makes use of a page flag, PG_private_2, for its own purpose. This is
749given the alternative name PG_fscache.
750
751PG_fscache is used to indicate that the page is known by the cache, and that
752the cache must be informed if the page is going to go away. It's an indication
753to the netfs that the cache has an interest in this page, where an interest may
754be a pointer to it, resources allocated or reserved for it, or I/O in progress
755upon it.
756
757The netfs can use this information in methods such as releasepage() to
758determine whether it needs to uncache a page or update it.
759
760Furthermore, if this bit is set, releasepage() and invalidatepage() operations
761will be called on a page to get rid of it, even if PG_private is not set. This
762allows caching to be attempted on a page before read_cache_pages() is called
763after fscache_read_or_alloc_pages() as the former will try and release pages it
764was given under certain circumstances.
765
766This bit does not overlap with bits such as PG_private. This means that FS-Cache
767can be used with a filesystem that uses the block buffering code.
768
769There are a number of operations defined on this flag:
770
771 int PageFsCache(struct page *page);
772 void SetPageFsCache(struct page *page);
773 void ClearPageFsCache(struct page *page);
774 int TestSetPageFsCache(struct page *page);
775 int TestClearPageFsCache(struct page *page);
776
777These functions are bit test, bit set, bit clear, bit test and set and bit
778test and clear operations on PG_fscache.
diff --git a/Documentation/filesystems/caching/object.txt b/Documentation/filesystems/caching/object.txt
new file mode 100644
index 000000000000..e8b0a35d8fe5
--- /dev/null
+++ b/Documentation/filesystems/caching/object.txt
@@ -0,0 +1,313 @@
1 ====================================================
2 IN-KERNEL CACHE OBJECT REPRESENTATION AND MANAGEMENT
3 ====================================================
4
5By: David Howells <dhowells@redhat.com>
6
7Contents:
8
9 (*) Representation
10
11 (*) Object management state machine.
12
13 - Provision of cpu time.
14 - Locking simplification.
15
16 (*) The set of states.
17
18 (*) The set of events.
19
20
21==============
22REPRESENTATION
23==============
24
25FS-Cache maintains an in-kernel representation of each object that a netfs is
26currently interested in. Such objects are represented by the fscache_cookie
27struct and are referred to as cookies.
28
29FS-Cache also maintains a separate in-kernel representation of the objects that
30a cache backend is currently actively caching. Such objects are represented by
31the fscache_object struct. The cache backends allocate these upon request, and
32are expected to embed them in their own representations. These are referred to
33as objects.
34
35There is a 1:N relationship between cookies and objects. A cookie may be
36represented by multiple objects - an index may exist in more than one cache -
37or even by no objects (it may not be cached).
38
39Furthermore, both cookies and objects are hierarchical. The two hierarchies
40correspond, but the cookies tree is a superset of the union of the object trees
41of multiple caches:
42
43 NETFS INDEX TREE : CACHE 1 : CACHE 2
44 : :
45 : +-----------+ :
46 +----------->| IObject | :
47 +-----------+ | : +-----------+ :
48 | ICookie |-------+ : | :
49 +-----------+ | : | : +-----------+
50 | +------------------------------>| IObject |
51 | : | : +-----------+
52 | : V : |
53 | : +-----------+ : |
54 V +----------->| IObject | : |
55 +-----------+ | : +-----------+ : |
56 | ICookie |-------+ : | : V
57 +-----------+ | : | : +-----------+
58 | +------------------------------>| IObject |
59 +-----+-----+ : | : +-----------+
60 | | : | : |
61 V | : V : |
62 +-----------+ | : +-----------+ : |
63 | ICookie |------------------------->| IObject | : |
64 +-----------+ | : +-----------+ : |
65 | V : | : V
66 | +-----------+ : | : +-----------+
67 | | ICookie |-------------------------------->| IObject |
68 | +-----------+ : | : +-----------+
69 V | : V : |
70 +-----------+ | : +-----------+ : |
71 | DCookie |------------------------->| DObject | : |
72 +-----------+ | : +-----------+ : |
73 | : : |
74 +-------+-------+ : : |
75 | | : : |
76 V V : : V
77 +-----------+ +-----------+ : : +-----------+
78 | DCookie | | DCookie |------------------------>| DObject |
79 +-----------+ +-----------+ : : +-----------+
80 : :
81
82In the above illustration, ICookie and IObject represent indices and DCookie
83and DObject represent data storage objects. Indices may have representation in
84multiple caches, but currently, non-index objects may not. Objects of any type
85may also be entirely unrepresented.
86
87As far as the netfs API goes, the netfs is only actually permitted to see
88pointers to the cookies. The cookies themselves and any objects attached to
89those cookies are hidden from it.
90
91
92===============================
93OBJECT MANAGEMENT STATE MACHINE
94===============================
95
96Within FS-Cache, each active object is managed by its own individual state
97machine. The state for an object is kept in the fscache_object struct, in
98object->state. A cookie may point to a set of objects that are in different
99states.
100
101Each state has an action associated with it that is invoked when the machine
102wakes up in that state. There are four logical sets of states:
103
104 (1) Preparation: states that wait for the parent objects to become ready. The
105 representations are hierarchical, and it is expected that an object must
106 be created or accessed with respect to its parent object.
107
108 (2) Initialisation: states that perform lookups in the cache and validate
109 what's found and that create on disk any missing metadata.
110
111 (3) Normal running: states that allow netfs operations on objects to proceed
112 and that update the state of objects.
113
114 (4) Termination: states that detach objects from their netfs cookies, that
115 delete objects from disk, that handle disk and system errors and that free
116 up in-memory resources.
117
118
119In most cases, transitioning between states is in response to signalled events.
120When a state has finished processing, it will usually set the mask of events in
121which it is interested (object->event_mask) and relinquish the worker thread.
122Then when an event is raised (by calling fscache_raise_event()), if the event
123is not masked, the object will be queued for processing (by calling
124fscache_enqueue_object()).
125
126
127PROVISION OF CPU TIME
128---------------------
129
130The work to be done by the various states is given CPU time by the threads of
131the slow work facility (see Documentation/slow-work.txt). This is used in
132preference to the workqueue facility because:
133
134 (1) Threads may be completely occupied for very long periods of time by a
135 particular work item. These state actions may be doing sequences of
136 synchronous, journalled disk accesses (lookup, mkdir, create, setxattr,
137 getxattr, truncate, unlink, rmdir, rename).
138
139 (2) Threads may do little actual work, but may rather spend a lot of time
140 sleeping on I/O. This means that single-threaded and 1-per-CPU-threaded
141 workqueues don't necessarily have the right numbers of threads.
142
143
144LOCKING SIMPLIFICATION
145----------------------
146
147Because only one worker thread may be operating on any particular object's
148state machine at once, this simplifies the locking, particularly with respect
149to disconnecting the netfs's representation of a cache object (fscache_cookie)
150from the cache backend's representation (fscache_object) - which may be
151requested from either end.
152
153
154=================
155THE SET OF STATES
156=================
157
158The object state machine has a set of states that it can be in. There are
159preparation states in which the object sets itself up and waits for its parent
160object to transit to a state that allows access to its children:
161
162 (1) State FSCACHE_OBJECT_INIT.
163
164 Initialise the object and wait for the parent object to become active. In
165 the cache, it is expected that it will not be possible to look an object
166 up from the parent object, until that parent object itself has been looked
167 up.
168
169There are initialisation states in which the object sets itself up and accesses
170disk for the object metadata:
171
172 (2) State FSCACHE_OBJECT_LOOKING_UP.
173
174 Look up the object on disk, using the parent as a starting point.
175 FS-Cache expects the cache backend to probe the cache to see whether this
176 object is represented there, and if it is, to see if it's valid (coherency
177 management).
178
179 The cache should call fscache_object_lookup_negative() to indicate lookup
180 failure for whatever reason, and should call fscache_obtained_object() to
181 indicate success.
182
183 At the completion of lookup, FS-Cache will let the netfs go ahead with
184 read operations, no matter whether the file is yet cached. If not yet
185 cached, read operations will be immediately rejected with ENODATA until
186 the first known page is uncached - as up to that point there can be no data
187 to be read out of the cache for that file that isn't currently also held
188 in the pagecache.
189
190 (3) State FSCACHE_OBJECT_CREATING.
191
192 Create an object on disk, using the parent as a starting point. This
193 happens if the lookup failed to find the object, or if the object's
194 coherency data indicated what's on disk is out of date. In this state,
195 FS-Cache expects the cache to create the object on disk.
196
197 The cache should call fscache_obtained_object() if creation completes
198 successfully, fscache_object_lookup_negative() otherwise.
199
200 At the completion of creation, FS-Cache will start processing write
201 operations the netfs has queued for an object. If creation failed, the
202 write ops will be transparently discarded, and nothing recorded in the
203 cache.
204
205There are some normal running states in which the object spends its time
206servicing netfs requests:
207
208 (4) State FSCACHE_OBJECT_AVAILABLE.
209
210 A transient state in which pending operations are started, child objects
211 are permitted to advance from FSCACHE_OBJECT_INIT state, and temporary
212 lookup data is freed.
213
214 (5) State FSCACHE_OBJECT_ACTIVE.
215
216 The normal running state. In this state, requests the netfs makes will be
217 passed on to the cache.
218
219 (6) State FSCACHE_OBJECT_UPDATING.
220
221 The state machine comes here to update the object in the cache from the
222 netfs's records. This involves updating the auxiliary data that is used
223 to maintain coherency.
224
225And there are terminal states in which an object cleans itself up, deallocates
226memory and potentially deletes stuff from disk:
227
228 (7) State FSCACHE_OBJECT_LC_DYING.
229
230 The object comes here if it is dying because of a lookup or creation
231 error. This would be due to a disk error or system error of some sort.
232 Temporary data is cleaned up, and the parent is released.
233
234 (8) State FSCACHE_OBJECT_DYING.
235
236 The object comes here if it is dying due to an error, because its parent
237 cookie has been relinquished by the netfs or because the cache is being
238 withdrawn.
239
240 Any child objects waiting on this one are given CPU time so that they too
241 can destroy themselves. This object waits for all its children to go away
242 before advancing to the next state.
243
244 (9) State FSCACHE_OBJECT_ABORT_INIT.
245
246 The object comes to this state if it was waiting on its parent in
247 FSCACHE_OBJECT_INIT, but its parent died. The object will destroy itself
248 so that the parent may proceed from the FSCACHE_OBJECT_DYING state.
249
250(10) State FSCACHE_OBJECT_RELEASING.
251(11) State FSCACHE_OBJECT_RECYCLING.
252
253 The object comes to one of these two states when dying once it is rid of
254 all its children, if it is dying because the netfs relinquished its
255 cookie. In the first state, the cached data is expected to persist, and
256 in the second it will be deleted.
257
258(12) State FSCACHE_OBJECT_WITHDRAWING.
259
260 The object transits to this state if the cache decides it wants to
261 withdraw the object from service, perhaps to make space, but also due to
262 error or just because the whole cache is being withdrawn.
263
264(13) State FSCACHE_OBJECT_DEAD.
265
266 The object transits to this state when the in-memory object record is
267 ready to be deleted. The object processor shouldn't ever see an object in
268 this state.
269
270
271THE SET OF EVENTS
272-----------------
273
274There are a number of events that can be raised to an object state machine:
275
276 (*) FSCACHE_OBJECT_EV_UPDATE
277
278 The netfs requested that an object be updated. The state machine will ask
279 the cache backend to update the object, and the cache backend will ask the
280 netfs for details of the change through its cookie definition ops.
281
282 (*) FSCACHE_OBJECT_EV_CLEARED
283
284 This is signalled in two circumstances:
285
286 (a) when an object's last child object is dropped and
287
288 (b) when the last operation outstanding on an object is completed.
289
290 This is used to proceed from the dying state.
291
292 (*) FSCACHE_OBJECT_EV_ERROR
293
294 This is signalled when an I/O error occurs during the processing of some
295 object.
296
297 (*) FSCACHE_OBJECT_EV_RELEASE
298 (*) FSCACHE_OBJECT_EV_RETIRE
299
300 These are signalled when the netfs relinquishes a cookie it was using.
301 The event selected depends on whether the netfs asks for the backing
302 object to be retired (deleted) or retained.
303
304 (*) FSCACHE_OBJECT_EV_WITHDRAW
305
306 This is signalled when the cache backend wants to withdraw an object.
307 This means that the object will have to be detached from the netfs's
308 cookie.
309
310Because the withdrawing/releasing/retiring events are all handled by the object
311state machine, it doesn't matter if there's a collision with both ends trying
312to sever the connection at the same time. The state machine can just pick
313which one it wants to honour, and that effects the other.
diff --git a/Documentation/filesystems/caching/operations.txt b/Documentation/filesystems/caching/operations.txt
new file mode 100644
index 000000000000..b6b070c57cbf
--- /dev/null
+++ b/Documentation/filesystems/caching/operations.txt
@@ -0,0 +1,213 @@
1 ================================
2 ASYNCHRONOUS OPERATIONS HANDLING
3 ================================
4
5By: David Howells <dhowells@redhat.com>
6
7Contents:
8
9 (*) Overview.
10
11 (*) Operation record initialisation.
12
13 (*) Parameters.
14
15 (*) Procedure.
16
17 (*) Asynchronous callback.
18
19
20========
21OVERVIEW
22========
23
24FS-Cache has an asynchronous operations handling facility that it uses for its
25data storage and retrieval routines. Its operations are represented by
26fscache_operation structs, though these are usually embedded into some other
27structure.
28
29This facility is available to and expected to be used by the cache backends,
30and FS-Cache will create operations and pass them off to the appropriate cache
31backend for completion.
32
33To make use of this facility, <linux/fscache-cache.h> should be #included.
34
35
36===============================
37OPERATION RECORD INITIALISATION
38===============================
39
40An operation is recorded in an fscache_operation struct:
41
42 struct fscache_operation {
43 union {
44 struct work_struct fast_work;
45 struct slow_work slow_work;
46 };
47 unsigned long flags;
48 fscache_operation_processor_t processor;
49 ...
50 };
51
52Someone wanting to issue an operation should allocate something with this
53struct embedded in it. They should initialise it by calling:
54
55 void fscache_operation_init(struct fscache_operation *op,
56 fscache_operation_release_t release);
57
58with the operation to be initialised and the release function to use.
59
60The op->flags parameter should be set to indicate the CPU time provision and
61the exclusivity (see the Parameters section).
62
63The op->fast_work, op->slow_work and op->processor fields should be set as
64appropriate for the CPU time provision (see the Parameters section).
65
66FSCACHE_OP_WAITING may be set in op->flags prior to each submission of the
67operation and waited for afterwards.
68
69
70==========
71PARAMETERS
72==========
73
74There are a number of parameters that can be set in the operation record's flag
75parameter. There are three options for the provision of CPU time in these
76operations:
77
78 (1) The operation may be done synchronously (FSCACHE_OP_MYTHREAD). A thread
79 may decide it wants to handle an operation itself without deferring it to
80 another thread.
81
82 This is, for example, used in read operations for calling readpages() on
83 the backing filesystem in CacheFiles. Although readpages() does an
84 asynchronous data fetch, the determination of whether pages exist is done
85 synchronously - and the netfs does not proceed until this has been
86 determined.
87
88 If this option is to be used, FSCACHE_OP_WAITING must be set in op->flags
89 before submitting the operation, and the operating thread must wait for it
90 to be cleared before proceeding:
91
92 wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
93 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
94
95
96 (2) The operation may be fast asynchronous (FSCACHE_OP_FAST), in which case it
97 will be given to keventd to process. Such an operation is not permitted
98 to sleep on I/O.
99
100 This is, for example, used by CacheFiles to copy data from a backing fs
101 page to a netfs page after the backing fs has read the page in.
102
103 If this option is used, op->fast_work and op->processor must be
104 initialised before submitting the operation:
105
106 INIT_WORK(&op->fast_work, do_some_work);
107
108
109 (3) The operation may be slow asynchronous (FSCACHE_OP_SLOW), in which case it
110 will be given to the slow work facility to process. Such an operation is
111 permitted to sleep on I/O.
112
113 This is, for example, used by FS-Cache to handle background writes of
114 pages that have just been fetched from a remote server.
115
116 If this option is used, op->slow_work and op->processor must be
117 initialised before submitting the operation:
118
119 fscache_operation_init_slow(op, processor)
120
121
122Furthermore, operations may be one of two types:
123
124 (1) Exclusive (FSCACHE_OP_EXCLUSIVE). Operations of this type may not run in
125 conjunction with any other operation on the object being operated upon.
126
127 An example of this is the attribute change operation, in which the file
128 being written to may need truncation.
129
130 (2) Shareable. Operations of this type may be running simultaneously. It's
131 up to the operation implementation to prevent interference between other
132 operations running at the same time.
133
134
135=========
136PROCEDURE
137=========
138
139Operations are used through the following procedure:
140
141 (1) The submitting thread must allocate the operation and initialise it
142 itself. Normally this would be part of a more specific structure with the
143 generic op embedded within.
144
145 (2) The submitting thread must then submit the operation for processing using
146 one of the following two functions:
147
148 int fscache_submit_op(struct fscache_object *object,
149 struct fscache_operation *op);
150
151 int fscache_submit_exclusive_op(struct fscache_object *object,
152 struct fscache_operation *op);
153
154 The first function should be used to submit non-exclusive ops and the
155 second to submit exclusive ones. The caller must still set the
156 FSCACHE_OP_EXCLUSIVE flag.
157
158 If successful, both functions will assign the operation to the specified
159 object and return 0. -ENOBUFS will be returned if the object specified is
160 permanently unavailable.
161
162 The operation manager will defer operations on an object that is still
163 undergoing lookup or creation. The operation will also be deferred if an
164 operation of conflicting exclusivity is in progress on the object.
165
166 If the operation is asynchronous, the manager will retain a reference to
167 it, so the caller should put their reference to it by passing it to:
168
169 void fscache_put_operation(struct fscache_operation *op);
170
171 (3) If the submitting thread wants to do the work itself, and has marked the
172 operation with FSCACHE_OP_MYTHREAD, then it should monitor
173 FSCACHE_OP_WAITING as described above and check the state of the object if
174 necessary (the object might have died whilst the thread was waiting).
175
176 When it has finished doing its processing, it should call
177 fscache_put_operation() on it.
178
179 (4) The operation holds an effective lock upon the object, preventing other
180 exclusive ops conflicting until it is released. The operation can be
181 enqueued for further immediate asynchronous processing by adjusting the
182 CPU time provisioning option if necessary, eg:
183
184 op->flags &= ~FSCACHE_OP_TYPE;
185 op->flags |= FSCACHE_OP_FAST;
186
187 and calling:
188
189 void fscache_enqueue_operation(struct fscache_operation *op)
190
191 This can be used to allow other things to have use of the worker thread
192 pools.
193
194
195=====================
196ASYNCHRONOUS CALLBACK
197=====================
198
199When used in asynchronous mode, the worker thread pool will invoke the
200processor method with a pointer to the operation. This should then get at the
201container struct by using container_of():
202
203 static void fscache_write_op(struct fscache_operation *_op)
204 {
205 struct fscache_storage *op =
206 container_of(_op, struct fscache_storage, op);
207 ...
208 }
209
210The caller holds a reference on the operation, and will invoke
211fscache_put_operation() when the processor function returns. The processor
212function is at liberty to call fscache_enqueue_operation() or to take extra
213references.
diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt
new file mode 100644
index 000000000000..0ced74c2f73c
--- /dev/null
+++ b/Documentation/filesystems/exofs.txt
@@ -0,0 +1,176 @@
1===============================================================================
2WHAT IS EXOFS?
3===============================================================================
4
5exofs is a file system that uses an OSD and exports the API of a normal Linux
6file system. Users access exofs like any other local file system, and exofs
7will in turn issue commands to the local OSD initiator.
8
9OSD is a new T10 command set that views storage devices not as a large/flat
10array of sectors but as a container of objects, each having a length, quota,
11time attributes and more. Each object is addressed by a 64bit ID, and is
12contained in a 64bit ID partition. Each object has associated attributes
13attached to it, which are an integral part of the object and provide metadata about
14the object. The standard defines some common obligatory attributes, but user
15attributes can be added as needed.
16
17===============================================================================
18ENVIRONMENT
19===============================================================================
20
21To use this file system, you need to have an object store to run it on. You
22may download a target from:
23http://open-osd.org
24
25See Documentation/scsi/osd.txt for how to setup a working osd environment.
26
27===============================================================================
28USAGE
29===============================================================================
30
311. Download and compile exofs and open-osd initiator:
32 You need an external Kernel source tree or kernel headers from your
33 distribution. (anything based on 2.6.26 or later).
34
35 a. download open-osd including exofs source using:
36 [parent-directory]$ git clone git://git.open-osd.org/open-osd.git
37
38 b. Build the library module like this:
39 [parent-directory]$ make -C KSRC=$(KER_DIR) open-osd
40
41 This will build both the open-osd initiator as well as the exofs kernel
42 module. Use whatever parameters you compiled your Kernel with and
43 $(KER_DIR) above pointing to the Kernel you compile against. See the file
44 open-osd/top-level-Makefile for an example.
45
462. Get the OSD initiator and target set up properly, and login to the target.
47 See Documentation/scsi/osd.txt for further instructions. Also see ./do-osd
48 for an example script that does all these steps.
49
503. Insmod the exofs.ko module:
51 [exofs]$ insmod exofs.ko
52
534. Make sure the directory where you want to mount exists. If not, create it.
54 (For example, mkdir /mnt/exofs)
55
565. At first run you will need to invoke the mkfs.exofs application
57
58 As an example, this will create the file system on:
59 /dev/osd0 partition ID 65536
60
61 mkfs.exofs --pid=65536 --format /dev/osd0
62
63 The --format is optional; if not specified, no OSD_FORMAT will be
64 performed and a clean file system will be created in the specified pid,
65 in the available space of the target. (Use --format=size_in_meg to limit
66 the total LUN space available)
67
68 If pid already exists it will be deleted and a new one will be created in its
69 place. Be careful.
70
71 An exofs lives inside a single OSD partition. You can create multiple exofs
72 filesystems on the same device using multiple pids.
73
74 (run mkfs.exofs without any parameters for usage help message)
75
766. Mount the file system.
77
78 For example, to mount /dev/osd0, partition ID 0x10000 on /mnt/exofs:
79
80 mount -t exofs -o pid=65536 /dev/osd0 /mnt/exofs/
81
827. For reference (See do-exofs example script):
83 do-exofs start - an example of how to perform the above steps.
84 do-exofs stop - an example of how to unmount the file system.
85 do-exofs format - an example of how to format and mkfs a new exofs.
86
878. Extra compilation flags (uncomment in fs/exofs/Kbuild):
88 CONFIG_EXOFS_DEBUG - for debug messages and extra checks.
89
90===============================================================================
91exofs mount options
92===============================================================================
93Similar to any mount command:
94 mount -t exofs -o exofs_options /dev/osdX mount_exofs_directory
95
96Where:
97 -t exofs: specifies the exofs file system
98
99 /dev/osdX: X is a decimal number. /dev/osdX was created after a successful
100 login into an OSD target.
101
102 mount_exofs_directory: The directory to mount the file system on
103
104 exofs specific options: Options are separated by commas (,)
105 pid=<integer> - The partition number to mount/create as
106 container of the filesystem.
107 This option is mandatory
108 to=<integer> - Timeout in ticks for a single command
109 default is (60 * HZ) [for debugging only]
110
111===============================================================================
112DESIGN
113===============================================================================
114
115* The file system control block (AKA on-disk superblock) resides in an object
116 with a special ID (defined in common.h).
117 Information included in the file system control block is used to fill the
118 in-memory superblock structure at mount time. This object is created before
119 the file system is used by mkexofs.c. It contains information such as:
120 - The file system's magic number
121 - The next inode number to be allocated
122
123* Each file resides in its own object and contains the data (and it will be
124 possible to extend the file over multiple objects, though this has not been
125 implemented yet).
126
127* A directory is treated as a file, and essentially contains a list of <file
128 name, inode #> pairs for files that are found in that directory. The object
129 IDs correspond to the files' inode numbers and will be allocated according to
130 a bitmap (stored in a separate object). Now they are allocated using a
131 counter.
132
133* Each file's control block (AKA on-disk inode) is stored in its object's
134 attributes. This applies to both regular files and other types (directories,
135 device files, symlinks, etc.).
136
137* Credentials are generated per object (inode and superblock) when they are
138 created in memory (read off disk or created). The credential works for all
139 operations and is used as long as the object remains in memory.
140
141* Async OSD operations are used whenever possible, but the target may execute
142 them out of order. The operations that concern us are create, delete,
143 readpage, writepage, update_inode, and truncate. The following pairs of
144 operations should execute in the order written, and we need to prevent them
145 from executing in reverse order:
146 - The following are handled with the OBJ_CREATED and OBJ_2BCREATED
147 flags. OBJ_CREATED is set when we know the object exists on the OSD -
148 in create's callback function, and when we successfully do a read_inode.
149 OBJ_2BCREATED is set in the beginning of the create function, so we
150 know that we should wait.
151 - create/delete: delete should wait until the object is created
152 on the OSD.
153 - create/readpage: readpage should be able to return a page
154 full of zeroes in this case. If there was a write already
155 en-route (i.e. create, writepage, readpage) then the page
156 would be locked, and so it would really be the same as
157 create/writepage.
158 - create/writepage: if writepage is called for a sync write, it
159 should wait until the object is created on the OSD.
160 Otherwise, it should just return.
161 - create/truncate: truncate should wait until the object is
162 created on the OSD.
163 - create/update_inode: update_inode should wait until the
164 object is created on the OSD.
165 - Handled by VFS locks:
166 - readpage/delete: shouldn't happen because of page lock.
167 - writepage/delete: shouldn't happen because of page lock.
168 - readpage/writepage: shouldn't happen because of page lock.
169
170===============================================================================
171LICENSE/COPYRIGHT
172===============================================================================
173The exofs file system is based on ext2 v0.5b (distributed with the Linux kernel
174version 2.6.10). All files include the original copyrights, and the license
175is GPL version 2 (only version 2, as is true for the Linux kernel). The
176Linux kernel can be downloaded from www.kernel.org.
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index e5f3833a6ef8..570f9bd9be2b 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -14,6 +14,11 @@ Options
14When mounting an ext3 filesystem, the following option are accepted: 14When mounting an ext3 filesystem, the following option are accepted:
15(*) == default 15(*) == default
16 16
17ro Mount filesystem read only. Note that ext3 will replay
18 the journal (and thus write to the partition) even when
19 mounted "read only". Mount options "ro,noload" can be
20 used to prevent writes to the filesystem.
21
17journal=update Update the ext3 file system's journal to the current 22journal=update Update the ext3 file system's journal to the current
18 format. 23 format.
19 24
@@ -27,7 +32,9 @@ journal_dev=devnum When the external journal device's major/minor numbers
27 identified through its new major/minor numbers encoded 32 identified through its new major/minor numbers encoded
28 in devnum. 33 in devnum.
29 34
30noload Don't load the journal on mounting. 35noload Don't load the journal on mounting. Note that this forces
36 mount of inconsistent filesystem, which can lead to
37 various problems.
31 38
32data=journal All data are committed into the journal prior to being 39data=journal All data are committed into the journal prior to being
33 written into the main file system. 40 written into the main file system.
@@ -92,9 +99,12 @@ nocheck
92 99
93debug Extra debugging information is sent to syslog. 100debug Extra debugging information is sent to syslog.
94 101
95errors=remount-ro(*) Remount the filesystem read-only on an error. 102errors=remount-ro Remount the filesystem read-only on an error.
96errors=continue Keep going on a filesystem error. 103errors=continue Keep going on a filesystem error.
97errors=panic Panic and halt the machine if an error occurs. 104errors=panic Panic and halt the machine if an error occurs.
105 (These mount options override the errors behavior
106 specified in the superblock, which can be
107 configured using tune2fs.)
98 108
99data_err=ignore(*) Just print an error message if an error occurs 109data_err=ignore(*) Just print an error message if an error occurs
100 in a file data buffer in ordered mode. 110 in a file data buffer in ordered mode.
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index cec829bc7291..97882df04865 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be
85* extent format more robust in face of on-disk corruption due to magics, 85* extent format more robust in face of on-disk corruption due to magics,
86* internal redundancy in tree 86* internal redundancy in tree
87* improved file allocation (multi-block alloc) 87* improved file allocation (multi-block alloc)
88* fix 32000 subdirectory limit 88* lift 32000 subdirectory limit imposed by i_links_count[1]
89* nsec timestamps for mtime, atime, ctime, create time 89* nsec timestamps for mtime, atime, ctime, create time
90* inode version field on disk (NFSv4, Lustre) 90* inode version field on disk (NFSv4, Lustre)
91* reduced e2fsck time via uninit_bg feature 91* reduced e2fsck time via uninit_bg feature
@@ -100,6 +100,9 @@ Note: More extensive information for getting started with ext4 can be
100* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force 100* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
101 the ordering) 101 the ordering)
102 102
103[1] Filesystems with a block size of 1k may see a limit imposed by the
104directory hash tree having a maximum depth of two.
105
1032.2 Candidate features for future inclusion 1062.2 Candidate features for future inclusion
104 107
105* Online defrag (patches available but not well tested) 108* Online defrag (patches available but not well tested)
@@ -180,8 +183,8 @@ commit=nrsec (*) Ext4 can be told to sync all its data and metadata
180 performance. 183 performance.
181 184
182barrier=<0|1(*)> This enables/disables the use of write barriers in 185barrier=<0|1(*)> This enables/disables the use of write barriers in
183 the jbd code. barrier=0 disables, barrier=1 enables. 186barrier(*) the jbd code. barrier=0 disables, barrier=1 enables.
184 This also requires an IO stack which can support 187nobarrier This also requires an IO stack which can support
185 barriers, and if jbd gets an error on a barrier 188 barriers, and if jbd gets an error on a barrier
186 write, it will disable again with a warning. 189 write, it will disable again with a warning.
187 Write barriers enforce proper on-disk ordering 190 Write barriers enforce proper on-disk ordering
@@ -189,6 +192,9 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in
189 safe to use, at some performance penalty. If 192 safe to use, at some performance penalty. If
190 your disks are battery-backed in one way or another, 193 your disks are battery-backed in one way or another,
191 disabling barriers may safely improve performance. 194 disabling barriers may safely improve performance.
195 The mount options "barrier" and "nobarrier" can
196 also be used to enable or disable barriers, for
197 consistency with other ext4 mount options.
192 198
193inode_readahead=n This tuning parameter controls the maximum 199inode_readahead=n This tuning parameter controls the maximum
194 number of inode table blocks that ext4's inode 200 number of inode table blocks that ext4's inode
@@ -310,6 +316,24 @@ journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the
310 a slightly higher priority than the default I/O 316 a slightly higher priority than the default I/O
311 priority. 317 priority.
312 318
319auto_da_alloc(*) Many broken applications don't use fsync() when
320noauto_da_alloc replacing existing files via patterns such as
321 fd = open("foo.new")/write(fd,..)/close(fd)/
322 rename("foo.new", "foo"), or worse yet,
323 fd = open("foo", O_TRUNC)/write(fd,..)/close(fd).
324 If auto_da_alloc is enabled, ext4 will detect
325 the replace-via-rename and replace-via-truncate
326 patterns and force that any delayed allocation
327 blocks are allocated such that at the next
328 journal commit, in the default data=ordered
329 mode, the data blocks of the new file are forced
330 to disk before the rename() operation is
331 committed. This provides roughly the same level
332 of guarantees as ext3, and avoids the
333 "zero-length" problem that can happen when a
334 system crashes before the delayed allocation
335 blocks are forced to disk.
336
313Data Mode 337Data Mode
314========= 338=========
315There are 3 different data modes: 339There are 3 different data modes:
diff --git a/Documentation/filesystems/knfsd-stats.txt b/Documentation/filesystems/knfsd-stats.txt
new file mode 100644
index 000000000000..64ced5149d37
--- /dev/null
+++ b/Documentation/filesystems/knfsd-stats.txt
@@ -0,0 +1,159 @@
1
2Kernel NFS Server Statistics
3============================
4
5This document describes the format and semantics of the statistics
6which the kernel NFS server makes available to userspace. These
7statistics are available in several text form pseudo files, each of
8which is described separately below.
9
10In most cases you don't need to know these formats, as the nfsstat(8)
11program from the nfs-utils distribution provides a helpful command-line
12interface for extracting and printing them.
13
14All the files described here are formatted as a sequence of text lines,
15separated by newline '\n' characters. Lines beginning with a hash
16'#' character are comments intended for humans and should be ignored
17by parsing routines. All other lines contain a sequence of fields
18separated by whitespace.
19
20/proc/fs/nfsd/pool_stats
21------------------------
22
23This file is available in kernels from 2.6.30 onwards, if the
24/proc/fs/nfsd filesystem is mounted (it almost always should be).
25
26The first line is a comment which describes the fields present in
27all the other lines. The other lines present the following data as
28a sequence of unsigned decimal numeric fields. One line is shown
29for each NFS thread pool.
30
31All counters are 64 bits wide and wrap naturally. There is no way
32to zero these counters, instead applications should do their own
33rate conversion.
34
35pool
36 The id number of the NFS thread pool to which this line applies.
37 This number does not change.
38
39 Thread pool ids are a contiguous set of small integers starting
40 at zero. The maximum value depends on the thread pool mode, but
41 currently cannot be larger than the number of CPUs in the system.
42 Note that in the default case there will be a single thread pool
43 which contains all the nfsd threads and all the CPUs in the system,
44 and thus this file will have a single line with a pool id of "0".
45
46packets-arrived
47 Counts how many NFS packets have arrived. More precisely, this
48 is the number of times that the network stack has notified the
49 sunrpc server layer that new data may be available on a transport
50 (e.g. an NFS or UDP socket or an NFS/RDMA endpoint).
51
52 Depending on the NFS workload patterns and various network stack
53 effects (such as Large Receive Offload) which can combine packets
54 on the wire, this may be either more or less than the number
55 of NFS calls received (which statistic is available elsewhere).
56 However this is a more accurate and less workload-dependent measure
57 of how much CPU load is being placed on the sunrpc server layer
58 due to NFS network traffic.
59
60sockets-enqueued
61 Counts how many times an NFS transport is enqueued to wait for
62 an nfsd thread to service it, i.e. no nfsd thread was considered
63 available.
64
65 The circumstance this statistic tracks indicates that there was NFS
66 network-facing work to be done but it couldn't be done immediately,
67 thus introducing a small delay in servicing NFS calls. The ideal
68 rate of change for this counter is zero; significantly non-zero
69 values may indicate a performance limitation.
70
71 This can happen either because there are too few nfsd threads in the
72 thread pool for the NFS workload (the workload is thread-limited),
73 or because the NFS workload needs more CPU time than is available in
74 the thread pool (the workload is CPU-limited). In the former case,
75 configuring more nfsd threads will probably improve the performance
76 of the NFS workload. In the latter case, the sunrpc server layer is
77 already choosing not to wake idle nfsd threads because there are too
78 many nfsd threads which want to run but cannot, so configuring more
79 nfsd threads will make no difference whatsoever. The overloads-avoided
80 statistic (see below) can be used to distinguish these cases.
81
82threads-woken
83 Counts how many times an idle nfsd thread is woken to try to
84 receive some data from an NFS transport.
85
86 This statistic tracks the circumstance where incoming
87 network-facing NFS work is being handled quickly, which is a good
88 thing. The ideal rate of change for this counter will be close
89 to but less than the rate of change of the packets-arrived counter.
90
91overloads-avoided
92 Counts how many times the sunrpc server layer chose not to wake an
93 nfsd thread, despite the presence of idle nfsd threads, because
94 too many nfsd threads had been recently woken but could not get
95 enough CPU time to actually run.
96
97 This statistic counts a circumstance where the sunrpc layer
98 heuristically avoids overloading the CPU scheduler with too many
99 runnable nfsd threads. The ideal rate of change for this counter
100 is zero. Significant non-zero values indicate that the workload
101 is CPU limited. Usually this is associated with heavy CPU usage
102 on all the CPUs in the nfsd thread pool.
103
104 If a sustained large overloads-avoided rate is detected on a pool,
105 the top(1) utility should be used to check for the following
106 pattern of CPU usage on all the CPUs associated with the given
107 nfsd thread pool.
108
109 - %us ~= 0 (as you're *NOT* running applications on your NFS server)
110
111 - %wa ~= 0
112
113 - %id ~= 0
114
115 - %sy + %hi + %si ~= 100
116
117 If this pattern is seen, configuring more nfsd threads will *not*
118 improve the performance of the workload. If this pattern is not
119 seen, then something more subtle is wrong.
120
121threads-timedout
122 Counts how many times an nfsd thread triggered an idle timeout,
123 i.e. was not woken to handle any incoming network packets for
124 some time.
125
126 This statistic counts a circumstance where there are more nfsd
127 threads configured than can be used by the NFS workload. This is
128 a clue that the number of nfsd threads can be reduced without
129 affecting performance. Unfortunately, it's only a clue and not
130 a strong indication, for a couple of reasons:
131
132 - Currently the rate at which the counter is incremented is quite
133 slow; the idle timeout is 60 minutes. Unless the NFS workload
134 remains constant for hours at a time, this counter is unlikely
135 to be providing information that is still useful.
136
137 - It is usually a wise policy to provide some slack,
138 i.e. configure a few more nfsds than are currently needed,
139 to allow for future spikes in load.
140
141
142Note that incoming packets on NFS transports will be dealt with in
143one of three ways. An nfsd thread can be woken (threads-woken counts
144this case), or the transport can be enqueued for later attention
145(sockets-enqueued counts this case), or the packet can be temporarily
146deferred because the transport is currently being used by an nfsd
147thread. This last case is not very interesting and is not explicitly
148counted, but can be inferred from the other counters thus:
149
150packets-deferred = packets-arrived - ( sockets-enqueued + threads-woken )
151
152
153More
154----
155Descriptions of the other statistics files should go here.
156
157
158Greg Banks <gnb@sgi.com>
15926 Mar 2009
diff --git a/Documentation/filesystems/nfs41-server.txt b/Documentation/filesystems/nfs41-server.txt
new file mode 100644
index 000000000000..05d81cbcb2e1
--- /dev/null
+++ b/Documentation/filesystems/nfs41-server.txt
@@ -0,0 +1,161 @@
1NFSv4.1 Server Implementation
2
3Server support for minorversion 1 can be controlled using the
4/proc/fs/nfsd/versions control file. The string output returned
5by reading this file will contain either "+4.1" or "-4.1"
6correspondingly.
7
8Currently, server support for minorversion 1 is disabled by default.
9It can be enabled at run time by writing the string "+4.1" to
10the /proc/fs/nfsd/versions control file. Note that to write this
11control file, the nfsd service must be taken down. Use your user-mode
12nfs-utils to set this up; see rpc.nfsd(8)
13
14The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based
15on the latest NFSv4.1 Internet Draft:
16http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29
17
18From the many new features in NFSv4.1 the current implementation
19focuses on the mandatory-to-implement NFSv4.1 Sessions, providing
20"exactly once" semantics and better control and throttling of the
21resources allocated for each client.
22
23Other NFSv4.1 features, Parallel NFS operations in particular,
24are still under development out of tree.
25See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design
26for more information.
27
28The table below, taken from the NFSv4.1 document, lists
29the operations that are mandatory to implement (REQ), optional
30(OPT), and NFSv4.0 operations that are required not to implement (MNI)
31in minor version 1. The first column indicates the operations that
32are not supported yet by the linux server implementation.
33
34The OPTIONAL features identified and their abbreviations are as follows:
35 pNFS Parallel NFS
36 FDELG File Delegations
37 DDELG Directory Delegations
38
39The following abbreviations indicate the linux server implementation status.
40 I Implemented NFSv4.1 operations.
41 NS Not Supported.
42 NS* unimplemented optional feature.
43 P pNFS features implemented out of tree.
44 PNS pNFS features that are not supported yet (out of tree).
45
46Operations
47
48 +----------------------+------------+--------------+----------------+
49 | Operation | REQ, REC, | Feature | Definition |
50 | | OPT, or | (REQ, REC, | |
51 | | MNI | or OPT) | |
52 +----------------------+------------+--------------+----------------+
53 | ACCESS | REQ | | Section 18.1 |
54NS | BACKCHANNEL_CTL | REQ | | Section 18.33 |
55NS | BIND_CONN_TO_SESSION | REQ | | Section 18.34 |
56 | CLOSE | REQ | | Section 18.2 |
57 | COMMIT | REQ | | Section 18.3 |
58 | CREATE | REQ | | Section 18.4 |
59I | CREATE_SESSION | REQ | | Section 18.36 |
60NS*| DELEGPURGE | OPT | FDELG (REQ) | Section 18.5 |
61 | DELEGRETURN | OPT | FDELG, | Section 18.6 |
62 | | | DDELG, pNFS | |
63 | | | (REQ) | |
64NS | DESTROY_CLIENTID | REQ | | Section 18.50 |
65I | DESTROY_SESSION | REQ | | Section 18.37 |
66I | EXCHANGE_ID | REQ | | Section 18.35 |
67NS | FREE_STATEID | REQ | | Section 18.38 |
68 | GETATTR | REQ | | Section 18.7 |
69P | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 |
70P | GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 |
71 | GETFH | REQ | | Section 18.8 |
72NS*| GET_DIR_DELEGATION | OPT | DDELG (REQ) | Section 18.39 |
73P | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 |
74P | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 |
75P | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 |
76 | LINK | OPT | | Section 18.9 |
77 | LOCK | REQ | | Section 18.10 |
78 | LOCKT | REQ | | Section 18.11 |
79 | LOCKU | REQ | | Section 18.12 |
80 | LOOKUP | REQ | | Section 18.13 |
81 | LOOKUPP | REQ | | Section 18.14 |
82 | NVERIFY | REQ | | Section 18.15 |
83 | OPEN | REQ | | Section 18.16 |
84NS*| OPENATTR | OPT | | Section 18.17 |
85 | OPEN_CONFIRM | MNI | | N/A |
86 | OPEN_DOWNGRADE | REQ | | Section 18.18 |
87 | PUTFH | REQ | | Section 18.19 |
88 | PUTPUBFH | REQ | | Section 18.20 |
89 | PUTROOTFH | REQ | | Section 18.21 |
90 | READ | REQ | | Section 18.22 |
91 | READDIR | REQ | | Section 18.23 |
92 | READLINK | OPT | | Section 18.24 |
93NS | RECLAIM_COMPLETE | REQ | | Section 18.51 |
94 | RELEASE_LOCKOWNER | MNI | | N/A |
95 | REMOVE | REQ | | Section 18.25 |
96 | RENAME | REQ | | Section 18.26 |
97 | RENEW | MNI | | N/A |
98 | RESTOREFH | REQ | | Section 18.27 |
99 | SAVEFH | REQ | | Section 18.28 |
100 | SECINFO | REQ | | Section 18.29 |
101NS | SECINFO_NO_NAME | REC | pNFS files | Section 18.45, |
102 | | | layout (REQ) | Section 13.12 |
103I | SEQUENCE | REQ | | Section 18.46 |
104 | SETATTR | REQ | | Section 18.30 |
105 | SETCLIENTID | MNI | | N/A |
106 | SETCLIENTID_CONFIRM | MNI | | N/A |
107NS | SET_SSV | REQ | | Section 18.47 |
108NS | TEST_STATEID | REQ | | Section 18.48 |
109 | VERIFY | REQ | | Section 18.31 |
110NS*| WANT_DELEGATION | OPT | FDELG (OPT) | Section 18.49 |
111 | WRITE | REQ | | Section 18.32 |
112
113Callback Operations
114
115 +-------------------------+-----------+-------------+---------------+
116 | Operation | REQ, REC, | Feature | Definition |
117 | | OPT, or | (REQ, REC, | |
118 | | MNI | or OPT) | |
119 +-------------------------+-----------+-------------+---------------+
120 | CB_GETATTR | OPT | FDELG (REQ) | Section 20.1 |
121P | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 |
122NS*| CB_NOTIFY | OPT | DDELG (REQ) | Section 20.4 |
123P | CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 |
124NS*| CB_NOTIFY_LOCK | OPT | | Section 20.11 |
125NS*| CB_PUSH_DELEG | OPT | FDELG (OPT) | Section 20.5 |
126 | CB_RECALL | OPT | FDELG, | Section 20.2 |
127 | | | DDELG, pNFS | |
128 | | | (REQ) | |
129NS*| CB_RECALL_ANY | OPT | FDELG, | Section 20.6 |
130 | | | DDELG, pNFS | |
131 | | | (REQ) | |
132NS | CB_RECALL_SLOT | REQ | | Section 20.8 |
133NS*| CB_RECALLABLE_OBJ_AVAIL | OPT | DDELG, pNFS | Section 20.7 |
134 | | | (REQ) | |
135I | CB_SEQUENCE | OPT | FDELG, | Section 20.9 |
136 | | | DDELG, pNFS | |
137 | | | (REQ) | |
138NS*| CB_WANTS_CANCELLED | OPT | FDELG, | Section 20.10 |
139 | | | DDELG, pNFS | |
140 | | | (REQ) | |
141 +-------------------------+-----------+-------------+---------------+
142
143Implementation notes:
144
145EXCHANGE_ID:
146* only SP4_NONE state protection supported
147* implementation ids are ignored
148
149CREATE_SESSION:
150* backchannel attributes are ignored
151* backchannel security parameters are ignored
152
153SEQUENCE:
154* no support for dynamic slot table renegotiation (optional)
155
156nfsv4.1 COMPOUND rules:
157The following cases aren't supported yet:
158* Enforcing of NFS4ERR_NOT_ONLY_OP for: BIND_CONN_TO_SESSION, CREATE_SESSION,
159 DESTROY_CLIENTID, DESTROY_SESSION, EXCHANGE_ID.
160* DESTROY_SESSION MUST be the final operation in the COMPOUND request.
161
diff --git a/Documentation/filesystems/pohmelfs/design_notes.txt b/Documentation/filesystems/pohmelfs/design_notes.txt
new file mode 100644
index 000000000000..6d6db60d567d
--- /dev/null
+++ b/Documentation/filesystems/pohmelfs/design_notes.txt
@@ -0,0 +1,70 @@
1POHMELFS: Parallel Optimized Host Message Exchange Layered File System.
2
3 Evgeniy Polyakov <zbr@ioremap.net>
4
5Homepage: http://www.ioremap.net/projects/pohmelfs
6
7POHMELFS first began as a network filesystem with coherent local data and
8metadata caches but is now evolving into a parallel distributed filesystem.
9
10Main features of this FS include:
11 * Locally coherent cache for data and metadata with (potentially) byte-range locks.
12 Since all Linux filesystems lock the whole inode during writing, algorithm
13 is very simple and does not use byte-ranges, although they are sent in
14 locking messages.
15 * Completely async processing of all events except creation of hard and symbolic
16 links, and rename events.
17 Object creation and data reading and writing are processed asynchronously.
18 * Flexible object architecture optimized for network processing.
19 Ability to create long paths to objects and remove arbitrarily huge
20 directories with a single network command.
21 (like removing the whole kernel tree via a single network command).
22 * Very high performance.
23 * Fast and scalable multithreaded userspace server. Being in userspace it works
24 with any underlying filesystem and still is much faster than async in-kernel NFS one.
25 * Client is able to switch between different servers (if one goes down, client
26 automatically reconnects to second and so on).
27 * Transactions support. Full failover for all operations.
28 Resending transactions to different servers on timeout or error.
29 * Read request (data read, directory listing, lookup requests) balancing between multiple servers.
30 * Write requests are replicated to multiple servers and completed only when all of them are acked.
31 * Ability to add and/or remove servers from the working set at run-time.
32 * Strong authentication and possible data encryption in network channel.
33 * Extended attributes support.
34
35POHMELFS is based on transactions, which are potentially long-standing objects that live
36in the client's memory. Each transaction contains all the information needed to process a given
37command (or set of commands, which is frequently used during data writing: single transactions
38can contain creation and data writing commands). Transactions are committed by all the servers
39to which they are sent and, in case of failures, are eventually resent or dropped with an error.
40For example, reading will return an error if no servers are available.
41
42POHMELFS uses an asynchronous approach to data processing. Courtesy of transactions, it is
43possible to detach replies from requests and, if the command requires data to be received, the
44caller sleeps waiting for it. Thus, it is possible to issue multiple read commands to different
45servers and async threads will pick up replies in parallel, find appropriate transactions in the
46system and put the data where it belongs (like the page or inode cache).
47
48The main feature of POHMELFS is writeback data and the metadata cache.
49Only a few non-performance critical operations use the write-through cache and
50are synchronous: hard and symbolic link creation, and object rename. Creation,
51removal of objects and data writing are asynchronous and are sent to
52the server during system writeback. Only one writer at a time is allowed for any
53given inode, which is guarded by an appropriate locking protocol.
54Because of this feature, POHMELFS is extremely fast at metadata intensive
55workloads and can fully utilize the bandwidth to the servers when doing bulk
56data transfers.
57
58POHMELFS clients operate with a working set of servers and are capable of balancing read-only
59operations (like lookups or directory listings) between them.
60Administrators can add or remove servers from the set at run-time via special commands (described
61in Documentation/filesystems/pohmelfs/info.txt file). Writes are replicated to all servers.
62
63POHMELFS is capable of full data channel encryption and/or strong crypto hashing.
64One can select any kernel supported cipher, encryption mode, hash type and operation mode
65(hmac or digest). It is also possible to use both or neither (default). Crypto configuration
66is checked during mount time and, if the server does not support it, appropriate capabilities
67will be disabled or mount will fail (if 'crypto_fail_unsupported' mount option is specified).
68Crypto performance heavily depends on the number of crypto threads, which asynchronously perform
69crypto operations and send the resulting data to server or submit it up the stack. This number
70can be controlled via a mount option.
diff --git a/Documentation/filesystems/pohmelfs/info.txt b/Documentation/filesystems/pohmelfs/info.txt
new file mode 100644
index 000000000000..4e3d50157083
--- /dev/null
+++ b/Documentation/filesystems/pohmelfs/info.txt
@@ -0,0 +1,86 @@
1POHMELFS usage information.
2
3Mount options:
4idx=%u
5 Each mountpoint is associated with a special index via this option.
6 Administrator can add or remove servers from the given index, so all mounts,
7 which were attached to it, are updated.
8 Default is 0.
9
10trans_scan_timeout=%u
11 This timeout, expressed in milliseconds, specifies time to scan transaction
12 trees looking for stale requests, which have to be resent, or if number of
13 retries exceeds specified limit, dropped with error.
14 Default is 5 seconds.
15
16drop_scan_timeout=%u
17 Internal timeout, expressed in milliseconds, which specifies how frequently
18 inodes marked to be dropped are freed. It also specifies how frequently
19 the system checks that servers have to be added or removed from current working set.
20 Default is 1 second.
21
22wait_on_page_timeout=%u
23 Number of milliseconds to wait for reply from remote server for data reading command.
24 If this timeout is exceeded, reading returns an error.
25 Default is 5 seconds.
26
27trans_retries=%u
28 This is the number of times that a transaction will be resent to a server that did
29 not answer for the last @trans_scan_timeout milliseconds.
30 When the number of resends exceeds this limit, the transaction is completed with error.
31 Default is 5 resends.
32
33crypto_thread_num=%u
34 Number of crypto processing threads. Threads are used both for RX and TX traffic.
35 Default is 2, or no threads if crypto operations are not supported.
36
37trans_max_pages=%u
38 Maximum number of pages in a single transaction. This parameter also controls
39 the number of pages, allocated for crypto processing (each crypto thread has
40 pool of pages, the number of which is equal to 'trans_max_pages').
41 Default is 100 pages.
42
43crypto_fail_unsupported
44 If specified, mount will fail if the server does not support requested crypto operations.
45 By default mount will disable non-matching crypto operations.
46
47mcache_timeout=%u
48 Maximum number of milliseconds to wait for the mcache objects to be processed.
49 Mcache includes locks (given lock should be granted by server), attributes (they should be
50 fully received in the given timeframe).
51 Default is 5 seconds.
52
53Usage examples.
54
55Add (or remove if it already exists) server server1.net:1025 into the working set with index $idx
56with appropriate hash algorithm and key file and cipher algorithm, mode and key file:
57$cfg -a server1.net -p 1025 -i $idx -K $hash_key -k $cipher_key
58
59Mount filesystem with given index $idx to /mnt mountpoint.
60Client will connect to all servers specified in the working set via previous command:
61mount -t pohmel -o idx=$idx q /mnt
62
63One can add or remove servers from working set after mounting too.
64
65
66Server installation.
67
68Creating a server, which listens at port 1025 and 0.0.0.0 address.
69Working root directory (note, that server chroots there, so you have to have appropriate permissions)
70is set to /mnt, server will negotiate hash/cipher with client, in case client requested it, there
71are appropriate key files.
72Number of working threads is set to 10.
73
74# ./fserver -a 0.0.0.0 -p 1025 -r /mnt -w 10 -K hash_key -k cipher_key
75
76 -A 6 - listen on ipv6 address. Default: Disabled.
77 -r root - path to root directory. Default: /tmp.
78 -a addr - listen address. Default: 0.0.0.0.
79 -p port - listen port. Default: 1025.
80 -w workers - number of workers per connected client. Default: 1.
81 -K file - hash key size. Default: none.
82 -k file - cipher key size. Default: none.
83 -h - this help.
84
85Number of worker threads specifies how many workers will be created for each client.
86Bulk single-client transfers usually are better handled with smaller number (like 1-3).
diff --git a/Documentation/filesystems/pohmelfs/network_protocol.txt b/Documentation/filesystems/pohmelfs/network_protocol.txt
new file mode 100644
index 000000000000..40ea6c295afb
--- /dev/null
+++ b/Documentation/filesystems/pohmelfs/network_protocol.txt
@@ -0,0 +1,227 @@
1POHMELFS network protocol.
2
3Basic structure used in network communication is following command:
4
5struct netfs_cmd
6{
7 __u16 cmd; /* Command number */
8 __u16 csize; /* Attached crypto information size */
9 __u16 cpad; /* Attached padding size */
10 __u16 ext; /* External flags */
11 __u32 size; /* Size of the attached data */
12 __u32 trans; /* Transaction id */
13 __u64 id; /* Object ID to operate on. Used for feedback.*/
14 __u64 start; /* Start of the object. */
15 __u64 iv; /* IV sequence */
16 __u8 data[0];
17};
18
19Commands can be embedded into transaction command (which in turn has own command),
20so one can extend protocol as needed without breaking backward compatibility as long
21as old commands are supported. All string lengths include tail 0 byte.
22
23All commands are transferred over the network in big-endian. CPU endianness is used at the end peers.
24
25@cmd - command number, which specifies command to be processed. Following
26 commands are used currently:
27
28 NETFS_READDIR = 1, /* Read directory for given inode number */
29 NETFS_READ_PAGE, /* Read data page from the server */
30 NETFS_WRITE_PAGE, /* Write data page to the server */
31 NETFS_CREATE, /* Create directory entry */
32 NETFS_REMOVE, /* Remove directory entry */
33 NETFS_LOOKUP, /* Lookup single object */
34 NETFS_LINK, /* Create a link */
35 NETFS_TRANS, /* Transaction */
36 NETFS_OPEN, /* Open intent */
37 NETFS_INODE_INFO, /* Metadata cache coherency synchronization message */
38 NETFS_PAGE_CACHE, /* Page cache invalidation message */
39 NETFS_READ_PAGES, /* Read multiple contiguous pages in one go */
40 NETFS_RENAME, /* Rename object */
41 NETFS_CAPABILITIES, /* Capabilities of the client, for example supported crypto */
42 NETFS_LOCK, /* Distributed lock message */
43 NETFS_XATTR_SET, /* Set extended attribute */
44 NETFS_XATTR_GET, /* Get extended attribute */
45
46@ext - external flags. Used by different commands to specify some extra arguments
47 like partial size of the embedded objects or creation flags.
48
49@size - size of the attached data. For NETFS_READ_PAGE and NETFS_READ_PAGES no data is attached,
50 but size of the requested data is incorporated here. It does not include size of the command
51 header (struct netfs_cmd) itself.
52
53@id - id of the object this command operates on. Each command can use it for own purpose.
54
55@start - start of the object this command operates on. Each command can use it for own purpose.
56
57@csize, @cpad - size and padding size of the (attached if needed) crypto information.
58
59Command specifications.
60
61@NETFS_READDIR
62This command is used to sync content of the remote dir to the client.
63
64@ext - length of the path to object.
65@size - the same.
66@id - local inode number of the directory to read.
67@start - zero.
68
69
70@NETFS_READ_PAGE
71This command is used to read data from remote server.
72Data size does not exceed local page cache size.
73
74@id - inode number.
75@start - first byte offset.
76@size - number of bytes to read plus length of the path to object.
77@ext - object path length.
78
79
80@NETFS_CREATE
81Used to create object.
82It does not require that all directories on top of the object were
83already created, it will create them automatically. Each object has
84associated @netfs_path_entry data structure, which contains creation
85mode (permissions and type) and length of the name as well as the name itself.
86
87@start - 0
88@size - size of the all data structures needed to create a path
89@id - local inode number
90@ext - 0
91
92
93@NETFS_REMOVE
94Used to remove object.
95
96@ext - length of the path to object.
97@size - the same.
98@id - local inode number.
99@start - zero.
100
101
102@NETFS_LOOKUP
103Lookup information about object on server.
104
105@ext - length of the path to object.
106@size - the same.
107@id - local inode number of the directory to look object in.
108@start - local inode number of the object to look at.
109
110
111@NETFS_LINK
112Create hard link or symlink.
113Command is sent as "object_path|target_path".
114
115@size - size of the above string.
116@id - parent local inode number.
117@start - 1 for symlink, 0 for hardlink.
118@ext - size of the "object_path" above.
119
120
121@NETFS_TRANS
122Transaction header.
123
124@size - incorporates all embedded command sizes including their header sizes.
125@start - transaction generation number - unique id used to find transaction.
126@ext - transaction flags. Unused at the moment.
127@id - 0.
128
129
130@NETFS_OPEN
131Open intent for given transaction.
132
133@id - local inode number.
134@start - 0.
135@size - path length to the object.
136@ext - open flags (O_RDWR and so on).
137
138
139@NETFS_INODE_INFO
140Metadata update command.
141It is sent to servers when attributes of the object are changed and received
142when data or metadata were updated. It operates with the following structure:
143
144struct netfs_inode_info
145{
146 unsigned int mode;
147 unsigned int nlink;
148 unsigned int uid;
149 unsigned int gid;
150 unsigned int blocksize;
151 unsigned int padding;
152 __u64 ino;
153 __u64 blocks;
154 __u64 rdev;
155 __u64 size;
156 __u64 version;
157};
158
159It effectively mirrors stat(2) returned data.
160
161
162@ext - path length to the object.
163@size - the same plus size of the netfs_inode_info structure.
164@id - local inode number.
165@start - 0.
166
167
168@NETFS_PAGE_CACHE
169Command is only received by clients. It contains information about
170page to be marked as not up-to-date.
171
172@id - client's inode number.
173@start - last byte of the page to be invalidated. If it is not equal to
174 current inode size, it will be vmtruncated().
175@size - 0
176@ext - 0
177
178
179@NETFS_READ_PAGES
180Used to read multiple contiguous pages in one go.
181
182@start - first byte of the contiguous region to read.
183@size - consists of two fields: lower 8 bits are used to represent page cache shift
184 used by client, another 3 bytes are used to get number of pages.
185@id - local inode number.
186@ext - path length to the object.
187
188
189@NETFS_RENAME
190Used to rename object.
191Attached data is formed into following string: "old_path|new_path".
192
193@id - local inode number.
194@start - parent inode number.
195@size - length of the above string.
196@ext - length of the old path part.
197
198
199@NETFS_CAPABILITIES
200Used to exchange crypto capabilities with server.
201If crypto capabilities are not supported by server, then client will disable it
202or fail (if 'crypto_fail_unsupported' mount option was specified).
203
204@id - superblock index. Used to specify crypto information for group of servers.
205@size - size of the attached capabilities structure.
206@start - 0.
207@size - 0.
208@csize - 0.
209
210@NETFS_LOCK
211Used to send lock request/release messages. Although it sends byte range request
212and is capable of flushing pages based on that, it is not used, since all Linux
213filesystems lock the whole inode.
214
215@id - lock generation number.
216@start - start of the locked range.
217@size - size of the locked range.
218@ext - lock type: read/write. Not used actually. 15'th bit is used to determine,
219 if it is lock request (1) or release (0).
220
221@NETFS_XATTR_SET
222@NETFS_XATTR_GET
223Used to set/get extended attributes for given inode.
224@id - attribute generation number or xattr setting type
225@start - size of the attribute (request or attached)
226@size - name length, path len and data size for given attribute
227@ext - path length for given object
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 830bad7cce0f..ce84cfc9eae0 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -5,6 +5,7 @@
5 Bodo Bauer <bb@ricochet.net> 5 Bodo Bauer <bb@ricochet.net>
6 6
72.4.x update Jorge Nerin <comandante@zaralinux.com> November 14 2000 72.4.x update Jorge Nerin <comandante@zaralinux.com> November 14 2000
8move /proc/sys Shen Feng <shen@cn.fujitsu.com> April 1 2009
8------------------------------------------------------------------------------ 9------------------------------------------------------------------------------
9Version 1.3 Kernel version 2.2.12 10Version 1.3 Kernel version 2.2.12
10 Kernel version 2.4.0-test11-pre4 11 Kernel version 2.4.0-test11-pre4
@@ -26,25 +27,17 @@ Table of Contents
26 1.6 Parallel port info in /proc/parport 27 1.6 Parallel port info in /proc/parport
27 1.7 TTY info in /proc/tty 28 1.7 TTY info in /proc/tty
28 1.8 Miscellaneous kernel statistics in /proc/stat 29 1.8 Miscellaneous kernel statistics in /proc/stat
30 1.9 Ext4 file system parameters
29 31
30 2 Modifying System Parameters 32 2 Modifying System Parameters
31 2.1 /proc/sys/fs - File system data 33
32 2.2 /proc/sys/fs/binfmt_misc - Miscellaneous binary formats 34 3 Per-Process Parameters
33 2.3 /proc/sys/kernel - general kernel parameters 35 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score
34 2.4 /proc/sys/vm - The virtual memory subsystem 36 3.2 /proc/<pid>/oom_score - Display current oom-killer score
35 2.5 /proc/sys/dev - Device specific parameters 37 3.3 /proc/<pid>/io - Display the IO accounting fields
36 2.6 /proc/sys/sunrpc - Remote procedure calls 38 3.4 /proc/<pid>/coredump_filter - Core dump filtering settings
37 2.7 /proc/sys/net - Networking stuff 39 3.5 /proc/<pid>/mountinfo - Information about mounts
38 2.8 /proc/sys/net/ipv4 - IPV4 settings 40
39 2.9 Appletalk
40 2.10 IPX
41 2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem
42 2.12 /proc/<pid>/oom_adj - Adjust the oom-killer score
43 2.13 /proc/<pid>/oom_score - Display current oom-killer score
44 2.14 /proc/<pid>/io - Display the IO accounting fields
45 2.15 /proc/<pid>/coredump_filter - Core dump filtering settings
46 2.16 /proc/<pid>/mountinfo - Information about mounts
47 2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface
48 41
49------------------------------------------------------------------------------ 42------------------------------------------------------------------------------
50Preface 43Preface
@@ -940,27 +933,6 @@ Table 1-10: Files in /proc/fs/ext4/<devname>
940 File Content 933 File Content
941 mb_groups details of multiblock allocator buddy cache of free blocks 934 mb_groups details of multiblock allocator buddy cache of free blocks
942 mb_history multiblock allocation history 935 mb_history multiblock allocation history
943 stats controls whether the multiblock allocator should start
944 collecting statistics, which are shown during the unmount
945 group_prealloc the multiblock allocator will round up allocation
946 requests to a multiple of this tuning parameter if the
947 stripe size is not set in the ext4 superblock
948 max_to_scan The maximum number of extents the multiblock allocator
949 will search to find the best extent
950 min_to_scan The minimum number of extents the multiblock allocator
951 will search to find the best extent
952 order2_req Tuning parameter which controls the minimum size for
953 requests (as a power of 2) where the buddy cache is
954 used
955 stream_req Files which have fewer blocks than this tunable
956 parameter will have their blocks allocated out of a
957 block group specific preallocation pool, so that small
958 files are packed closely together. Each large file
959 will have its blocks allocated out of its own unique
960 preallocation pool.
961inode_readahead Tuning parameter which controls the maximum number of
962 inode table blocks that ext4's inode table readahead
963 algorithm will pre-read into the buffer cache
964.............................................................................. 936..............................................................................
965 937
966 938
@@ -1011,1021 +983,24 @@ review the kernel documentation in the directory /usr/src/linux/Documentation.
1011This chapter is heavily based on the documentation included in the pre 2.2 983This chapter is heavily based on the documentation included in the pre 2.2
1012kernels, and became part of it in version 2.2.1 of the Linux kernel. 984kernels, and became part of it in version 2.2.1 of the Linux kernel.
1013 985
10142.1 /proc/sys/fs - File system data 986Please see: Documentation/sysctls/ directory for descriptions of these
1015-----------------------------------
1016
1017This subdirectory contains specific file system, file handle, inode, dentry
1018and quota information.
1019
1020Currently, these files are in /proc/sys/fs:
1021
1022dentry-state
1023------------
1024
1025Status of the directory cache. Since directory entries are dynamically
1026allocated and deallocated, this file indicates the current status. It holds
1027six values, in which the last two are not used and are always zero. The others
1028are listed in table 2-1.
1029
1030
1031Table 2-1: Status files of the directory cache
1032..............................................................................
1033 File Content
1034 nr_dentry Almost always zero
1035 nr_unused Number of unused cache entries
1036 age_limit
1037 in seconds after the entry may be reclaimed, when memory is short
1038 want_pages internally
1039..............................................................................
1040
1041dquot-nr and dquot-max
1042----------------------
1043
1044The file dquot-max shows the maximum number of cached disk quota entries.
1045
1046The file dquot-nr shows the number of allocated disk quota entries and the
1047number of free disk quota entries.
1048
1049If the number of available cached disk quotas is very low and you have a large
1050number of simultaneous system users, you might want to raise the limit.
1051
1052file-nr and file-max
1053--------------------
1054
1055The kernel allocates file handles dynamically, but doesn't free them again at
1056this time.
1057
1058The value in file-max denotes the maximum number of file handles that the
1059Linux kernel will allocate. When you get a lot of error messages about running
1060out of file handles, you might want to raise this limit. The default value is
106110% of RAM in kilobytes. To change it, just write the new number into the
1062file:
1063
1064 # cat /proc/sys/fs/file-max
1065 4096
1066 # echo 8192 > /proc/sys/fs/file-max
1067 # cat /proc/sys/fs/file-max
1068 8192
1069
1070
1071This method of revision is useful for all customizable parameters of the
1072kernel - simply echo the new value to the corresponding file.
1073
1074Historically, the three values in file-nr denoted the number of allocated file
1075handles, the number of allocated but unused file handles, and the maximum
1076number of file handles. Linux 2.6 always reports 0 as the number of free file
1077handles -- this is not an error, it just means that the number of allocated
1078file handles exactly matches the number of used file handles.
1079
1080Attempts to allocate more file descriptors than file-max are reported with
1081printk, look for "VFS: file-max limit <number> reached".
1082
1083inode-state and inode-nr
1084------------------------
1085
1086The file inode-nr contains the first two items from inode-state, so we'll skip
1087to that file...
1088
1089inode-state contains two actual numbers and five dummy values. The numbers
1090are nr_inodes and nr_free_inodes (in order of appearance).
1091
1092nr_inodes
1093~~~~~~~~~
1094
1095Denotes the number of inodes the system has allocated. This number will
1096grow and shrink dynamically.
1097
1098nr_open
1099-------
1100
1101Denotes the maximum number of file-handles a process can
1102allocate. Default value is 1024*1024 (1048576) which should be
1103enough for most machines. Actual limit depends on RLIMIT_NOFILE
1104resource limit.
1105
1106nr_free_inodes
1107--------------
1108
1109Represents the number of free inodes. Ie. The number of inuse inodes is
1110(nr_inodes - nr_free_inodes).
1111
1112aio-nr and aio-max-nr
1113---------------------
1114
1115aio-nr is the running total of the number of events specified on the
1116io_setup system call for all currently active aio contexts. If aio-nr
1117reaches aio-max-nr then io_setup will fail with EAGAIN. Note that
1118raising aio-max-nr does not result in the pre-allocation or re-sizing
1119of any kernel data structures.
1120
11212.2 /proc/sys/fs/binfmt_misc - Miscellaneous binary formats
1122-----------------------------------------------------------
1123
1124Besides these files, there is the subdirectory /proc/sys/fs/binfmt_misc. This
1125handles the kernel support for miscellaneous binary formats.
1126
1127Binfmt_misc provides the ability to register additional binary formats to the
1128Kernel without compiling an additional module/kernel. Therefore, binfmt_misc
1129needs to know magic numbers at the beginning or the filename extension of the
1130binary.
1131
1132It works by maintaining a linked list of structs that contain a description of
1133a binary format, including a magic with size (or the filename extension),
1134offset and mask, and the interpreter name. On request it invokes the given
1135interpreter with the original program as argument, as binfmt_java and
1136binfmt_em86 and binfmt_mz do. Since binfmt_misc does not define any default
1137binary-formats, you have to register an additional binary-format.
1138
1139There are two general files in binfmt_misc and one file per registered format.
1140The two general files are register and status.
1141
1142Registering a new binary format
1143-------------------------------
1144
1145To register a new binary format you have to issue the command
1146
1147 echo :name:type:offset:magic:mask:interpreter: > /proc/sys/fs/binfmt_misc/register
1148
1149
1150
1151with appropriate name (the name for the /proc-dir entry), offset (defaults to
11520, if omitted), magic, mask (which can be omitted, defaults to all 0xff) and
1153last but not least, the interpreter that is to be invoked (for example and
1154testing /bin/echo). Type can be M for usual magic matching or E for filename
1155extension matching (give extension in place of magic).
1156
1157Check or reset the status of the binary format handler
1158------------------------------------------------------
1159
1160If you do a cat on the file /proc/sys/fs/binfmt_misc/status, you will get the
1161current status (enabled/disabled) of binfmt_misc. Change the status by echoing
11620 (disables) or 1 (enables) or -1 (caution: this clears all previously
1163registered binary formats) to status. For example echo 0 > status to disable
1164binfmt_misc (temporarily).
1165
1166Status of a single handler
1167--------------------------
1168
1169Each registered handler has an entry in /proc/sys/fs/binfmt_misc. These files
1170perform the same function as status, but their scope is limited to the actual
1171binary format. By cating this file, you also receive all related information
1172about the interpreter/magic of the binfmt.
1173
1174Example usage of binfmt_misc (emulate binfmt_java)
1175--------------------------------------------------
1176
1177 cd /proc/sys/fs/binfmt_misc
1178 echo ':Java:M::\xca\xfe\xba\xbe::/usr/local/java/bin/javawrapper:' > register
1179 echo ':HTML:E::html::/usr/local/java/bin/appletviewer:' > register
1180 echo ':Applet:M::<!--applet::/usr/local/java/bin/appletviewer:' > register
1181 echo ':DEXE:M::\x0eDEX::/usr/bin/dosexec:' > register
1182
1183
1184These four lines add support for Java executables and Java applets (like
1185binfmt_java, additionally recognizing the .html extension with no need to put
1186<!--applet> to every applet file). You have to install the JDK and the
1187shell-script /usr/local/java/bin/javawrapper too. It works around the
1188brokenness of the Java filename handling. To add a Java binary, just create a
1189link to the class-file somewhere in the path.
1190
11912.3 /proc/sys/kernel - general kernel parameters
1192------------------------------------------------
1193
1194This directory reflects general kernel behaviors. As I've said before, the
1195contents depend on your configuration. Here you'll find the most important
1196files, along with descriptions of what they mean and how to use them.
1197
1198acct
1199----
1200
1201The file contains three values; highwater, lowwater, and frequency.
1202
1203It exists only when BSD-style process accounting is enabled. These values
1204control its behavior. If the free space on the file system where the log lives
1205goes below lowwater percentage, accounting suspends. If it goes above
1206highwater percentage, accounting resumes. Frequency determines how often you
1207check the amount of free space (value is in seconds). Default settings are: 4,
12082, and 30. That is, suspend accounting if there is less than 2 percent free;
1209resume it if we have a value of 3 or more percent; consider information about
1210the amount of free space valid for 30 seconds
1211
1212ctrl-alt-del
1213------------
1214
1215When the value in this file is 0, ctrl-alt-del is trapped and sent to the init
1216program to handle a graceful restart. However, when the value is greater that
1217zero, Linux's reaction to this key combination will be an immediate reboot,
1218without syncing its dirty buffers.
1219
1220[NOTE]
1221 When a program (like dosemu) has the keyboard in raw mode, the
1222 ctrl-alt-del is intercepted by the program before it ever reaches the
1223 kernel tty layer, and it is up to the program to decide what to do with
1224 it.
1225
1226domainname and hostname
1227-----------------------
1228
1229These files can be controlled to set the NIS domainname and hostname of your
1230box. For the classic darkstar.frop.org a simple:
1231
1232 # echo "darkstar" > /proc/sys/kernel/hostname
1233 # echo "frop.org" > /proc/sys/kernel/domainname
1234
1235
1236would suffice to set your hostname and NIS domainname.
1237
1238osrelease, ostype and version
1239-----------------------------
1240
1241The names make it pretty obvious what these fields contain:
1242
1243 > cat /proc/sys/kernel/osrelease
1244 2.2.12
1245
1246 > cat /proc/sys/kernel/ostype
1247 Linux
1248
1249 > cat /proc/sys/kernel/version
1250 #4 Fri Oct 1 12:41:14 PDT 1999
1251
1252
1253The files osrelease and ostype should be clear enough. Version needs a little
1254more clarification. The #4 means that this is the 4th kernel built from this
1255source base and the date after it indicates the time the kernel was built. The
1256only way to tune these values is to rebuild the kernel.
1257
1258panic
1259-----
1260
1261The value in this file represents the number of seconds the kernel waits
1262before rebooting on a panic. When you use the software watchdog, the
1263recommended setting is 60. If set to 0, the auto reboot after a kernel panic
1264is disabled, which is the default setting.
1265
1266printk
1267------
1268
1269The four values in printk denote
1270* console_loglevel,
1271* default_message_loglevel,
1272* minimum_console_loglevel and
1273* default_console_loglevel
1274respectively.
1275
1276These values influence printk() behavior when printing or logging error
1277messages, which come from inside the kernel. See syslog(2) for more
1278information on the different log levels.
1279
1280console_loglevel
1281----------------
1282
1283Messages with a higher priority than this will be printed to the console.
1284
1285default_message_level
1286---------------------
1287
1288Messages without an explicit priority will be printed with this priority.
1289
1290minimum_console_loglevel
1291------------------------
1292
1293Minimum (highest) value to which the console_loglevel can be set.
1294
1295default_console_loglevel
1296------------------------
1297
1298Default value for console_loglevel.
1299
1300sg-big-buff
1301-----------
1302
1303This file shows the size of the generic SCSI (sg) buffer. At this point, you
1304can't tune it yet, but you can change it at compile time by editing
1305include/scsi/sg.h and changing the value of SG_BIG_BUFF.
1306
1307If you use a scanner with SANE (Scanner Access Now Easy) you might want to set
1308this to a higher value. Refer to the SANE documentation on this issue.
1309
1310modprobe
1311--------
1312
1313The location where the modprobe binary is located. The kernel uses this
1314program to load modules on demand.
1315
1316unknown_nmi_panic
1317-----------------
1318
1319The value in this file affects behavior of handling NMI. When the value is
1320non-zero, unknown NMI is trapped and then panic occurs. At that time, kernel
1321debugging information is displayed on console.
1322
1323NMI switch that most IA32 servers have fires unknown NMI up, for example.
1324If a system hangs up, try pressing the NMI switch.
1325
1326panic_on_unrecovered_nmi
1327------------------------
1328
1329The default Linux behaviour on an NMI of either memory or unknown is to continue
1330operation. For many environments such as scientific computing it is preferable
1331that the box is taken out and the error dealt with than an uncorrected
1332parity/ECC error get propogated.
1333
1334A small number of systems do generate NMI's for bizarre random reasons such as
1335power management so the default is off. That sysctl works like the existing
1336panic controls already in that directory.
1337
1338nmi_watchdog
1339------------
1340
1341Enables/Disables the NMI watchdog on x86 systems. When the value is non-zero
1342the NMI watchdog is enabled and will continuously test all online cpus to
1343determine whether or not they are still functioning properly. Currently,
1344passing "nmi_watchdog=" parameter at boot time is required for this function
1345to work.
1346
1347If LAPIC NMI watchdog method is in use (nmi_watchdog=2 kernel parameter), the
1348NMI watchdog shares registers with oprofile. By disabling the NMI watchdog,
1349oprofile may have more registers to utilize.
1350
1351msgmni
1352------
1353
1354Maximum number of message queue ids on the system.
1355This value scales to the amount of lowmem. It is automatically recomputed
1356upon memory add/remove or ipc namespace creation/removal.
1357When a value is written into this file, msgmni's value becomes fixed, i.e. it
1358is not recomputed anymore when one of the above events occurs.
1359Use auto_msgmni to change this behavior.
1360
1361auto_msgmni
1362-----------
1363
1364Enables/Disables automatic recomputing of msgmni upon memory add/remove or
1365upon ipc namespace creation/removal (see the msgmni description above).
1366Echoing "1" into this file enables msgmni automatic recomputing.
1367Echoing "0" turns it off.
1368auto_msgmni default value is 1.
1369
1370
13712.4 /proc/sys/vm - The virtual memory subsystem
1372-----------------------------------------------
1373
1374Please see: Documentation/sysctls/vm.txt for a description of these
1375entries. 987entries.
1376 988
989------------------------------------------------------------------------------
990Summary
991------------------------------------------------------------------------------
992Certain aspects of kernel behavior can be modified at runtime, without the
993need to recompile the kernel, or even to reboot the system. The files in the
994/proc/sys tree can not only be read, but also modified. You can use the echo
995command to write value into these files, thereby changing the default settings
996of the kernel.
997------------------------------------------------------------------------------
1377 998
13782.5 /proc/sys/dev - Device specific parameters 999------------------------------------------------------------------------------
1379---------------------------------------------- 1000CHAPTER 3: PER-PROCESS PARAMETERS
1380 1001------------------------------------------------------------------------------
1381Currently there is only support for CDROM drives, and for those, there is only
1382one read-only file containing information about the CD-ROM drives attached to
1383the system:
1384
1385 >cat /proc/sys/dev/cdrom/info
1386 CD-ROM information, Id: cdrom.c 2.55 1999/04/25
1387
1388 drive name: sr0 hdb
1389 drive speed: 32 40
1390 drive # of slots: 1 0
1391 Can close tray: 1 1
1392 Can open tray: 1 1
1393 Can lock tray: 1 1
1394 Can change speed: 1 1
1395 Can select disk: 0 1
1396 Can read multisession: 1 1
1397 Can read MCN: 1 1
1398 Reports media changed: 1 1
1399 Can play audio: 1 1
1400
1401
1402You see two drives, sr0 and hdb, along with a list of their features.
1403
14042.6 /proc/sys/sunrpc - Remote procedure calls
1405---------------------------------------------
1406
1407This directory contains four files, which enable or disable debugging for the
1408RPC functions NFS, NFS-daemon, RPC and NLM. The default values are 0. They can
1409be set to one to turn debugging on. (The default value is 0 for each)
1410
14112.7 /proc/sys/net - Networking stuff
1412------------------------------------
1413
1414The interface to the networking parts of the kernel is located in
1415/proc/sys/net. Table 2-3 shows all possible subdirectories. You may see only
1416some of them, depending on your kernel's configuration.
1417
1418
1419Table 2-3: Subdirectories in /proc/sys/net
1420..............................................................................
1421 Directory Content Directory Content
1422 core General parameter appletalk Appletalk protocol
1423 unix Unix domain sockets netrom NET/ROM
1424 802 E802 protocol ax25 AX25
1425 ethernet Ethernet protocol rose X.25 PLP layer
1426 ipv4 IP version 4 x25 X.25 protocol
1427 ipx IPX token-ring IBM token ring
1428 bridge Bridging decnet DEC net
1429 ipv6 IP version 6
1430..............................................................................
1431
1432We will concentrate on IP networking here. Since AX25, X.25, and DEC Net are
1433only minor players in the Linux world, we'll skip them in this chapter. You'll
1434find some short info on Appletalk and IPX further on in this chapter. Review
1435the online documentation and the kernel source to get a detailed view of the
1436parameters for those protocols. In this section we'll discuss the
1437subdirectories printed in bold letters in the table above. As default values
1438are suitable for most needs, there is no need to change these values.
1439
1440/proc/sys/net/core - Network core options
1441-----------------------------------------
1442
1443rmem_default
1444------------
1445
1446The default setting of the socket receive buffer in bytes.
1447
1448rmem_max
1449--------
1450
1451The maximum receive socket buffer size in bytes.
1452
1453wmem_default
1454------------
1455
1456The default setting (in bytes) of the socket send buffer.
1457
1458wmem_max
1459--------
1460
1461The maximum send socket buffer size in bytes.
1462
1463message_burst and message_cost
1464------------------------------
1465
1466These parameters are used to limit the warning messages written to the kernel
1467log from the networking code. They enforce a rate limit to make a
1468denial-of-service attack impossible. A higher message_cost factor results in
1469fewer messages that will be written. Message_burst controls when messages will
1470be dropped. The default settings limit warning messages to one every five
1471seconds.
1472
1473warnings
1474--------
1475
1476This controls console messages from the networking stack that can occur because
1477of problems on the network like duplicate address or bad checksums. Normally,
1478this should be enabled, but if the problem persists the messages can be
1479disabled.
1480
1481netdev_budget
1482-------------
1483
1484Maximum number of packets taken from all interfaces in one polling cycle (NAPI
1485poll). In one polling cycle interfaces which are registered to polling are
1486probed in a round-robin manner. The limit of packets in one such probe can be
1487set per-device via sysfs class/net/<device>/weight .
1488
1489netdev_max_backlog
1490------------------
1491
1492Maximum number of packets, queued on the INPUT side, when the interface
1493receives packets faster than kernel can process them.
1494
1495optmem_max
1496----------
1497
1498Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence
1499of struct cmsghdr structures with appended data.
1500
1501/proc/sys/net/unix - Parameters for Unix domain sockets
1502-------------------------------------------------------
1503
1504There are only two files in this subdirectory. They control the delays for
1505deleting and destroying socket descriptors.
1506
15072.8 /proc/sys/net/ipv4 - IPV4 settings
1508--------------------------------------
1509
1510IP version 4 is still the most used protocol in Unix networking. It will be
1511replaced by IP version 6 in the next couple of years, but for the moment it's
1512the de facto standard for the internet and is used in most networking
1513environments around the world. Because of the importance of this protocol,
1514we'll have a deeper look into the subtree controlling the behavior of the IPv4
1515subsystem of the Linux kernel.
1516
1517Let's start with the entries in /proc/sys/net/ipv4.
1518
1519ICMP settings
1520-------------
1521
1522icmp_echo_ignore_all and icmp_echo_ignore_broadcasts
1523----------------------------------------------------
1524
1525Turn on (1) or off (0), if the kernel should ignore all ICMP ECHO requests, or
1526just those to broadcast and multicast addresses.
1527
1528Please note that if you accept ICMP echo requests with a broadcast/multicast
1529destination address your network may be used as an exploder for denial of
1530service packet flooding attacks to other hosts.
1531
1532icmp_destunreach_rate, icmp_echoreply_rate, icmp_paramprob_rate and icmp_timeexceed_rate
1533----------------------------------------------------------------------------------------
1534
1535Sets limits for sending ICMP packets to specific targets. A value of zero
1536disables all limiting. Any positive value sets the maximum packet rate in
1537hundredths of a second (on Intel systems).
1538
1539IP settings
1540-----------
1541
1542ip_autoconfig
1543-------------
1544
1545This file contains the number one if the host received its IP configuration by
1546RARP, BOOTP, DHCP or a similar mechanism. Otherwise it is zero.
1547
1548ip_default_ttl
1549--------------
1550
1551TTL (Time To Live) for IPv4 interfaces. This is simply the maximum number of
1552hops a packet may travel.
1553
1554ip_dynaddr
1555----------
1556
1557Enable dynamic socket address rewriting on interface address change. This is
1558useful for dialup interface with changing IP addresses.
1559
1560ip_forward
1561----------
1562
1563Enable or disable forwarding of IP packets between interfaces. Changing this
1564value resets all other parameters to their default values. They differ if the
1565kernel is configured as host or router.
1566
1567ip_local_port_range
1568-------------------
1569
1570Range of ports used by TCP and UDP to choose the local port. Contains two
1571numbers, the first number is the lowest port, the second number the highest
1572local port. Default is 1024-4999. Should be changed to 32768-61000 for
1573high-usage systems.
1574
1575ip_no_pmtu_disc
1576---------------
1577
1578Global switch to turn path MTU discovery off. It can also be set on a per
1579socket basis by the applications or on a per route basis.
1580
1581ip_masq_debug
1582-------------
1583
1584Enable/disable debugging of IP masquerading.
1585
1586IP fragmentation settings
1587-------------------------
1588
1589ipfrag_high_thresh and ipfrag_low_thresh
1590----------------------------------------
1591
1592Maximum memory used to reassemble IP fragments. When ipfrag_high_thresh bytes
1593of memory is allocated for this purpose, the fragment handler will toss
1594packets until ipfrag_low_thresh is reached.
1595
1596ipfrag_time
1597-----------
1598
1599Time in seconds to keep an IP fragment in memory.
1600
1601TCP settings
1602------------
1603
1604tcp_ecn
1605-------
1606
1607This file controls the use of the ECN bit in the IPv4 headers. This is a new
1608feature about Explicit Congestion Notification, but some routers and firewalls
1609block traffic that has this bit set, so it could be necessary to echo 0 to
1610/proc/sys/net/ipv4/tcp_ecn if you want to talk to these sites. For more info
1611you could read RFC2481.
1612
1613tcp_retrans_collapse
1614--------------------
1615
1616Bug-to-bug compatibility with some broken printers. On retransmit, try to send
1617larger packets to work around bugs in certain TCP stacks. Can be turned off by
1618setting it to zero.
1619
1620tcp_keepalive_probes
1621--------------------
1622
1623Number of keep alive probes TCP sends out, until it decides that the
1624connection is broken.
1625
1626tcp_keepalive_time
1627------------------
1628
1629How often TCP sends out keep alive messages, when keep alive is enabled. The
1630default is 2 hours.
1631
1632tcp_syn_retries
1633---------------
1634
1635Number of times initial SYNs for a TCP connection attempt will be
1636retransmitted. Should not be higher than 255. This is only the timeout for
1637outgoing connections, for incoming connections the number of retransmits is
1638defined by tcp_retries1.
1639
1640tcp_sack
1641--------
1642
1643Enable selective acknowledgments as defined in RFC2018.
1644
1645tcp_timestamps
1646--------------
1647
1648Enable timestamps as defined in RFC1323.
1649
1650tcp_stdurg
1651----------
1652
1653Enable the strict RFC793 interpretation of the TCP urgent pointer field. The
1654default is to use the BSD compatible interpretation of the urgent pointer
1655pointing to the first byte after the urgent data. The RFC793 interpretation is
1656to have it point to the last byte of urgent data. Enabling this option may
1657lead to interoperability problems. Disabled by default.
1658
1659tcp_syncookies
1660--------------
1661
1662Only valid when the kernel was compiled with CONFIG_SYNCOOKIES. Send out
1663syncookies when the syn backlog queue of a socket overflows. This is to ward
1664off the common 'syn flood attack'. Disabled by default.
1665
1666Note that the concept of a socket backlog is abandoned. This means the peer
1667may not receive reliable error messages from an over loaded server with
1668syncookies enabled.
1669
1670tcp_window_scaling
1671------------------
1672
1673Enable window scaling as defined in RFC1323.
1674
1675tcp_fin_timeout
1676---------------
1677
1678The length of time in seconds it takes to receive a final FIN before the
1679socket is always closed. This is strictly a violation of the TCP
1680specification, but required to prevent denial-of-service attacks.
1681
1682tcp_max_ka_probes
1683-----------------
1684
1685Indicates how many keep alive probes are sent per slow timer run. Should not
1686be set too high to prevent bursts.
1687
1688tcp_max_syn_backlog
1689-------------------
1690
1691Length of the per socket backlog queue. Since Linux 2.2 the backlog specified
1692in listen(2) only specifies the length of the backlog queue of already
1693established sockets. When more connection requests arrive Linux starts to drop
1694packets. When syncookies are enabled the packets are still answered and the
1695maximum queue is effectively ignored.
1696
1697tcp_retries1
1698------------
1699
1700Defines how often an answer to a TCP connection request is retransmitted
1701before giving up.
1702
1703tcp_retries2
1704------------
1705
1706Defines how often a TCP packet is retransmitted before giving up.
1707
1708Interface specific settings
1709---------------------------
1710
1711In the directory /proc/sys/net/ipv4/conf you'll find one subdirectory for each
1712interface the system knows about and one directory called all. Changes in the
1713all subdirectory affect all interfaces, whereas changes in the other
1714subdirectories affect only one interface. All directories have the same
1715entries:
1716
1717accept_redirects
1718----------------
1719
1720This switch decides if the kernel accepts ICMP redirect messages or not. The
1721default is 'yes' if the kernel is configured for a regular host and 'no' for a
1722router configuration.
1723
1724accept_source_route
1725-------------------
1726
1727Should source routed packets be accepted or declined. The default is
1728dependent on the kernel configuration. It's 'yes' for routers and 'no' for
1729hosts.
1730
1731bootp_relay
1732~~~~~~~~~~~
1733
1734Accept packets with source address 0.b.c.d with destinations not to this host
1735as local ones. It is supposed that a BOOTP relay daemon will catch and forward
1736such packets.
1737
1738The default is 0, since this feature is not implemented yet (kernel version
17392.2.12).
1740
1741forwarding
1742----------
1743
1744Enable or disable IP forwarding on this interface.
1745
1746log_martians
1747------------
1748
1749Log packets with source addresses with no known route to kernel log.
1750
1751mc_forwarding
1752-------------
1753
1754Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE and a
1755multicast routing daemon is required.
1756
1757proxy_arp
1758---------
1759
1760Does (1) or does not (0) perform proxy ARP.
1761
1762rp_filter
1763---------
1764
1765Integer value determines if a source validation should be made. 1 means yes, 0
1766means no. Disabled by default, but local/broadcast address spoofing is always
1767on.
1768
1769If you set this to 1 on a router that is the only connection for a network to
1770the net, it will prevent spoofing attacks against your internal networks
1771(external addresses can still be spoofed), without the need for additional
1772firewall rules.
1773
1774secure_redirects
1775----------------
1776
1777Accept ICMP redirect messages only for gateways, listed in default gateway
1778list. Enabled by default.
1779
1780shared_media
1781------------
1782
1783If it is not set the kernel does not assume that different subnets on this
1784device can communicate directly. Default setting is 'yes'.
1785
1786send_redirects
1787--------------
1788
1789Determines whether to send ICMP redirects to other hosts.
1790
1791Routing settings
1792----------------
1793
1794The directory /proc/sys/net/ipv4/route contains several files to control
1795routing issues.
1796
1797error_burst and error_cost
1798--------------------------
1799
1800These parameters are used to limit how many ICMP destination unreachable to
1801send from the host in question. ICMP destination unreachable messages are
1802sent when we cannot reach the next hop while trying to transmit a packet.
1803It will also print some error messages to kernel logs if someone is ignoring
1804our ICMP redirects. The higher the error_cost factor is, the fewer
1805destination unreachable and error messages will be let through. Error_burst
1806controls when destination unreachable messages and error messages will be
1807dropped. The default settings limit warning messages to five every second.
1808
1809flush
1810-----
1811
1812Writing to this file results in a flush of the routing cache.
1813
1814gc_elasticity, gc_interval, gc_min_interval_ms, gc_timeout, gc_thresh
1815---------------------------------------------------------------------
1816
1817Values to control the frequency and behavior of the garbage collection
1818algorithm for the routing cache. gc_min_interval is deprecated and replaced
1819by gc_min_interval_ms.
1820
1821
1822max_size
1823--------
1824
1825Maximum size of the routing cache. Old entries will be purged once the cache
1826has reached this size.
1827
1828redirect_load, redirect_number
1829------------------------------
1830
1831Factors which determine if more ICMP redirects should be sent to a specific
1832host. No redirects will be sent once the load limit or the maximum number of
1833redirects has been reached.
1834
1835redirect_silence
1836----------------
1837
1838Timeout for redirects. After this period redirects will be sent again, even if
1839this has been stopped, because the load or number limit has been reached.
1840
1841Network Neighbor handling
1842-------------------------
1843
1844Settings about how to handle connections with direct neighbors (nodes attached
1845to the same link) can be found in the directory /proc/sys/net/ipv4/neigh.
1846
1847As we saw it in the conf directory, there is a default subdirectory which
1848holds the default values, and one directory for each interface. The contents
1849of the directories are identical, with the single exception that the default
1850settings contain additional options to set garbage collection parameters.
1851
1852In the interface directories you'll find the following entries:
1853
1854base_reachable_time, base_reachable_time_ms
1855-------------------------------------------
1856
1857A base value used for computing the random reachable time value as specified
1858in RFC2461.
1859
1860Expression of base_reachable_time, which is deprecated, is in seconds.
1861Expression of base_reachable_time_ms is in milliseconds.
1862
1863retrans_time, retrans_time_ms
1864-----------------------------
1865
1866The time between retransmitted Neighbor Solicitation messages.
1867Used for address resolution and to determine if a neighbor is
1868unreachable.
1869
1870Expression of retrans_time, which is deprecated, is in 1/100 seconds (for
1871IPv4) or in jiffies (for IPv6).
1872Expression of retrans_time_ms is in milliseconds.
1873
1874unres_qlen
1875----------
1876
1877Maximum queue length for a pending arp request - the number of packets which
1878are accepted from other layers while the ARP address is still being resolved.
1879
1880anycast_delay
1881-------------
1882
1883Maximum for random delay of answers to neighbor solicitation messages in
1884jiffies (1/100 sec). Not yet implemented (Linux does not have anycast support
1885yet).
1886
1887ucast_solicit
1888-------------
1889
1890Maximum number of retries for unicast solicitation.
1891
1892mcast_solicit
1893-------------
1894
1895Maximum number of retries for multicast solicitation.
1896
1897delay_first_probe_time
1898----------------------
1899
1900Delay for the first time probe if the neighbor is reachable. (see
1901gc_stale_time)
1902
1903locktime
1904--------
1905
1906An ARP/neighbor entry is only replaced with a new one if the old is at least
1907locktime old. This prevents ARP cache thrashing.
1908
1909proxy_delay
1910-----------
1911
1912Maximum time (real time is random [0..proxytime]) before answering to an ARP
1913request for which we have a proxy ARP entry. In some cases, this is used to
1914prevent network flooding.
1915
1916proxy_qlen
1917----------
1918
1919Maximum queue length of the delayed proxy arp timer. (see proxy_delay).
1920
1921app_solicit
1922-----------
1923
1924Determines the number of requests to send to the user level ARP daemon. Use 0
1925to turn off.
1926
1927gc_stale_time
1928-------------
1929
1930Determines how often to check for stale ARP entries. After an ARP entry is
1931stale it will be resolved again (which is useful when an IP address migrates
1932to another machine). When ucast_solicit is greater than 0 it first tries to
1933send an ARP packet directly to the known host. When that fails and
1934mcast_solicit is greater than 0, an ARP request is broadcasted.
1935
19362.9 Appletalk
1937-------------
1938
1939The /proc/sys/net/appletalk directory holds the Appletalk configuration data
1940when Appletalk is loaded. The configurable parameters are:
1941
1942aarp-expiry-time
1943----------------
1944
1945The amount of time we keep an ARP entry before expiring it. Used to age out
1946old hosts.
1947
1948aarp-resolve-time
1949-----------------
1950
1951The amount of time we will spend trying to resolve an Appletalk address.
1952
1953aarp-retransmit-limit
1954---------------------
1955
1956The number of times we will retransmit a query before giving up.
1957
1958aarp-tick-time
1959--------------
1960
1961Controls the rate at which expires are checked.
1962
1963The directory /proc/net/appletalk holds the list of active Appletalk sockets
1964on a machine.
1965
1966The fields indicate the DDP type, the local address (in network:node format)
1967the remote address, the size of the transmit pending queue, the size of the
1968received queue (bytes waiting for applications to read) the state and the uid
1969owning the socket.
1970
1971/proc/net/atalk_iface lists all the interfaces configured for appletalk. It
1972shows the name of the interface, its Appletalk address, the network range on
1973that address (or network number for phase 1 networks), and the status of the
1974interface.
1975
1976/proc/net/atalk_route lists each known network route. It lists the target
1977(network) that the route leads to, the router (may be directly connected), the
1978route flags, and the device the route is using.
1979
19802.10 IPX
1981--------
1982
1983The IPX protocol has no tunable values in proc/sys/net.
1984
1985The IPX protocol does, however, provide proc/net/ipx. This lists each IPX
1986socket giving the local and remote addresses in Novell format (that is
1987network:node:port). In accordance with the strange Novell tradition,
1988everything but the port is in hex. Not_Connected is displayed for sockets that
1989are not tied to a specific remote address. The Tx and Rx queue sizes indicate
1990the number of bytes pending for transmission and reception. The state
1991indicates the state the socket is in and the uid is the owning uid of the
1992socket.
1993
1994The /proc/net/ipx_interface file lists all IPX interfaces. For each interface
1995it gives the network number, the node number, and indicates if the network is
1996the primary network. It also indicates which device it is bound to (or
1997Internal for internal networks) and the Frame Type if appropriate. Linux
1998supports 802.3, 802.2, 802.2 SNAP and DIX (Blue Book) ethernet framing for
1999IPX.
2000
2001The /proc/net/ipx_route table holds a list of IPX routes. For each route it
2002gives the destination network, the router node (or Directly) and the network
2003address of the router (or Connected) for internal networks.
2004
20052.11 /proc/sys/fs/mqueue - POSIX message queues filesystem
2006----------------------------------------------------------
2007
2008The "mqueue" filesystem provides the necessary kernel features to enable the
2009creation of a user space library that implements the POSIX message queues
2010API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System
2011Interfaces specification.)
2012
2013The "mqueue" filesystem contains values for determining/setting the amount of
2014resources used by the file system.
2015
2016/proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the
2017maximum number of message queues allowed on the system.
2018
2019/proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the
2020maximum number of messages in a queue value. In fact it is the limiting value
2021for another (user) limit which is set in mq_open invocation. This attribute of
2022a queue must be less than or equal to msg_max.
2023
2024/proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the
2025maximum message size value (it is every message queue's attribute set during
2026its creation).
2027 1002
20282.12 /proc/<pid>/oom_adj - Adjust the oom-killer score 10033.1 /proc/<pid>/oom_adj - Adjust the oom-killer score
2029------------------------------------------------------ 1004------------------------------------------------------
2030 1005
2031This file can be used to adjust the score used to select which processes 1006This file can be used to adjust the score used to select which processes
@@ -2062,25 +1037,15 @@ The task with the highest badness score is then selected and its children
2062are killed, process itself will be killed in an OOM situation when it does 1037are killed, process itself will be killed in an OOM situation when it does
2063not have children or some of them disabled oom like described above. 1038not have children or some of them disabled oom like described above.
2064 1039
20652.13 /proc/<pid>/oom_score - Display current oom-killer score 10403.2 /proc/<pid>/oom_score - Display current oom-killer score
2066------------------------------------------------------------- 1041-------------------------------------------------------------
2067 1042
2068------------------------------------------------------------------------------
2069This file can be used to check the current score used by the oom-killer for 1043This file can be used to check the current score used by the oom-killer for
2070any given <pid>. Use it together with /proc/<pid>/oom_adj to tune which 1044any given <pid>. Use it together with /proc/<pid>/oom_adj to tune which
2071process should be killed in an out-of-memory situation. 1045process should be killed in an out-of-memory situation.
2072 1046
2073------------------------------------------------------------------------------
2074Summary
2075------------------------------------------------------------------------------
2076Certain aspects of kernel behavior can be modified at runtime, without the
2077need to recompile the kernel, or even to reboot the system. The files in the
2078/proc/sys tree can not only be read, but also modified. You can use the echo
2079command to write value into these files, thereby changing the default settings
2080of the kernel.
2081------------------------------------------------------------------------------
2082 1047
20832.14 /proc/<pid>/io - Display the IO accounting fields 10483.3 /proc/<pid>/io - Display the IO accounting fields
2084------------------------------------------------------- 1049-------------------------------------------------------
2085 1050
2086This file contains IO statistics for each running process 1051This file contains IO statistics for each running process
@@ -2182,7 +1147,7 @@ those 64-bit counters, process A could see an intermediate result.
2182More information about this can be found within the taskstats documentation in 1147More information about this can be found within the taskstats documentation in
2183Documentation/accounting. 1148Documentation/accounting.
2184 1149
21852.15 /proc/<pid>/coredump_filter - Core dump filtering settings 11503.4 /proc/<pid>/coredump_filter - Core dump filtering settings
2186--------------------------------------------------------------- 1151---------------------------------------------------------------
2187When a process is dumped, all anonymous memory is written to a core file as 1152When a process is dumped, all anonymous memory is written to a core file as
2188long as the size of the core file isn't limited. But sometimes we don't want 1153long as the size of the core file isn't limited. But sometimes we don't want
@@ -2226,7 +1191,7 @@ For example:
2226 $ echo 0x7 > /proc/self/coredump_filter 1191 $ echo 0x7 > /proc/self/coredump_filter
2227 $ ./some_program 1192 $ ./some_program
2228 1193
22292.16 /proc/<pid>/mountinfo - Information about mounts 11943.5 /proc/<pid>/mountinfo - Information about mounts
2230-------------------------------------------------------- 1195--------------------------------------------------------
2231 1196
2232This file contains lines of the form: 1197This file contains lines of the form:
@@ -2263,30 +1228,3 @@ For more information on mount propagation see:
2263 1228
2264 Documentation/filesystems/sharedsubtree.txt 1229 Documentation/filesystems/sharedsubtree.txt
2265 1230
22662.17 /proc/sys/fs/epoll - Configuration options for the epoll interface
2267--------------------------------------------------------
2268
2269This directory contains configuration options for the epoll(7) interface.
2270
2271max_user_instances
2272------------------
2273
2274This is the maximum number of epoll file descriptors that a single user can
2275have open at a given time. The default value is 128, and should be enough
2276for normal users.
2277
2278max_user_watches
2279----------------
2280
2281Every epoll file descriptor can store a number of files to be monitored
2282for event readiness. Each one of these monitored files constitutes a "watch".
2283This configuration option sets the maximum number of "watches" that are
2284allowed for each user.
2285Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
2286on a 64bit one.
2287The current default value for max_user_watches is the 1/32 of the available
2288low memory, divided for the "watch" cost in bytes.
2289
2290
2291------------------------------------------------------------------------------
2292
diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt
index 9f8740ca3f3b..26e4b8bc53ee 100644
--- a/Documentation/filesystems/sysfs-pci.txt
+++ b/Documentation/filesystems/sysfs-pci.txt
@@ -12,6 +12,7 @@ that support it. For example, a given bus might look like this:
12 | |-- enable 12 | |-- enable
13 | |-- irq 13 | |-- irq
14 | |-- local_cpus 14 | |-- local_cpus
15 | |-- remove
15 | |-- resource 16 | |-- resource
16 | |-- resource0 17 | |-- resource0
17 | |-- resource1 18 | |-- resource1
@@ -36,6 +37,7 @@ files, each with their own function.
36 enable Whether the device is enabled (ascii, rw) 37 enable Whether the device is enabled (ascii, rw)
37 irq IRQ number (ascii, ro) 38 irq IRQ number (ascii, ro)
38 local_cpus nearby CPU mask (cpumask, ro) 39 local_cpus nearby CPU mask (cpumask, ro)
40 remove remove device from kernel's list (ascii, wo)
39 resource PCI resource host addresses (ascii, ro) 41 resource PCI resource host addresses (ascii, ro)
40 resource0..N PCI resource N, if present (binary, mmap) 42 resource0..N PCI resource N, if present (binary, mmap)
41 resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) 43 resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap)
@@ -46,6 +48,7 @@ files, each with their own function.
46 48
47 ro - read only file 49 ro - read only file
48 rw - file is readable and writable 50 rw - file is readable and writable
51 wo - write only file
49 mmap - file is mmapable 52 mmap - file is mmapable
50 ascii - file contains ascii text 53 ascii - file contains ascii text
51 binary - file contains binary data 54 binary - file contains binary data
@@ -73,6 +76,13 @@ that the device must be enabled for a rom read to return data succesfully.
73In the event a driver is not bound to the device, it can be enabled using the 76In the event a driver is not bound to the device, it can be enabled using the
74'enable' file, documented above. 77'enable' file, documented above.
75 78
79The 'remove' file is used to remove the PCI device, by writing a non-zero
80integer to the file. This does not involve any kind of hot-plug functionality,
81e.g. powering off the device. The device is removed from the kernel's list of
82PCI devices, the sysfs directory for it is removed, and the device will be
83removed from any drivers attached to it. Removal of PCI root buses is
84disallowed.
85
76Accessing legacy resources through sysfs 86Accessing legacy resources through sysfs
77---------------------------------------- 87----------------------------------------
78 88
diff --git a/Documentation/filesystems/udf.txt b/Documentation/filesystems/udf.txt
index fde829a756e6..902b95d0ee51 100644
--- a/Documentation/filesystems/udf.txt
+++ b/Documentation/filesystems/udf.txt
@@ -24,6 +24,8 @@ The following mount options are supported:
24 24
25 gid= Set the default group. 25 gid= Set the default group.
26 umask= Set the default umask. 26 umask= Set the default umask.
27 mode= Set the default file permissions.
28 dmode= Set the default directory permissions.
27 uid= Set the default user. 29 uid= Set the default user.
28 bs= Set the block size. 30 bs= Set the block size.
29 unhide Show otherwise hidden files. 31 unhide Show otherwise hidden files.
diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt
index 803b1318b13d..fd9a3e693813 100644
--- a/Documentation/ftrace.txt
+++ b/Documentation/ftrace.txt
@@ -15,31 +15,31 @@ Introduction
15 15
16Ftrace is an internal tracer designed to help out developers and 16Ftrace is an internal tracer designed to help out developers and
17designers of systems to find what is going on inside the kernel. 17designers of systems to find what is going on inside the kernel.
18It can be used for debugging or analyzing latencies and performance 18It can be used for debugging or analyzing latencies and
19issues that take place outside of user-space. 19performance issues that take place outside of user-space.
20 20
21Although ftrace is the function tracer, it also includes an 21Although ftrace is the function tracer, it also includes an
22infrastructure that allows for other types of tracing. Some of the 22infrastructure that allows for other types of tracing. Some of
23tracers that are currently in ftrace include a tracer to trace 23the tracers that are currently in ftrace include a tracer to
24context switches, the time it takes for a high priority task to 24trace context switches, the time it takes for a high priority
25run after it was woken up, the time interrupts are disabled, and 25task to run after it was woken up, the time interrupts are
26more (ftrace allows for tracer plugins, which means that the list of 26disabled, and more (ftrace allows for tracer plugins, which
27tracers can always grow). 27means that the list of tracers can always grow).
28 28
29 29
30The File System 30The File System
31--------------- 31---------------
32 32
33Ftrace uses the debugfs file system to hold the control files as well 33Ftrace uses the debugfs file system to hold the control files as
34as the files to display output. 34well as the files to display output.
35 35
36To mount the debugfs system: 36To mount the debugfs system:
37 37
38 # mkdir /debug 38 # mkdir /debug
39 # mount -t debugfs nodev /debug 39 # mount -t debugfs nodev /debug
40 40
41(Note: it is more common to mount at /sys/kernel/debug, but for simplicity 41( Note: it is more common to mount at /sys/kernel/debug, but for
42 this document will use /debug) 42 simplicity this document will use /debug)
43 43
44That's it! (assuming that you have ftrace configured into your kernel) 44That's it! (assuming that you have ftrace configured into your kernel)
45 45
@@ -50,90 +50,124 @@ of ftrace. Here is a list of some of the key files:
50 50
51 Note: all time values are in microseconds. 51 Note: all time values are in microseconds.
52 52
53 current_tracer: This is used to set or display the current tracer 53 current_tracer:
54 that is configured. 54
55 55 This is used to set or display the current tracer
56 available_tracers: This holds the different types of tracers that 56 that is configured.
57 have been compiled into the kernel. The tracers 57
58 listed here can be configured by echoing their name 58 available_tracers:
59 into current_tracer. 59
60 60 This holds the different types of tracers that
61 tracing_enabled: This sets or displays whether the current_tracer 61 have been compiled into the kernel. The
62 is activated and tracing or not. Echo 0 into this 62 tracers listed here can be configured by
63 file to disable the tracer or 1 to enable it. 63 echoing their name into current_tracer.
64 64
65 trace: This file holds the output of the trace in a human readable 65 tracing_enabled:
66 format (described below). 66
67 67 This sets or displays whether the current_tracer
68 latency_trace: This file shows the same trace but the information 68 is activated and tracing or not. Echo 0 into this
69 is organized more to display possible latencies 69 file to disable the tracer or 1 to enable it.
70 in the system (described below). 70
71 71 trace:
72 trace_pipe: The output is the same as the "trace" file but this 72
73 file is meant to be streamed with live tracing. 73 This file holds the output of the trace in a human
74 Reads from this file will block until new data 74 readable format (described below).
75 is retrieved. Unlike the "trace" and "latency_trace" 75
76 files, this file is a consumer. This means reading 76 latency_trace:
77 from this file causes sequential reads to display 77
78 more current data. Once data is read from this 78 This file shows the same trace but the information
79 file, it is consumed, and will not be read 79 is organized more to display possible latencies
80 again with a sequential read. The "trace" and 80 in the system (described below).
81 "latency_trace" files are static, and if the 81
82 tracer is not adding more data, they will display 82 trace_pipe:
83 the same information every time they are read. 83
84 84 The output is the same as the "trace" file but this
85 trace_options: This file lets the user control the amount of data 85 file is meant to be streamed with live tracing.
86 that is displayed in one of the above output 86 Reads from this file will block until new data
87 files. 87 is retrieved. Unlike the "trace" and "latency_trace"
88 88 files, this file is a consumer. This means reading
89 trace_max_latency: Some of the tracers record the max latency. 89 from this file causes sequential reads to display
90 For example, the time interrupts are disabled. 90 more current data. Once data is read from this
91 This time is saved in this file. The max trace 91 file, it is consumed, and will not be read
92 will also be stored, and displayed by either 92 again with a sequential read. The "trace" and
93 "trace" or "latency_trace". A new max trace will 93 "latency_trace" files are static, and if the
94 only be recorded if the latency is greater than 94 tracer is not adding more data, they will display
95 the value in this file. (in microseconds) 95 the same information every time they are read.
96 96
97 buffer_size_kb: This sets or displays the number of kilobytes each CPU 97 trace_options:
98 buffer can hold. The tracer buffers are the same size 98
99 for each CPU. The displayed number is the size of the 99 This file lets the user control the amount of data
100 CPU buffer and not total size of all buffers. The 100 that is displayed in one of the above output
101 trace buffers are allocated in pages (blocks of memory 101 files.
102 that the kernel uses for allocation, usually 4 KB in size). 102
103 If the last page allocated has room for more bytes 103 tracing_max_latency:
104 than requested, the rest of the page will be used, 104
105 making the actual allocation bigger than requested. 105 Some of the tracers record the max latency.
106 (Note, the size may not be a multiple of the page size due 106 For example, the time interrupts are disabled.
107 to buffer management overhead.) 107 This time is saved in this file. The max trace
108 108 will also be stored, and displayed by either
109 This can only be updated when the current_tracer 109 "trace" or "latency_trace". A new max trace will
110 is set to "nop". 110 only be recorded if the latency is greater than
111 111 the value in this file. (in microseconds)
112 tracing_cpumask: This is a mask that lets the user only trace 112
113 on specified CPUS. The format is a hex string 113 buffer_size_kb:
114 representing the CPUS. 114
115 115 This sets or displays the number of kilobytes each CPU
116 set_ftrace_filter: When dynamic ftrace is configured in (see the 116 buffer can hold. The tracer buffers are the same size
117 section below "dynamic ftrace"), the code is dynamically 117 for each CPU. The displayed number is the size of the
118 modified (code text rewrite) to disable calling of the 118 CPU buffer and not total size of all buffers. The
119 function profiler (mcount). This lets tracing be configured 119 trace buffers are allocated in pages (blocks of memory
120 in with practically no overhead in performance. This also 120 that the kernel uses for allocation, usually 4 KB in size).
121 has a side effect of enabling or disabling specific functions 121 If the last page allocated has room for more bytes
122 to be traced. Echoing names of functions into this file 122 than requested, the rest of the page will be used,
123 will limit the trace to only those functions. 123 making the actual allocation bigger than requested.
124 124 ( Note, the size may not be a multiple of the page size
125 set_ftrace_notrace: This has an effect opposite to that of 125 due to buffer management overhead. )
126 set_ftrace_filter. Any function that is added here will not 126
127 be traced. If a function exists in both set_ftrace_filter 127 This can only be updated when the current_tracer
128 and set_ftrace_notrace, the function will _not_ be traced. 128 is set to "nop".
129 129
130 set_ftrace_pid: Have the function tracer only trace a single thread. 130 tracing_cpumask:
131 131
132 available_filter_functions: This lists the functions that ftrace 132 This is a mask that lets the user only trace
133 has processed and can trace. These are the function 133 on specified CPUS. The format is a hex string
134 names that you can pass to "set_ftrace_filter" or 134 representing the CPUS.
135 "set_ftrace_notrace". (See the section "dynamic ftrace" 135
136 below for more details.) 136 set_ftrace_filter:
137
138 When dynamic ftrace is configured in (see the
139 section below "dynamic ftrace"), the code is dynamically
140 modified (code text rewrite) to disable calling of the
141 function profiler (mcount). This lets tracing be configured
142 in with practically no overhead in performance. This also
143 has a side effect of enabling or disabling specific functions
144 to be traced. Echoing names of functions into this file
145 will limit the trace to only those functions.
146
147 set_ftrace_notrace:
148
149 This has an effect opposite to that of
150 set_ftrace_filter. Any function that is added here will not
151 be traced. If a function exists in both set_ftrace_filter
152 and set_ftrace_notrace, the function will _not_ be traced.
153
154 set_ftrace_pid:
155
156 Have the function tracer only trace a single thread.
157
158 set_graph_function:
159
160 Set a "trigger" function where tracing should start
161 with the function graph tracer (See the section
162 "dynamic ftrace" for more details).
163
164 available_filter_functions:
165
166 This lists the functions that ftrace
167 has processed and can trace. These are the function
168 names that you can pass to "set_ftrace_filter" or
169 "set_ftrace_notrace". (See the section "dynamic ftrace"
170 below for more details.)
137 171
138 172
139The Tracers 173The Tracers
@@ -141,36 +175,66 @@ The Tracers
141 175
142Here is the list of current tracers that may be configured. 176Here is the list of current tracers that may be configured.
143 177
144 function - function tracer that uses mcount to trace all functions. 178 "function"
179
180 Function call tracer to trace all kernel functions.
181
182 "function_graph"
183
184 Similar to the function tracer except that the
185 function tracer probes the functions on their entry
186 whereas the function graph tracer traces on both entry
187 and exit of the functions. It then provides the ability
188 to draw a graph of function calls similar to C code
189 source.
145 190
146 sched_switch - traces the context switches between tasks. 191 "sched_switch"
147 192
148 irqsoff - traces the areas that disable interrupts and saves 193 Traces the context switches and wakeups between tasks.
149 the trace with the longest max latency.
150 See tracing_max_latency. When a new max is recorded,
151 it replaces the old trace. It is best to view this
152 trace via the latency_trace file.
153 194
154 preemptoff - Similar to irqsoff but traces and records the amount of 195 "irqsoff"
155 time for which preemption is disabled.
156 196
157 preemptirqsoff - Similar to irqsoff and preemptoff, but traces and 197 Traces the areas that disable interrupts and saves
158 records the largest time for which irqs and/or preemption 198 the trace with the longest max latency.
159 is disabled. 199 See tracing_max_latency. When a new max is recorded,
200 it replaces the old trace. It is best to view this
201 trace via the latency_trace file.
160 202
161 wakeup - Traces and records the max latency that it takes for 203 "preemptoff"
162 the highest priority task to get scheduled after
163 it has been woken up.
164 204
165 nop - This is not a tracer. To remove all tracers from tracing 205 Similar to irqsoff but traces and records the amount of
166 simply echo "nop" into current_tracer. 206 time for which preemption is disabled.
207
208 "preemptirqsoff"
209
210 Similar to irqsoff and preemptoff, but traces and
211 records the largest time for which irqs and/or preemption
212 is disabled.
213
214 "wakeup"
215
216 Traces and records the max latency that it takes for
217 the highest priority task to get scheduled after
218 it has been woken up.
219
220 "hw-branch-tracer"
221
222 Uses the BTS CPU feature on x86 CPUs to trace all
223 branches executed.
224
225 "nop"
226
227 This is the "trace nothing" tracer. To remove all
228 tracers from tracing simply echo "nop" into
229 current_tracer.
167 230
168 231
169Examples of using the tracer 232Examples of using the tracer
170---------------------------- 233----------------------------
171 234
172Here are typical examples of using the tracers when controlling them only 235Here are typical examples of using the tracers when controlling
173with the debugfs interface (without using any user-land utilities). 236them only with the debugfs interface (without using any
237user-land utilities).
174 238
175Output format: 239Output format:
176-------------- 240--------------
@@ -187,16 +251,16 @@ Here is an example of the output format of the file "trace"
187 bash-4251 [01] 10152.583855: _atomic_dec_and_lock <-dput 251 bash-4251 [01] 10152.583855: _atomic_dec_and_lock <-dput
188 -------- 252 --------
189 253
190A header is printed with the tracer name that is represented by the trace. 254A header is printed with the tracer name that is represented by
191In this case the tracer is "function". Then a header showing the format. Task 255the trace. In this case the tracer is "function". Then a header
192name "bash", the task PID "4251", the CPU that it was running on 256showing the format. Task name "bash", the task PID "4251", the
193"01", the timestamp in <secs>.<usecs> format, the function name that was 257CPU that it was running on "01", the timestamp in <secs>.<usecs>
194traced "path_put" and the parent function that called this function 258format, the function name that was traced "path_put" and the
195"path_walk". The timestamp is the time at which the function was 259parent function that called this function "path_walk". The
196entered. 260timestamp is the time at which the function was entered.
197 261
198The sched_switch tracer also includes tracing of task wakeups and 262The sched_switch tracer also includes tracing of task wakeups
199context switches. 263and context switches.
200 264
201 ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 2916:115:S 265 ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 2916:115:S
202 ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 10:115:S 266 ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 10:115:S
@@ -205,8 +269,8 @@ context switches.
205 kondemand/1-2916 [01] 1453.070013: 2916:115:S ==> 7:115:R 269 kondemand/1-2916 [01] 1453.070013: 2916:115:S ==> 7:115:R
206 ksoftirqd/1-7 [01] 1453.070013: 7:115:S ==> 0:140:R 270 ksoftirqd/1-7 [01] 1453.070013: 7:115:S ==> 0:140:R
207 271
208Wake ups are represented by a "+" and the context switches are shown as 272Wake ups are represented by a "+" and the context switches are
209"==>". The format is: 273shown as "==>". The format is:
210 274
211 Context switches: 275 Context switches:
212 276
@@ -220,19 +284,20 @@ Wake ups are represented by a "+" and the context switches are shown as
220 284
221 <pid>:<prio>:<state> + <pid>:<prio>:<state> 285 <pid>:<prio>:<state> + <pid>:<prio>:<state>
222 286
223The prio is the internal kernel priority, which is the inverse of the 287The prio is the internal kernel priority, which is the inverse
224priority that is usually displayed by user-space tools. Zero represents 288of the priority that is usually displayed by user-space tools.
225the highest priority (99). Prio 100 starts the "nice" priorities with 289Zero represents the highest priority (99). Prio 100 starts the
226100 being equal to nice -20 and 139 being nice 19. The prio "140" is 290"nice" priorities with 100 being equal to nice -20 and 139 being
227reserved for the idle task which is the lowest priority thread (pid 0). 291nice 19. The prio "140" is reserved for the idle task which is
292the lowest priority thread (pid 0).
228 293
229 294
230Latency trace format 295Latency trace format
231-------------------- 296--------------------
232 297
233For traces that display latency times, the latency_trace file gives 298For traces that display latency times, the latency_trace file
234somewhat more information to see why a latency happened. Here is a typical 299gives somewhat more information to see why a latency happened.
235trace. 300Here is a typical trace.
236 301
237# tracer: irqsoff 302# tracer: irqsoff
238# 303#
@@ -259,20 +324,20 @@ irqsoff latency trace v1.1.5 on 2.6.26-rc8
259 <idle>-0 0d.s1 98us : trace_hardirqs_on (do_softirq) 324 <idle>-0 0d.s1 98us : trace_hardirqs_on (do_softirq)
260 325
261 326
327This shows that the current tracer is "irqsoff" tracing the time
328for which interrupts were disabled. It gives the trace version
329and the version of the kernel upon which this was executed
330(2.6.26-rc8). Then it displays the max latency in microsecs (97
331us). The number of trace entries displayed and the total number
332recorded (both are three: #3/3). The type of preemption that was
333used (PREEMPT). VP, KP, SP, and HP are always zero and are
334reserved for later use. #P is the number of online CPUS (#P:2).
262 335
263This shows that the current tracer is "irqsoff" tracing the time for which 336The task is the process that was running when the latency
264interrupts were disabled. It gives the trace version and the version 337occurred. (swapper pid: 0).
265of the kernel upon which this was executed on (2.6.26-rc8). Then it displays
266the max latency in microsecs (97 us). The number of trace entries displayed
267and the total number recorded (both are three: #3/3). The type of
268preemption that was used (PREEMPT). VP, KP, SP, and HP are always zero
269and are reserved for later use. #P is the number of online CPUS (#P:2).
270
271The task is the process that was running when the latency occurred.
272(swapper pid: 0).
273 338
274The start and stop (the functions in which the interrupts were disabled and 339The start and stop (the functions in which the interrupts were
275enabled respectively) that caused the latencies: 340disabled and enabled respectively) that caused the latencies:
276 341
277 apic_timer_interrupt is where the interrupts were disabled. 342 apic_timer_interrupt is where the interrupts were disabled.
278 do_softirq is where they were enabled again. 343 do_softirq is where they were enabled again.
@@ -308,12 +373,12 @@ The above is mostly meaningful for kernel developers.
308 latency_trace file is relative to the start of the trace. 373 latency_trace file is relative to the start of the trace.
309 374
310 delay: This is just to help catch your eye a bit better. And 375 delay: This is just to help catch your eye a bit better. And
311 needs to be fixed to be only relative to the same CPU. 376 needs to be fixed to be only relative to the same CPU.
312 The marks are determined by the difference between this 377 The marks are determined by the difference between this
313 current trace and the next trace. 378 current trace and the next trace.
314 '!' - greater than preempt_mark_thresh (default 100) 379 '!' - greater than preempt_mark_thresh (default 100)
315 '+' - greater than 1 microsecond 380 '+' - greater than 1 microsecond
316 ' ' - less than or equal to 1 microsecond. 381 ' ' - less than or equal to 1 microsecond.
317 382
318 The rest is the same as the 'trace' file. 383 The rest is the same as the 'trace' file.
319 384
@@ -321,14 +386,15 @@ The above is mostly meaningful for kernel developers.
321trace_options 386trace_options
322------------- 387-------------
323 388
324The trace_options file is used to control what gets printed in the trace 389The trace_options file is used to control what gets printed in
325output. To see what is available, simply cat the file: 390the trace output. To see what is available, simply cat the file:
326 391
327 cat /debug/tracing/trace_options 392 cat /debug/tracing/trace_options
328 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ 393 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
329 noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj 394 noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj
330 395
331To disable one of the options, echo in the option prepended with "no". 396To disable one of the options, echo in the option prepended with
397"no".
332 398
333 echo noprint-parent > /debug/tracing/trace_options 399 echo noprint-parent > /debug/tracing/trace_options
334 400
@@ -338,8 +404,8 @@ To enable an option, leave off the "no".
338 404
339Here are the available options: 405Here are the available options:
340 406
341 print-parent - On function traces, display the calling function 407 print-parent - On function traces, display the calling (parent)
342 as well as the function being traced. 408 function as well as the function being traced.
343 409
344 print-parent: 410 print-parent:
345 bash-4000 [01] 1477.606694: simple_strtoul <-strict_strtoul 411 bash-4000 [01] 1477.606694: simple_strtoul <-strict_strtoul
@@ -348,15 +414,16 @@ Here are the available options:
348 bash-4000 [01] 1477.606694: simple_strtoul 414 bash-4000 [01] 1477.606694: simple_strtoul
349 415
350 416
351 sym-offset - Display not only the function name, but also the offset 417 sym-offset - Display not only the function name, but also the
352 in the function. For example, instead of seeing just 418 offset in the function. For example, instead of
353 "ktime_get", you will see "ktime_get+0xb/0x20". 419 seeing just "ktime_get", you will see
420 "ktime_get+0xb/0x20".
354 421
355 sym-offset: 422 sym-offset:
356 bash-4000 [01] 1477.606694: simple_strtoul+0x6/0xa0 423 bash-4000 [01] 1477.606694: simple_strtoul+0x6/0xa0
357 424
358 sym-addr - this will also display the function address as well as 425 sym-addr - this will also display the function address as well
359 the function name. 426 as the function name.
360 427
361 sym-addr: 428 sym-addr:
362 bash-4000 [01] 1477.606694: simple_strtoul <c0339346> 429 bash-4000 [01] 1477.606694: simple_strtoul <c0339346>
@@ -366,35 +433,41 @@ Here are the available options:
366 bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \ 433 bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \
367 (+0.000ms): simple_strtoul (strict_strtoul) 434 (+0.000ms): simple_strtoul (strict_strtoul)
368 435
369 raw - This will display raw numbers. This option is best for use with 436 raw - This will display raw numbers. This option is best for
370 user applications that can translate the raw numbers better than 437 use with user applications that can translate the raw
371 having it done in the kernel. 438 numbers better than having it done in the kernel.
372 439
373 hex - Similar to raw, but the numbers will be in a hexadecimal format. 440 hex - Similar to raw, but the numbers will be in a hexadecimal
441 format.
374 442
375 bin - This will print out the formats in raw binary. 443 bin - This will print out the formats in raw binary.
376 444
377 block - TBD (needs update) 445 block - TBD (needs update)
378 446
379 stacktrace - This is one of the options that changes the trace itself. 447 stacktrace - This is one of the options that changes the trace
380 When a trace is recorded, so is the stack of functions. 448 itself. When a trace is recorded, so is the stack
381 This allows for back traces of trace sites. 449 of functions. This allows for back traces of
450 trace sites.
382 451
383 userstacktrace - This option changes the trace. 452 userstacktrace - This option changes the trace. It records a
384 It records a stacktrace of the current userspace thread. 453 stacktrace of the current userspace thread.
385 454
386 sym-userobj - when user stacktraces are enabled, look up which object the 455 sym-userobj - when user stacktraces are enabled, look up which
387 address belongs to, and print a relative address 456 object the address belongs to, and print a
388 This is especially useful when ASLR is on, otherwise you don't 457 relative address. This is especially useful when
389 get a chance to resolve the address to object/file/line after the app is no 458 ASLR is on, otherwise you don't get a chance to
390 longer running 459 resolve the address to object/file/line after
460 the app is no longer running
391 461
392 The lookup is performed when you read trace,trace_pipe,latency_trace. Example: 462 The lookup is performed when you read
463 trace,trace_pipe,latency_trace. Example:
393 464
394 a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 465 a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0
395x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] 466x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
396 467
397 sched-tree - TBD (any users??) 468 sched-tree - trace all tasks that are on the runqueue, at
469 every scheduling event. Will add overhead if
470 there's a lot of tasks running at once.
398 471
399 472
400sched_switch 473sched_switch
@@ -431,18 +504,19 @@ of how to use it.
431 [...] 504 [...]
432 505
433 506
434As we have discussed previously about this format, the header shows 507As we have discussed previously about this format, the header
435the name of the trace and points to the options. The "FUNCTION" 508shows the name of the trace and points to the options. The
436is a misnomer since here it represents the wake ups and context 509"FUNCTION" is a misnomer since here it represents the wake ups
437switches. 510and context switches.
438 511
439The sched_switch file only lists the wake ups (represented with '+') 512The sched_switch file only lists the wake ups (represented with
440and context switches ('==>') with the previous task or current task 513'+') and context switches ('==>') with the previous task or
441first followed by the next task or task waking up. The format for both 514current task first followed by the next task or task waking up.
442of these is PID:KERNEL-PRIO:TASK-STATE. Remember that the KERNEL-PRIO 515The format for both of these is PID:KERNEL-PRIO:TASK-STATE.
443is the inverse of the actual priority with zero (0) being the highest 516Remember that the KERNEL-PRIO is the inverse of the actual
444priority and the nice values starting at 100 (nice -20). Below is 517priority with zero (0) being the highest priority and the nice
445a quick chart to map the kernel priority to user land priorities. 518values starting at 100 (nice -20). Below is a quick chart to map
519the kernel priority to user land priorities.
446 520
447 Kernel priority: 0 to 99 ==> user RT priority 99 to 0 521 Kernel priority: 0 to 99 ==> user RT priority 99 to 0
448 Kernel priority: 100 to 139 ==> user nice -20 to 19 522 Kernel priority: 100 to 139 ==> user nice -20 to 19
@@ -463,10 +537,10 @@ The task states are:
463ftrace_enabled 537ftrace_enabled
464-------------- 538--------------
465 539
466The following tracers (listed below) give different output depending 540The following tracers (listed below) give different output
467on whether or not the sysctl ftrace_enabled is set. To set ftrace_enabled, 541depending on whether or not the sysctl ftrace_enabled is set. To
468one can either use the sysctl function or set it via the proc 542set ftrace_enabled, one can either use the sysctl function or
469file system interface. 543set it via the proc file system interface.
470 544
471 sysctl kernel.ftrace_enabled=1 545 sysctl kernel.ftrace_enabled=1
472 546
@@ -474,12 +548,12 @@ file system interface.
474 548
475 echo 1 > /proc/sys/kernel/ftrace_enabled 549 echo 1 > /proc/sys/kernel/ftrace_enabled
476 550
477To disable ftrace_enabled simply replace the '1' with '0' in 551To disable ftrace_enabled simply replace the '1' with '0' in the
478the above commands. 552above commands.
479 553
480When ftrace_enabled is set the tracers will also record the functions 554When ftrace_enabled is set the tracers will also record the
481that are within the trace. The descriptions of the tracers 555functions that are within the trace. The descriptions of the
482will also show an example with ftrace enabled. 556tracers will also show an example with ftrace enabled.
483 557
484 558
485irqsoff 559irqsoff
@@ -487,17 +561,18 @@ irqsoff
487 561
488When interrupts are disabled, the CPU can not react to any other 562When interrupts are disabled, the CPU can not react to any other
489external event (besides NMIs and SMIs). This prevents the timer 563external event (besides NMIs and SMIs). This prevents the timer
490interrupt from triggering or the mouse interrupt from letting the 564interrupt from triggering or the mouse interrupt from letting
491kernel know of a new mouse event. The result is a latency with the 565the kernel know of a new mouse event. The result is a latency
492reaction time. 566with the reaction time.
493 567
494The irqsoff tracer tracks the time for which interrupts are disabled. 568The irqsoff tracer tracks the time for which interrupts are
495When a new maximum latency is hit, the tracer saves the trace leading up 569disabled. When a new maximum latency is hit, the tracer saves
496to that latency point so that every time a new maximum is reached, the old 570the trace leading up to that latency point so that every time a
497saved trace is discarded and the new trace is saved. 571new maximum is reached, the old saved trace is discarded and the
572new trace is saved.
498 573
499To reset the maximum, echo 0 into tracing_max_latency. Here is an 574To reset the maximum, echo 0 into tracing_max_latency. Here is
500example: 575an example:
501 576
502 # echo irqsoff > /debug/tracing/current_tracer 577 # echo irqsoff > /debug/tracing/current_tracer
503 # echo 0 > /debug/tracing/tracing_max_latency 578 # echo 0 > /debug/tracing/tracing_max_latency
@@ -532,10 +607,11 @@ irqsoff latency trace v1.1.5 on 2.6.26
532 607
533 608
534Here we see that we had a latency of 12 microsecs (which is 609Here we see that we had a latency of 12 microsecs (which is
535very good). The _write_lock_irq in sys_setpgid disabled interrupts. 610very good). The _write_lock_irq in sys_setpgid disabled
536The difference between the 12 and the displayed timestamp 14us occurred 611interrupts. The difference between the 12 and the displayed
537because the clock was incremented between the time of recording the max 612timestamp 14us occurred because the clock was incremented
538latency and the time of recording the function that had that latency. 613between the time of recording the max latency and the time of
614recording the function that had that latency.
539 615
540Note the above example had ftrace_enabled not set. If we set the 616Note the above example had ftrace_enabled not set. If we set the
541ftrace_enabled, we get a much larger output: 617ftrace_enabled, we get a much larger output:
@@ -586,24 +662,24 @@ irqsoff latency trace v1.1.5 on 2.6.26-rc8
586 662
587 663
588Here we traced a 50 microsecond latency. But we also see all the 664Here we traced a 50 microsecond latency. But we also see all the
589functions that were called during that time. Note that by enabling 665functions that were called during that time. Note that by
590function tracing, we incur an added overhead. This overhead may 666enabling function tracing, we incur an added overhead. This
591extend the latency times. But nevertheless, this trace has provided 667overhead may extend the latency times. But nevertheless, this
592some very helpful debugging information. 668trace has provided some very helpful debugging information.
593 669
594 670
595preemptoff 671preemptoff
596---------- 672----------
597 673
598When preemption is disabled, we may be able to receive interrupts but 674When preemption is disabled, we may be able to receive
599the task cannot be preempted and a higher priority task must wait 675interrupts but the task cannot be preempted and a higher
600for preemption to be enabled again before it can preempt a lower 676priority task must wait for preemption to be enabled again
601priority task. 677before it can preempt a lower priority task.
602 678
603The preemptoff tracer traces the places that disable preemption. 679The preemptoff tracer traces the places that disable preemption.
604Like the irqsoff tracer, it records the maximum latency for which preemption 680Like the irqsoff tracer, it records the maximum latency for
605was disabled. The control of preemptoff tracer is much like the irqsoff 681which preemption was disabled. The control of preemptoff tracer
606tracer. 682is much like the irqsoff tracer.
607 683
608 # echo preemptoff > /debug/tracing/current_tracer 684 # echo preemptoff > /debug/tracing/current_tracer
609 # echo 0 > /debug/tracing/tracing_max_latency 685 # echo 0 > /debug/tracing/tracing_max_latency
@@ -637,11 +713,12 @@ preemptoff latency trace v1.1.5 on 2.6.26-rc8
637 sshd-4261 0d.s1 30us : trace_preempt_on (__do_softirq) 713 sshd-4261 0d.s1 30us : trace_preempt_on (__do_softirq)
638 714
639 715
640This has some more changes. Preemption was disabled when an interrupt 716This has some more changes. Preemption was disabled when an
641came in (notice the 'h'), and was enabled while doing a softirq. 717interrupt came in (notice the 'h'), and was enabled while doing
642(notice the 's'). But we also see that interrupts have been disabled 718a softirq. (notice the 's'). But we also see that interrupts
643when entering the preempt off section and leaving it (the 'd'). 719have been disabled when entering the preempt off section and
644We do not know if interrupts were enabled in the mean time. 720leaving it (the 'd'). We do not know if interrupts were enabled
721in the mean time.
645 722
646# tracer: preemptoff 723# tracer: preemptoff
647# 724#
@@ -700,28 +777,30 @@ preemptoff latency trace v1.1.5 on 2.6.26-rc8
700 sshd-4261 0d.s1 64us : trace_preempt_on (__do_softirq) 777 sshd-4261 0d.s1 64us : trace_preempt_on (__do_softirq)
701 778
702 779
703The above is an example of the preemptoff trace with ftrace_enabled 780The above is an example of the preemptoff trace with
704set. Here we see that interrupts were disabled the entire time. 781ftrace_enabled set. Here we see that interrupts were disabled
705The irq_enter code lets us know that we entered an interrupt 'h'. 782the entire time. The irq_enter code lets us know that we entered
706Before that, the functions being traced still show that it is not 783an interrupt 'h'. Before that, the functions being traced still
707in an interrupt, but we can see from the functions themselves that 784show that it is not in an interrupt, but we can see from the
708this is not the case. 785functions themselves that this is not the case.
709 786
710Notice that __do_softirq when called does not have a preempt_count. 787Notice that __do_softirq when called does not have a
711It may seem that we missed a preempt enabling. What really happened 788preempt_count. It may seem that we missed a preempt enabling.
712is that the preempt count is held on the thread's stack and we 789What really happened is that the preempt count is held on the
713switched to the softirq stack (4K stacks in effect). The code 790thread's stack and we switched to the softirq stack (4K stacks
714does not copy the preempt count, but because interrupts are disabled, 791in effect). The code does not copy the preempt count, but
715we do not need to worry about it. Having a tracer like this is good 792because interrupts are disabled, we do not need to worry about
716for letting people know what really happens inside the kernel. 793it. Having a tracer like this is good for letting people know
794what really happens inside the kernel.
717 795
718 796
719preemptirqsoff 797preemptirqsoff
720-------------- 798--------------
721 799
722Knowing the locations that have interrupts disabled or preemption 800Knowing the locations that have interrupts disabled or
723disabled for the longest times is helpful. But sometimes we would 801preemption disabled for the longest times is helpful. But
724like to know when either preemption and/or interrupts are disabled. 802sometimes we would like to know when either preemption and/or
803interrupts are disabled.
725 804
726Consider the following code: 805Consider the following code:
727 806
@@ -741,11 +820,13 @@ The preemptoff tracer will record the total length of
741call_function_with_irqs_and_preemption_off() and 820call_function_with_irqs_and_preemption_off() and
742call_function_with_preemption_off(). 821call_function_with_preemption_off().
743 822
744But neither will trace the time that interrupts and/or preemption 823But neither will trace the time that interrupts and/or
745is disabled. This total time is the time that we can not schedule. 824preemption is disabled. This total time is the time that we can
746To record this time, use the preemptirqsoff tracer. 825not schedule. To record this time, use the preemptirqsoff
826tracer.
747 827
748Again, using this trace is much like the irqsoff and preemptoff tracers. 828Again, using this trace is much like the irqsoff and preemptoff
829tracers.
749 830
750 # echo preemptirqsoff > /debug/tracing/current_tracer 831 # echo preemptirqsoff > /debug/tracing/current_tracer
751 # echo 0 > /debug/tracing/tracing_max_latency 832 # echo 0 > /debug/tracing/tracing_max_latency
@@ -781,9 +862,10 @@ preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
781 862
782 863
783The trace_hardirqs_off_thunk is called from assembly on x86 when 864The trace_hardirqs_off_thunk is called from assembly on x86 when
784interrupts are disabled in the assembly code. Without the function 865interrupts are disabled in the assembly code. Without the
785tracing, we do not know if interrupts were enabled within the preemption 866function tracing, we do not know if interrupts were enabled
786points. We do see that it started with preemption enabled. 867within the preemption points. We do see that it started with
868preemption enabled.
787 869
788Here is a trace with ftrace_enabled set: 870Here is a trace with ftrace_enabled set:
789 871
@@ -871,40 +953,42 @@ preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
871 sshd-4261 0d.s1 105us : trace_preempt_on (__do_softirq) 953 sshd-4261 0d.s1 105us : trace_preempt_on (__do_softirq)
872 954
873 955
874This is a very interesting trace. It started with the preemption of 956This is a very interesting trace. It started with the preemption
875the ls task. We see that the task had the "need_resched" bit set 957of the ls task. We see that the task had the "need_resched" bit
876via the 'N' in the trace. Interrupts were disabled before the spin_lock 958set via the 'N' in the trace. Interrupts were disabled before
877at the beginning of the trace. We see that a schedule took place to run 959the spin_lock at the beginning of the trace. We see that a
878sshd. When the interrupts were enabled, we took an interrupt. 960schedule took place to run sshd. When the interrupts were
879On return from the interrupt handler, the softirq ran. We took another 961enabled, we took an interrupt. On return from the interrupt
880interrupt while running the softirq as we see from the capital 'H'. 962handler, the softirq ran. We took another interrupt while
963running the softirq as we see from the capital 'H'.
881 964
882 965
883wakeup 966wakeup
884------ 967------
885 968
886In a Real-Time environment it is very important to know the wakeup 969In a Real-Time environment it is very important to know the
887time it takes for the highest priority task that is woken up to the 970wakeup time it takes for the highest priority task that is woken
888time that it executes. This is also known as "schedule latency". 971up to the time that it executes. This is also known as "schedule
889I stress the point that this is about RT tasks. It is also important 972latency". I stress the point that this is about RT tasks. It is
890to know the scheduling latency of non-RT tasks, but the average 973also important to know the scheduling latency of non-RT tasks,
891schedule latency is better for non-RT tasks. Tools like 974but the average schedule latency is better for non-RT tasks.
892LatencyTop are more appropriate for such measurements. 975Tools like LatencyTop are more appropriate for such
976measurements.
893 977
894Real-Time environments are interested in the worst case latency. 978Real-Time environments are interested in the worst case latency.
895That is the longest latency it takes for something to happen, and 979That is the longest latency it takes for something to happen,
896not the average. We can have a very fast scheduler that may only 980and not the average. We can have a very fast scheduler that may
897have a large latency once in a while, but that would not work well 981only have a large latency once in a while, but that would not
898with Real-Time tasks. The wakeup tracer was designed to record 982work well with Real-Time tasks. The wakeup tracer was designed
899the worst case wakeups of RT tasks. Non-RT tasks are not recorded 983to record the worst case wakeups of RT tasks. Non-RT tasks are
900because the tracer only records one worst case and tracing non-RT 984not recorded because the tracer only records one worst case and
901tasks that are unpredictable will overwrite the worst case latency 985tracing non-RT tasks that are unpredictable will overwrite the
902of RT tasks. 986worst case latency of RT tasks.
903 987
904Since this tracer only deals with RT tasks, we will run this slightly 988Since this tracer only deals with RT tasks, we will run this
905differently than we did with the previous tracers. Instead of performing 989slightly differently than we did with the previous tracers.
906an 'ls', we will run 'sleep 1' under 'chrt' which changes the 990Instead of performing an 'ls', we will run 'sleep 1' under
907priority of the task. 991'chrt' which changes the priority of the task.
908 992
909 # echo wakeup > /debug/tracing/current_tracer 993 # echo wakeup > /debug/tracing/current_tracer
910 # echo 0 > /debug/tracing/tracing_max_latency 994 # echo 0 > /debug/tracing/tracing_max_latency
@@ -934,17 +1018,16 @@ wakeup latency trace v1.1.5 on 2.6.26-rc8
934 <idle>-0 1d..4 4us : schedule (cpu_idle) 1018 <idle>-0 1d..4 4us : schedule (cpu_idle)
935 1019
936 1020
1021Running this on an idle system, we see that it only took 4
1022microseconds to perform the task switch. Note, since the trace
1023marker in the schedule is before the actual "switch", we stop
1024the tracing when the recorded task is about to schedule in. This
1025may change if we add a new marker at the end of the scheduler.
937 1026
938Running this on an idle system, we see that it only took 4 microseconds 1027Notice that the recorded task is 'sleep' with the PID of 4901
939to perform the task switch. Note, since the trace marker in the 1028and it has an rt_prio of 5. This priority is user-space priority
940schedule is before the actual "switch", we stop the tracing when 1029and not the internal kernel priority. The policy is 1 for
941the recorded task is about to schedule in. This may change if 1030SCHED_FIFO and 2 for SCHED_RR.
942we add a new marker at the end of the scheduler.
943
944Notice that the recorded task is 'sleep' with the PID of 4901 and it
945has an rt_prio of 5. This priority is user-space priority and not
946the internal kernel priority. The policy is 1 for SCHED_FIFO and 2
947for SCHED_RR.
948 1031
949Doing the same with chrt -r 5 and ftrace_enabled set. 1032Doing the same with chrt -r 5 and ftrace_enabled set.
950 1033
@@ -1001,24 +1084,25 @@ ksoftirq-7 1d..6 49us : _spin_unlock (tracing_record_cmdline)
1001ksoftirq-7 1d..6 49us : sub_preempt_count (_spin_unlock) 1084ksoftirq-7 1d..6 49us : sub_preempt_count (_spin_unlock)
1002ksoftirq-7 1d..4 50us : schedule (__cond_resched) 1085ksoftirq-7 1d..4 50us : schedule (__cond_resched)
1003 1086
1004The interrupt went off while running ksoftirqd. This task runs at 1087The interrupt went off while running ksoftirqd. This task runs
1005SCHED_OTHER. Why did we not see the 'N' set early? This may be 1088at SCHED_OTHER. Why did we not see the 'N' set early? This may
1006a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K stacks 1089be a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K
1007configured, the interrupt and softirq run with their own stack. 1090stacks configured, the interrupt and softirq run with their own
1008Some information is held on the top of the task's stack (need_resched 1091stack. Some information is held on the top of the task's stack
1009and preempt_count are both stored there). The setting of the NEED_RESCHED 1092(need_resched and preempt_count are both stored there). The
1010bit is done directly to the task's stack, but the reading of the 1093setting of the NEED_RESCHED bit is done directly to the task's
1011NEED_RESCHED is done by looking at the current stack, which in this case 1094stack, but the reading of the NEED_RESCHED is done by looking at
1012is the stack for the hard interrupt. This hides the fact that NEED_RESCHED 1095the current stack, which in this case is the stack for the hard
1013has been set. We do not see the 'N' until we switch back to the task's 1096interrupt. This hides the fact that NEED_RESCHED has been set.
1097We do not see the 'N' until we switch back to the task's
1014assigned stack. 1098assigned stack.
1015 1099
1016function 1100function
1017-------- 1101--------
1018 1102
1019This tracer is the function tracer. Enabling the function tracer 1103This tracer is the function tracer. Enabling the function tracer
1020can be done from the debug file system. Make sure the ftrace_enabled is 1104can be done from the debug file system. Make sure the
1021set; otherwise this tracer is a nop. 1105ftrace_enabled is set; otherwise this tracer is a nop.
1022 1106
1023 # sysctl kernel.ftrace_enabled=1 1107 # sysctl kernel.ftrace_enabled=1
1024 # echo function > /debug/tracing/current_tracer 1108 # echo function > /debug/tracing/current_tracer
@@ -1048,14 +1132,15 @@ set; otherwise this tracer is a nop.
1048[...] 1132[...]
1049 1133
1050 1134
1051Note: function tracer uses ring buffers to store the above entries. 1135Note: function tracer uses ring buffers to store the above
1052The newest data may overwrite the oldest data. Sometimes using echo to 1136entries. The newest data may overwrite the oldest data.
1053stop the trace is not sufficient because the tracing could have overwritten 1137Sometimes using echo to stop the trace is not sufficient because
1054the data that you wanted to record. For this reason, it is sometimes better to 1138the tracing could have overwritten the data that you wanted to
1055disable tracing directly from a program. This allows you to stop the 1139record. For this reason, it is sometimes better to disable
1056tracing at the point that you hit the part that you are interested in. 1140tracing directly from a program. This allows you to stop the
1057To disable the tracing directly from a C program, something like the following 1141tracing at the point that you hit the part that you are
1058code snippet can be used: 1142interested in. To disable the tracing directly from a C program,
1143something like the following code snippet can be used:
1059 1144
1060int trace_fd; 1145int trace_fd;
1061[...] 1146[...]
@@ -1070,10 +1155,10 @@ int main(int argc, char *argv[]) {
1070} 1155}
1071 1156
1072Note: Here we hard coded the path name. The debugfs mount is not 1157Note: Here we hard coded the path name. The debugfs mount is not
1073guaranteed to be at /debug (and is more commonly at /sys/kernel/debug). 1158guaranteed to be at /debug (and is more commonly at
1074For simple one time traces, the above is sufficient. For anything else, 1159/sys/kernel/debug). For simple one time traces, the above is
1075a search through /proc/mounts may be needed to find where the debugfs 1160sufficient. For anything else, a search through /proc/mounts may
1076file-system is mounted. 1161be needed to find where the debugfs file-system is mounted.
1077 1162
1078 1163
1079Single thread tracing 1164Single thread tracing
@@ -1152,49 +1237,297 @@ int main (int argc, char **argv)
1152 return 0; 1237 return 0;
1153} 1238}
1154 1239
1240
1241hw-branch-tracer (x86 only)
1242---------------------------
1243
1244This tracer uses the x86 last branch tracing hardware feature to
1245collect a branch trace on all cpus with relatively low overhead.
1246
1247The tracer uses a fixed-size circular buffer per cpu and only
1248traces ring 0 branches. The trace file dumps that buffer in the
1249following format:
1250
1251# tracer: hw-branch-tracer
1252#
1253# CPU# TO <- FROM
1254 0 scheduler_tick+0xb5/0x1bf <- task_tick_idle+0x5/0x6
1255 2 run_posix_cpu_timers+0x2b/0x72a <- run_posix_cpu_timers+0x25/0x72a
1256 0 scheduler_tick+0x139/0x1bf <- scheduler_tick+0xed/0x1bf
1257 0 scheduler_tick+0x17c/0x1bf <- scheduler_tick+0x148/0x1bf
1258 2 run_posix_cpu_timers+0x9e/0x72a <- run_posix_cpu_timers+0x5e/0x72a
1259 0 scheduler_tick+0x1b6/0x1bf <- scheduler_tick+0x1aa/0x1bf
1260
1261
1262The tracer may be used to dump the trace for the oops'ing cpu on
1263a kernel oops into the system log. To enable this,
1264ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one
1265can either use the sysctl function or set it via the proc system
1266interface.
1267
1268 sysctl kernel.ftrace_dump_on_oops=1
1269
1270or
1271
1272 echo 1 > /proc/sys/kernel/ftrace_dump_on_oops
1273
1274
1275Here's an example of such a dump after a null pointer
1276dereference in a kernel module:
1277
1278[57848.105921] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
1279[57848.106019] IP: [<ffffffffa0000006>] open+0x6/0x14 [oops]
1280[57848.106019] PGD 2354e9067 PUD 2375e7067 PMD 0
1281[57848.106019] Oops: 0002 [#1] SMP
1282[57848.106019] last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:20:05.0/local_cpus
1283[57848.106019] Dumping ftrace buffer:
1284[57848.106019] ---------------------------------
1285[...]
1286[57848.106019] 0 chrdev_open+0xe6/0x165 <- cdev_put+0x23/0x24
1287[57848.106019] 0 chrdev_open+0x117/0x165 <- chrdev_open+0xfa/0x165
1288[57848.106019] 0 chrdev_open+0x120/0x165 <- chrdev_open+0x11c/0x165
1289[57848.106019] 0 chrdev_open+0x134/0x165 <- chrdev_open+0x12b/0x165
1290[57848.106019] 0 open+0x0/0x14 [oops] <- chrdev_open+0x144/0x165
1291[57848.106019] 0 page_fault+0x0/0x30 <- open+0x6/0x14 [oops]
1292[57848.106019] 0 error_entry+0x0/0x5b <- page_fault+0x4/0x30
1293[57848.106019] 0 error_kernelspace+0x0/0x31 <- error_entry+0x59/0x5b
1294[57848.106019] 0 error_sti+0x0/0x1 <- error_kernelspace+0x2d/0x31
1295[57848.106019] 0 page_fault+0x9/0x30 <- error_sti+0x0/0x1
1296[57848.106019] 0 do_page_fault+0x0/0x881 <- page_fault+0x1a/0x30
1297[...]
1298[57848.106019] 0 do_page_fault+0x66b/0x881 <- is_prefetch+0x1ee/0x1f2
1299[57848.106019] 0 do_page_fault+0x6e0/0x881 <- do_page_fault+0x67a/0x881
1300[57848.106019] 0 oops_begin+0x0/0x96 <- do_page_fault+0x6e0/0x881
1301[57848.106019] 0 trace_hw_branch_oops+0x0/0x2d <- oops_begin+0x9/0x96
1302[...]
1303[57848.106019] 0 ds_suspend_bts+0x2a/0xe3 <- ds_suspend_bts+0x1a/0xe3
1304[57848.106019] ---------------------------------
1305[57848.106019] CPU 0
1306[57848.106019] Modules linked in: oops
1307[57848.106019] Pid: 5542, comm: cat Tainted: G W 2.6.28 #23
1308[57848.106019] RIP: 0010:[<ffffffffa0000006>] [<ffffffffa0000006>] open+0x6/0x14 [oops]
1309[57848.106019] RSP: 0018:ffff880235457d48 EFLAGS: 00010246
1310[...]
1311
1312
1313function graph tracer
1314---------------------------
1315
1316This tracer is similar to the function tracer except that it
1317probes a function on its entry and its exit. This is done by
1318using a dynamically allocated stack of return addresses in each
1319task_struct. On function entry the tracer overwrites the return
1320address of each function traced to set a custom probe. Thus the
1321original return address is stored on the stack of return addresses
1322in the task_struct.
1323
1324Probing on both ends of a function leads to special features
1325such as:
1326
1327- measure of a function's time execution
1328- having a reliable call stack to draw function calls graph
1329
1330This tracer is useful in several situations:
1331
1332- you want to find the reason of a strange kernel behavior and
1333 need to see what happens in detail on any areas (or specific
1334 ones).
1335
1336- you are experiencing weird latencies but it's difficult to
1337 find its origin.
1338
1339- you want to find quickly which path is taken by a specific
1340 function
1341
1342- you just want to peek inside a working kernel and want to see
1343 what happens there.
1344
1345# tracer: function_graph
1346#
1347# CPU DURATION FUNCTION CALLS
1348# | | | | | | |
1349
1350 0) | sys_open() {
1351 0) | do_sys_open() {
1352 0) | getname() {
1353 0) | kmem_cache_alloc() {
1354 0) 1.382 us | __might_sleep();
1355 0) 2.478 us | }
1356 0) | strncpy_from_user() {
1357 0) | might_fault() {
1358 0) 1.389 us | __might_sleep();
1359 0) 2.553 us | }
1360 0) 3.807 us | }
1361 0) 7.876 us | }
1362 0) | alloc_fd() {
1363 0) 0.668 us | _spin_lock();
1364 0) 0.570 us | expand_files();
1365 0) 0.586 us | _spin_unlock();
1366
1367
1368There are several columns that can be dynamically
1369enabled/disabled. You can use every combination of options you
1370want, depending on your needs.
1371
1372- The cpu number on which the function executed is default
1373 enabled. It is sometimes better to only trace one cpu (see
1374 tracing_cpu_mask file) or you might sometimes see unordered
1375 function calls while cpu tracing switch.
1376
1377 hide: echo nofuncgraph-cpu > /debug/tracing/trace_options
1378 show: echo funcgraph-cpu > /debug/tracing/trace_options
1379
1380- The duration (function's time of execution) is displayed on
1381 the closing bracket line of a function or on the same line
1382 as the current function in case of a leaf one. It is default
1383 enabled.
1384
1385 hide: echo nofuncgraph-duration > /debug/tracing/trace_options
1386 show: echo funcgraph-duration > /debug/tracing/trace_options
1387
1388- The overhead field precedes the duration field in case of
1389 reached duration thresholds.
1390
1391 hide: echo nofuncgraph-overhead > /debug/tracing/trace_options
1392 show: echo funcgraph-overhead > /debug/tracing/trace_options
1393 depends on: funcgraph-duration
1394
1395 ie:
1396
1397 0) | up_write() {
1398 0) 0.646 us | _spin_lock_irqsave();
1399 0) 0.684 us | _spin_unlock_irqrestore();
1400 0) 3.123 us | }
1401 0) 0.548 us | fput();
1402 0) + 58.628 us | }
1403
1404 [...]
1405
1406 0) | putname() {
1407 0) | kmem_cache_free() {
1408 0) 0.518 us | __phys_addr();
1409 0) 1.757 us | }
1410 0) 2.861 us | }
1411 0) ! 115.305 us | }
1412 0) ! 116.402 us | }
1413
1414 + means that the function exceeded 10 usecs.
1415 ! means that the function exceeded 100 usecs.
1416
1417
1418- The task/pid field displays the thread cmdline and pid which
1419 executed the function. It is default disabled.
1420
1421 hide: echo nofuncgraph-proc > /debug/tracing/trace_options
1422 show: echo funcgraph-proc > /debug/tracing/trace_options
1423
1424 ie:
1425
1426 # tracer: function_graph
1427 #
1428 # CPU TASK/PID DURATION FUNCTION CALLS
1429 # | | | | | | | | |
1430 0) sh-4802 | | d_free() {
1431 0) sh-4802 | | call_rcu() {
1432 0) sh-4802 | | __call_rcu() {
1433 0) sh-4802 | 0.616 us | rcu_process_gp_end();
1434 0) sh-4802 | 0.586 us | check_for_new_grace_period();
1435 0) sh-4802 | 2.899 us | }
1436 0) sh-4802 | 4.040 us | }
1437 0) sh-4802 | 5.151 us | }
1438 0) sh-4802 | + 49.370 us | }
1439
1440
1441- The absolute time field is an absolute timestamp given by the
1442 system clock since it started. A snapshot of this time is
1443 given on each entry/exit of functions
1444
1445 hide: echo nofuncgraph-abstime > /debug/tracing/trace_options
1446 show: echo funcgraph-abstime > /debug/tracing/trace_options
1447
1448 ie:
1449
1450 #
1451 # TIME CPU DURATION FUNCTION CALLS
1452 # | | | | | | | |
1453 360.774522 | 1) 0.541 us | }
1454 360.774522 | 1) 4.663 us | }
1455 360.774523 | 1) 0.541 us | __wake_up_bit();
1456 360.774524 | 1) 6.796 us | }
1457 360.774524 | 1) 7.952 us | }
1458 360.774525 | 1) 9.063 us | }
1459 360.774525 | 1) 0.615 us | journal_mark_dirty();
1460 360.774527 | 1) 0.578 us | __brelse();
1461 360.774528 | 1) | reiserfs_prepare_for_journal() {
1462 360.774528 | 1) | unlock_buffer() {
1463 360.774529 | 1) | wake_up_bit() {
1464 360.774529 | 1) | bit_waitqueue() {
1465 360.774530 | 1) 0.594 us | __phys_addr();
1466
1467
1468You can put some comments on specific functions by using
1469trace_printk(). For example, if you want to put a comment inside
1470the __might_sleep() function, you just have to include
1471<linux/ftrace.h> and call trace_printk() inside __might_sleep()
1472
1473trace_printk("I'm a comment!\n")
1474
1475will produce:
1476
1477 1) | __might_sleep() {
1478 1) | /* I'm a comment! */
1479 1) 1.449 us | }
1480
1481
1482You might find other useful features for this tracer in the
1483following "dynamic ftrace" section such as tracing only specific
1484functions or tasks.
1485
1155dynamic ftrace 1486dynamic ftrace
1156-------------- 1487--------------
1157 1488
1158If CONFIG_DYNAMIC_FTRACE is set, the system will run with 1489If CONFIG_DYNAMIC_FTRACE is set, the system will run with
1159virtually no overhead when function tracing is disabled. The way 1490virtually no overhead when function tracing is disabled. The way
1160this works is the mcount function call (placed at the start of 1491this works is the mcount function call (placed at the start of
1161every kernel function, produced by the -pg switch in gcc), starts 1492every kernel function, produced by the -pg switch in gcc),
1162off pointing to a simple return. (Enabling FTRACE will include the 1493starts off pointing to a simple return. (Enabling FTRACE will
1163-pg switch in the compiling of the kernel.) 1494include the -pg switch in the compiling of the kernel.)
1164 1495
1165At compile time every C file object is run through the 1496At compile time every C file object is run through the
1166recordmcount.pl script (located in the scripts directory). This 1497recordmcount.pl script (located in the scripts directory). This
1167script will process the C object using objdump to find all the 1498script will process the C object using objdump to find all the
1168locations in the .text section that call mcount. (Note, only 1499locations in the .text section that call mcount. (Note, only the
1169the .text section is processed, since processing other sections 1500.text section is processed, since processing other sections like
1170like .init.text may cause races due to those sections being freed). 1501.init.text may cause races due to those sections being freed).
1171 1502
1172A new section called "__mcount_loc" is created that holds references 1503A new section called "__mcount_loc" is created that holds
1173to all the mcount call sites in the .text section. This section is 1504references to all the mcount call sites in the .text section.
1174compiled back into the original object. The final linker will add 1505This section is compiled back into the original object. The
1175all these references into a single table. 1506final linker will add all these references into a single table.
1176 1507
1177On boot up, before SMP is initialized, the dynamic ftrace code 1508On boot up, before SMP is initialized, the dynamic ftrace code
1178scans this table and updates all the locations into nops. It also 1509scans this table and updates all the locations into nops. It
1179records the locations, which are added to the available_filter_functions 1510also records the locations, which are added to the
1180list. Modules are processed as they are loaded and before they are 1511available_filter_functions list. Modules are processed as they
1181executed. When a module is unloaded, it also removes its functions from 1512are loaded and before they are executed. When a module is
1182the ftrace function list. This is automatic in the module unload 1513unloaded, it also removes its functions from the ftrace function
1183code, and the module author does not need to worry about it. 1514list. This is automatic in the module unload code, and the
1184 1515module author does not need to worry about it.
1185When tracing is enabled, kstop_machine is called to prevent races 1516
1186with the CPUS executing code being modified (which can cause the 1517When tracing is enabled, kstop_machine is called to prevent
1187CPU to do undesirable things), and the nops are patched back 1518races with the CPUS executing code being modified (which can
1188to calls. But this time, they do not call mcount (which is just 1519cause the CPU to do undesirable things), and the nops are
1189a function stub). They now call into the ftrace infrastructure. 1520patched back to calls. But this time, they do not call mcount
1521(which is just a function stub). They now call into the ftrace
1522infrastructure.
1190 1523
1191One special side-effect to the recording of the functions being 1524One special side-effect to the recording of the functions being
1192traced is that we can now selectively choose which functions we 1525traced is that we can now selectively choose which functions we
1193wish to trace and which ones we want the mcount calls to remain as 1526wish to trace and which ones we want the mcount calls to remain
1194nops. 1527as nops.
1195 1528
1196Two files are used, one for enabling and one for disabling the tracing 1529Two files are used, one for enabling and one for disabling the
1197of specified functions. They are: 1530tracing of specified functions. They are:
1198 1531
1199 set_ftrace_filter 1532 set_ftrace_filter
1200 1533
@@ -1202,8 +1535,8 @@ and
1202 1535
1203 set_ftrace_notrace 1536 set_ftrace_notrace
1204 1537
1205A list of available functions that you can add to these files is listed 1538A list of available functions that you can add to these files is
1206in: 1539listed in:
1207 1540
1208 available_filter_functions 1541 available_filter_functions
1209 1542
@@ -1240,8 +1573,8 @@ hrtimer_interrupt
1240sys_nanosleep 1573sys_nanosleep
1241 1574
1242 1575
1243Perhaps this is not enough. The filters also allow simple wild cards. 1576Perhaps this is not enough. The filters also allow simple wild
1244Only the following are currently available 1577cards. Only the following are currently available
1245 1578
1246 <match>* - will match functions that begin with <match> 1579 <match>* - will match functions that begin with <match>
1247 *<match> - will match functions that end with <match> 1580 *<match> - will match functions that end with <match>
@@ -1251,9 +1584,9 @@ These are the only wild cards which are supported.
1251 1584
1252 <match>*<match> will not work. 1585 <match>*<match> will not work.
1253 1586
1254Note: It is better to use quotes to enclose the wild cards, otherwise 1587Note: It is better to use quotes to enclose the wild cards,
1255 the shell may expand the parameters into names of files in the local 1588 otherwise the shell may expand the parameters into names
1256 directory. 1589 of files in the local directory.
1257 1590
1258 # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter 1591 # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter
1259 1592
@@ -1299,7 +1632,8 @@ This is because the '>' and '>>' act just like they do in bash.
1299To rewrite the filters, use '>' 1632To rewrite the filters, use '>'
1300To append to the filters, use '>>' 1633To append to the filters, use '>>'
1301 1634
1302To clear out a filter so that all functions will be recorded again: 1635To clear out a filter so that all functions will be recorded
1636again:
1303 1637
1304 # echo > /debug/tracing/set_ftrace_filter 1638 # echo > /debug/tracing/set_ftrace_filter
1305 # cat /debug/tracing/set_ftrace_filter 1639 # cat /debug/tracing/set_ftrace_filter
@@ -1331,7 +1665,8 @@ hrtimer_get_res
1331hrtimer_init_sleeper 1665hrtimer_init_sleeper
1332 1666
1333 1667
1334The set_ftrace_notrace prevents those functions from being traced. 1668The set_ftrace_notrace prevents those functions from being
1669traced.
1335 1670
1336 # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace 1671 # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace
1337 1672
@@ -1353,13 +1688,75 @@ Produces:
1353 1688
1354We can see that there's no more lock or preempt tracing. 1689We can see that there's no more lock or preempt tracing.
1355 1690
1691
1692Dynamic ftrace with the function graph tracer
1693---------------------------------------------
1694
1695Although what has been explained above concerns both the
1696function tracer and the function-graph-tracer, there are some
1697special features only available in the function-graph tracer.
1698
1699If you want to trace only one function and all of its children,
1700you just have to echo its name into set_graph_function:
1701
1702 echo __do_fault > set_graph_function
1703
1704will produce the following "expanded" trace of the __do_fault()
1705function:
1706
1707 0) | __do_fault() {
1708 0) | filemap_fault() {
1709 0) | find_lock_page() {
1710 0) 0.804 us | find_get_page();
1711 0) | __might_sleep() {
1712 0) 1.329 us | }
1713 0) 3.904 us | }
1714 0) 4.979 us | }
1715 0) 0.653 us | _spin_lock();
1716 0) 0.578 us | page_add_file_rmap();
1717 0) 0.525 us | native_set_pte_at();
1718 0) 0.585 us | _spin_unlock();
1719 0) | unlock_page() {
1720 0) 0.541 us | page_waitqueue();
1721 0) 0.639 us | __wake_up_bit();
1722 0) 2.786 us | }
1723 0) + 14.237 us | }
1724 0) | __do_fault() {
1725 0) | filemap_fault() {
1726 0) | find_lock_page() {
1727 0) 0.698 us | find_get_page();
1728 0) | __might_sleep() {
1729 0) 1.412 us | }
1730 0) 3.950 us | }
1731 0) 5.098 us | }
1732 0) 0.631 us | _spin_lock();
1733 0) 0.571 us | page_add_file_rmap();
1734 0) 0.526 us | native_set_pte_at();
1735 0) 0.586 us | _spin_unlock();
1736 0) | unlock_page() {
1737 0) 0.533 us | page_waitqueue();
1738 0) 0.638 us | __wake_up_bit();
1739 0) 2.793 us | }
1740 0) + 14.012 us | }
1741
1742You can also expand several functions at once:
1743
1744 echo sys_open > set_graph_function
1745 echo sys_close >> set_graph_function
1746
1747Now if you want to go back to trace all functions you can clear
1748this special filter via:
1749
1750 echo > set_graph_function
1751
1752
1356trace_pipe 1753trace_pipe
1357---------- 1754----------
1358 1755
1359The trace_pipe outputs the same content as the trace file, but the effect 1756The trace_pipe outputs the same content as the trace file, but
1360on the tracing is different. Every read from trace_pipe is consumed. 1757the effect on the tracing is different. Every read from
1361This means that subsequent reads will be different. The trace 1758trace_pipe is consumed. This means that subsequent reads will be
1362is live. 1759different. The trace is live.
1363 1760
1364 # echo function > /debug/tracing/current_tracer 1761 # echo function > /debug/tracing/current_tracer
1365 # cat /debug/tracing/trace_pipe > /tmp/trace.out & 1762 # cat /debug/tracing/trace_pipe > /tmp/trace.out &
@@ -1387,38 +1784,45 @@ is live.
1387 bash-4043 [00] 41.267111: select_task_rq_rt <-try_to_wake_up 1784 bash-4043 [00] 41.267111: select_task_rq_rt <-try_to_wake_up
1388 1785
1389 1786
1390Note, reading the trace_pipe file will block until more input is added. 1787Note, reading the trace_pipe file will block until more input is
1391By changing the tracer, trace_pipe will issue an EOF. We needed 1788added. By changing the tracer, trace_pipe will issue an EOF. We
1392to set the function tracer _before_ we "cat" the trace_pipe file. 1789needed to set the function tracer _before_ we "cat" the
1790trace_pipe file.
1393 1791
1394 1792
1395trace entries 1793trace entries
1396------------- 1794-------------
1397 1795
1398Having too much or not enough data can be troublesome in diagnosing 1796Having too much or not enough data can be troublesome in
1399an issue in the kernel. The file buffer_size_kb is used to modify 1797diagnosing an issue in the kernel. The file buffer_size_kb is
1400the size of the internal trace buffers. The number listed 1798used to modify the size of the internal trace buffers. The
1401is the number of entries that can be recorded per CPU. To know 1799number listed is the number of entries that can be recorded per
1402the full size, multiply the number of possible CPUs with the 1800CPU. To know the full size, multiply the number of possible CPUs
1403number of entries. 1801with the number of entries.
1404 1802
1405 # cat /debug/tracing/buffer_size_kb 1803 # cat /debug/tracing/buffer_size_kb
14061408 (units kilobytes) 18041408 (units kilobytes)
1407 1805
1408Note, to modify this, you must have tracing completely disabled. To do that, 1806Note, to modify this, you must have tracing completely disabled.
1409echo "nop" into the current_tracer. If the current_tracer is not set 1807To do that, echo "nop" into the current_tracer. If the
1410to "nop", an EINVAL error will be returned. 1808current_tracer is not set to "nop", an EINVAL error will be
1809returned.
1411 1810
1412 # echo nop > /debug/tracing/current_tracer 1811 # echo nop > /debug/tracing/current_tracer
1413 # echo 10000 > /debug/tracing/buffer_size_kb 1812 # echo 10000 > /debug/tracing/buffer_size_kb
1414 # cat /debug/tracing/buffer_size_kb 1813 # cat /debug/tracing/buffer_size_kb
141510000 (units kilobytes) 181410000 (units kilobytes)
1416 1815
1417The number of pages which will be allocated is limited to a percentage 1816The number of pages which will be allocated is limited to a
1418of available memory. Allocating too much will produce an error. 1817percentage of available memory. Allocating too much will produce
1818an error.
1419 1819
1420 # echo 1000000000000 > /debug/tracing/buffer_size_kb 1820 # echo 1000000000000 > /debug/tracing/buffer_size_kb
1421-bash: echo: write error: Cannot allocate memory 1821-bash: echo: write error: Cannot allocate memory
1422 # cat /debug/tracing/buffer_size_kb 1822 # cat /debug/tracing/buffer_size_kb
142385 182385
1424 1824
1825-----------
1826
1827More details can be found in the source code, in the
1828kernel/tracing/*.c files.
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index b1b988701247..145c25a170c7 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -123,7 +123,10 @@ platform-specific implementation issue.
123 123
124Using GPIOs 124Using GPIOs
125----------- 125-----------
126One of the first things to do with a GPIO, often in board setup code when 126The first thing a system should do with a GPIO is allocate it, using
127the gpio_request() call; see later.
128
129One of the next things to do with a GPIO, often in board setup code when
127setting up a platform_device using the GPIO, is mark its direction: 130setting up a platform_device using the GPIO, is mark its direction:
128 131
129 /* set as input or output, returning 0 or negative errno */ 132 /* set as input or output, returning 0 or negative errno */
@@ -141,8 +144,8 @@ This helps avoid signal glitching during system startup.
141 144
142For compatibility with legacy interfaces to GPIOs, setting the direction 145For compatibility with legacy interfaces to GPIOs, setting the direction
143of a GPIO implicitly requests that GPIO (see below) if it has not been 146of a GPIO implicitly requests that GPIO (see below) if it has not been
144requested already. That compatibility may be removed in the future; 147requested already. That compatibility is being removed from the optional
145explicitly requesting GPIOs is strongly preferred. 148gpiolib framework.
146 149
147Setting the direction can fail if the GPIO number is invalid, or when 150Setting the direction can fail if the GPIO number is invalid, or when
148that particular GPIO can't be used in that mode. It's generally a bad 151that particular GPIO can't be used in that mode. It's generally a bad
@@ -195,7 +198,7 @@ This requires sleeping, which can't be done from inside IRQ handlers.
195 198
196Platforms that support this type of GPIO distinguish them from other GPIOs 199Platforms that support this type of GPIO distinguish them from other GPIOs
197by returning nonzero from this call (which requires a valid GPIO number, 200by returning nonzero from this call (which requires a valid GPIO number,
198either explicitly or implicitly requested): 201which should have been previously allocated with gpio_request):
199 202
200 int gpio_cansleep(unsigned gpio); 203 int gpio_cansleep(unsigned gpio);
201 204
@@ -212,10 +215,9 @@ for GPIOs that can't be accessed from IRQ handlers, these calls act the
212same as the spinlock-safe calls. 215same as the spinlock-safe calls.
213 216
214 217
215Claiming and Releasing GPIOs (OPTIONAL) 218Claiming and Releasing GPIOs
216--------------------------------------- 219----------------------------
217To help catch system configuration errors, two calls are defined. 220To help catch system configuration errors, two calls are defined.
218However, many platforms don't currently support this mechanism.
219 221
220 /* request GPIO, returning 0 or negative errno. 222 /* request GPIO, returning 0 or negative errno.
221 * non-null labels may be useful for diagnostics. 223 * non-null labels may be useful for diagnostics.
@@ -244,13 +246,6 @@ Some platforms may also use knowledge about what GPIOs are active for
244power management, such as by powering down unused chip sectors and, more 246power management, such as by powering down unused chip sectors and, more
245easily, gating off unused clocks. 247easily, gating off unused clocks.
246 248
247These two calls are optional because not all current Linux platforms
248offer such functionality in their GPIO support; a valid implementation
249could return success for all gpio_request() calls. Unlike the other calls,
250the state they represent doesn't normally match anything from a hardware
251register; it's just a software bitmap which clearly is not necessary for
252correct operation of hardware or (bug free) drivers.
253
254Note that requesting a GPIO does NOT cause it to be configured in any 249Note that requesting a GPIO does NOT cause it to be configured in any
255way; it just marks that GPIO as in use. Separate code must handle any 250way; it just marks that GPIO as in use. Separate code must handle any
256pin setup (e.g. controlling which pin the GPIO uses, pullup/pulldown). 251pin setup (e.g. controlling which pin the GPIO uses, pullup/pulldown).
diff --git a/Documentation/hwmon/ds1621 b/Documentation/hwmon/ds1621
index 1fee6f1e6bc5..5e97f333c4df 100644
--- a/Documentation/hwmon/ds1621
+++ b/Documentation/hwmon/ds1621
@@ -49,12 +49,9 @@ of up to +/- 0.5 degrees even when compared against precise temperature
49readings. Be sure to have a high vs. low temperature limit gap of at least 49readings. Be sure to have a high vs. low temperature limit gap of at least
501.0 degree Celsius to avoid Tout "bouncing", though! 501.0 degree Celsius to avoid Tout "bouncing", though!
51 51
52As for alarms, you can read the alarm status of the DS1621 via the 'alarms' 52The alarm bits are set when the high or low limits are met or exceeded and
53/sys file interface. The result consists mainly of bit 6 and 5 of the 53are reset by the module as soon as the respective temperature ranges are
54configuration register of the chip; bit 6 (0x40 or 64) is the high alarm 54left.
55bit and bit 5 (0x20 or 32) the low one. These bits are set when the high or
56low limits are met or exceeded and are reset by the module as soon as the
57respective temperature ranges are left.
58 55
59The alarm registers are in no way suitable to find out about the actual 56The alarm registers are in no way suitable to find out about the actual
60status of Tout. They will only tell you about its history, whether or not 57status of Tout. They will only tell you about its history, whether or not
@@ -64,45 +61,3 @@ with neither of the alarms set.
64 61
65Temperature conversion of the DS1621 takes up to 1000ms; internal access to 62Temperature conversion of the DS1621 takes up to 1000ms; internal access to
66non-volatile registers may last for 10ms or below. 63non-volatile registers may last for 10ms or below.
67
68High Accuracy Temperature Reading
69---------------------------------
70
71As said before, the temperature issued via the 9-bit i2c-bus data is
72somewhat arbitrary. Internally, the temperature conversion is of a
73different kind that is explained (not so...) well in the DS1621 data sheet.
74To cut the long story short: Inside the DS1621 there are two oscillators,
75both of them biassed by a temperature coefficient.
76
77Higher resolution of the temperature reading can be achieved using the
78internal projection, which means taking account of REG_COUNT and REG_SLOPE
79(the driver manages them):
80
81Taken from Dallas Semiconductors App Note 068: 'Increasing Temperature
82Resolution on the DS1620' and App Note 105: 'High Resolution Temperature
83Measurement with Dallas Direct-to-Digital Temperature Sensors'
84
85- Read the 9-bit temperature and strip the LSB (Truncate the .5 degs)
86- The resulting value is TEMP_READ.
87- Then, read REG_COUNT.
88- And then, REG_SLOPE.
89
90 TEMP = TEMP_READ - 0.25 + ((REG_SLOPE - REG_COUNT) / REG_SLOPE)
91
92Note that this is what the DONE bit in the DS1621 configuration register is
93good for: Internally, one temperature conversion takes up to 1000ms. Before
94that conversion is complete you will not be able to read valid things out
95of REG_COUNT and REG_SLOPE. The DONE bit, as you may have guessed by now,
96tells you whether the conversion is complete ("done", in plain English) and
97thus, whether the values you read are good or not.
98
99The DS1621 has two modes of operation: "Continuous" conversion, which can
100be understood as the default stand-alone mode where the chip gets the
101temperature and controls external devices via its Tout pin or tells other
102i2c's about it if they care. The other mode is called "1SHOT", that means
103that it only figures out about the temperature when it is explicitly told
104to do so; this can be seen as power saving mode.
105
106Now if you want to read REG_COUNT and REG_SLOPE, you have to either stop
107the continuous conversions until the contents of these registers are valid,
108or, in 1SHOT mode, you have to have one conversion made.
diff --git a/Documentation/hwmon/lis3lv02d b/Documentation/hwmon/lis3lv02d
index 287f8c902656..effe949a7282 100644
--- a/Documentation/hwmon/lis3lv02d
+++ b/Documentation/hwmon/lis3lv02d
@@ -1,11 +1,11 @@
1Kernel driver lis3lv02d 1Kernel driver lis3lv02d
2================== 2=======================
3 3
4Supported chips: 4Supported chips:
5 5
6 * STMicroelectronics LIS3LV02DL and LIS3LV02DQ 6 * STMicroelectronics LIS3LV02DL and LIS3LV02DQ
7 7
8Author: 8Authors:
9 Yan Burman <burman.yan@gmail.com> 9 Yan Burman <burman.yan@gmail.com>
10 Eric Piel <eric.piel@tremplin-utc.net> 10 Eric Piel <eric.piel@tremplin-utc.net>
11 11
@@ -15,7 +15,7 @@ Description
15 15
16This driver provides support for the accelerometer found in various HP 16This driver provides support for the accelerometer found in various HP
17laptops sporting the feature officially called "HP Mobile Data 17laptops sporting the feature officially called "HP Mobile Data
18Protection System 3D" or "HP 3D DriveGuard". It detect automatically 18Protection System 3D" or "HP 3D DriveGuard". It detects automatically
19laptops with this sensor. Known models (for now the HP 2133, nc6420, 19laptops with this sensor. Known models (for now the HP 2133, nc6420,
20nc2510, nc8510, nc84x0, nw9440 and nx9420) will have their axis 20nc2510, nc8510, nc84x0, nw9440 and nx9420) will have their axis
21automatically oriented in a standard way (e.g.: you can directly play 21automatically oriented in a standard way (e.g.: you can directly play
@@ -27,7 +27,7 @@ position - 3D position that the accelerometer reports. Format: "(x,y,z)"
27calibrate - read: values (x, y, z) that are used as the base for input 27calibrate - read: values (x, y, z) that are used as the base for input
28 class device operation. 28 class device operation.
29 write: forces the base to be recalibrated with the current 29 write: forces the base to be recalibrated with the current
30 position. 30 position.
31rate - reports the sampling rate of the accelerometer device in HZ 31rate - reports the sampling rate of the accelerometer device in HZ
32 32
33This driver also provides an absolute input class device, allowing 33This driver also provides an absolute input class device, allowing
@@ -48,7 +48,7 @@ For better compatibility between the various laptops. The values reported by
48the accelerometer are converted into a "standard" organisation of the axes 48the accelerometer are converted into a "standard" organisation of the axes
49(aka "can play neverball out of the box"): 49(aka "can play neverball out of the box"):
50 * When the laptop is horizontal the position reported is about 0 for X and Y 50 * When the laptop is horizontal the position reported is about 0 for X and Y
51and a positive value for Z 51 and a positive value for Z
52 * If the left side is elevated, X increases (becomes positive) 52 * If the left side is elevated, X increases (becomes positive)
53 * If the front side (where the touchpad is) is elevated, Y decreases 53 * If the front side (where the touchpad is) is elevated, Y decreases
54 (becomes negative) 54 (becomes negative)
@@ -59,3 +59,13 @@ email to the authors to add it to the database. When reporting a new
59laptop, please include the output of "dmidecode" plus the value of 59laptop, please include the output of "dmidecode" plus the value of
60/sys/devices/platform/lis3lv02d/position in these four cases. 60/sys/devices/platform/lis3lv02d/position in these four cases.
61 61
62Q&A
63---
64
65Q: How do I safely simulate freefall? I have an HP "portable
66workstation" which weighs about 3.5kg and a plastic case, so letting it
67fall to the ground is out of the question...
68
69A: The sensor is pretty sensitive, so your hands can do it. Lift it
70into free space, follow the fall with your hands for like 10
71centimeters. That should be enough to trigger the detection.
diff --git a/Documentation/hwmon/ltc4215 b/Documentation/hwmon/ltc4215
new file mode 100644
index 000000000000..2e6a21eb656c
--- /dev/null
+++ b/Documentation/hwmon/ltc4215
@@ -0,0 +1,50 @@
1Kernel driver ltc4215
2=====================
3
4Supported chips:
5 * Linear Technology LTC4215
6 Prefix: 'ltc4215'
7 Addresses scanned: 0x44
8 Datasheet:
9 http://www.linear.com/pc/downloadDocument.do?navId=H0,C1,C1003,C1006,C1163,P17572,D12697
10
11Author: Ira W. Snyder <iws@ovro.caltech.edu>
12
13
14Description
15-----------
16
17The LTC4215 controller allows a board to be safely inserted and removed
18from a live backplane.
19
20
21Usage Notes
22-----------
23
24This driver does not probe for LTC4215 devices, due to the fact that some
25of the possible addresses are unfriendly to probing. You will need to use
26the "force" parameter to tell the driver where to find the device.
27
28Example: the following will load the driver for an LTC4215 at address 0x44
29on I2C bus #0:
30$ modprobe ltc4215 force=0,0x44
31
32
33Sysfs entries
34-------------
35
36The LTC4215 has built-in limits for overvoltage, undervoltage, and
37undercurrent warnings. This makes it very likely that the reference
38circuit will be used.
39
40in1_input input voltage
41in2_input output voltage
42
43in1_min_alarm input undervoltage alarm
44in1_max_alarm input overvoltage alarm
45
46curr1_input current
47curr1_max_alarm overcurrent alarm
48
49power1_input power usage
50power1_alarm power bad alarm
diff --git a/Documentation/i2c/chips/pcf8591 b/Documentation/hwmon/pcf8591
index 5628fcf4207f..5628fcf4207f 100644
--- a/Documentation/i2c/chips/pcf8591
+++ b/Documentation/hwmon/pcf8591
diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface
index 6dbfd5efd991..2f10ce6a879f 100644
--- a/Documentation/hwmon/sysfs-interface
+++ b/Documentation/hwmon/sysfs-interface
@@ -365,6 +365,7 @@ energy[1-*]_input Cumulative energy use
365 Unit: microJoule 365 Unit: microJoule
366 RO 366 RO
367 367
368
368********** 369**********
369* Alarms * 370* Alarms *
370********** 371**********
@@ -453,6 +454,27 @@ beep_mask Bitmask for beep.
453 RW 454 RW
454 455
455 456
457***********************
458* Intrusion detection *
459***********************
460
461intrusion[0-*]_alarm
462 Chassis intrusion detection
463 0: OK
464 1: intrusion detected
465 RW
466 Contrary to regular alarm flags which clear themselves
467 automatically when read, this one sticks until cleared by
468 the user. This is done by writing 0 to the file. Writing
469 other values is unsupported.
470
471intrusion[0-*]_beep
472 Chassis intrusion beep
473 0: disable
474 1: enable
475 RW
476
477
456sysfs attribute writes interpretation 478sysfs attribute writes interpretation
457------------------------------------- 479-------------------------------------
458 480
diff --git a/Documentation/hwmon/w83627ehf b/Documentation/hwmon/w83627ehf
index d6e1ae30fa6e..b6eb59384bb3 100644
--- a/Documentation/hwmon/w83627ehf
+++ b/Documentation/hwmon/w83627ehf
@@ -2,30 +2,40 @@ Kernel driver w83627ehf
2======================= 2=======================
3 3
4Supported chips: 4Supported chips:
5 * Winbond W83627EHF/EHG/DHG (ISA access ONLY) 5 * Winbond W83627EHF/EHG (ISA access ONLY)
6 Prefix: 'w83627ehf' 6 Prefix: 'w83627ehf'
7 Addresses scanned: ISA address retrieved from Super I/O registers 7 Addresses scanned: ISA address retrieved from Super I/O registers
8 Datasheet: 8 Datasheet:
9 http://www.winbond-usa.com/products/winbond_products/pdfs/PCIC/W83627EHF_%20W83627EHGb.pdf 9 http://www.nuvoton.com.tw/NR/rdonlyres/A6A258F0-F0C9-4F97-81C0-C4D29E7E943E/0/W83627EHF.pdf
10 DHG datasheet confidential. 10 * Winbond W83627DHG
11 Prefix: 'w83627dhg'
12 Addresses scanned: ISA address retrieved from Super I/O registers
13 Datasheet:
14 http://www.nuvoton.com.tw/NR/rdonlyres/7885623D-A487-4CF9-A47F-30C5F73D6FE6/0/W83627DHG.pdf
15 * Winbond W83667HG
16 Prefix: 'w83667hg'
17 Addresses scanned: ISA address retrieved from Super I/O registers
18 Datasheet: not available
11 19
12Authors: 20Authors:
13 Jean Delvare <khali@linux-fr.org> 21 Jean Delvare <khali@linux-fr.org>
14 Yuan Mu (Winbond) 22 Yuan Mu (Winbond)
15 Rudolf Marek <r.marek@assembler.cz> 23 Rudolf Marek <r.marek@assembler.cz>
16 David Hubbard <david.c.hubbard@gmail.com> 24 David Hubbard <david.c.hubbard@gmail.com>
25 Gong Jun <JGong@nuvoton.com>
17 26
18Description 27Description
19----------- 28-----------
20 29
21This driver implements support for the Winbond W83627EHF, W83627EHG, and 30This driver implements support for the Winbond W83627EHF, W83627EHG,
22W83627DHG super I/O chips. We will refer to them collectively as Winbond chips. 31W83627DHG and W83667HG super I/O chips. We will refer to them collectively
32as Winbond chips.
23 33
24The chips implement three temperature sensors, five fan rotation 34The chips implement three temperature sensors, five fan rotation
25speed sensors, ten analog voltage sensors (only nine for the 627DHG), one 35speed sensors, ten analog voltage sensors (only nine for the 627DHG), one
26VID (6 pins for the 627EHF/EHG, 8 pins for the 627DHG), alarms with beep 36VID (6 pins for the 627EHF/EHG, 8 pins for the 627DHG and 667HG), alarms
27warnings (control unimplemented), and some automatic fan regulation 37with beep warnings (control unimplemented), and some automatic fan
28strategies (plus manual fan control mode). 38regulation strategies (plus manual fan control mode).
29 39
30Temperatures are measured in degrees Celsius and measurement resolution is 1 40Temperatures are measured in degrees Celsius and measurement resolution is 1
31degC for temp1 and 0.5 degC for temp2 and temp3. An alarm is triggered when 41degC for temp1 and 0.5 degC for temp2 and temp3. An alarm is triggered when
@@ -54,7 +64,8 @@ follows:
54temp1 -> pwm1 64temp1 -> pwm1
55temp2 -> pwm2 65temp2 -> pwm2
56temp3 -> pwm3 66temp3 -> pwm3
57prog -> pwm4 (the programmable setting is not supported by the driver) 67prog -> pwm4 (not on 667HG; the programmable setting is not supported by
68 the driver)
58 69
59/sys files 70/sys files
60---------- 71----------
diff --git a/Documentation/ia64/kvm.txt b/Documentation/ia64/kvm.txt
index 84f7cb3d5bec..ffb5c80bec3e 100644
--- a/Documentation/ia64/kvm.txt
+++ b/Documentation/ia64/kvm.txt
@@ -42,7 +42,7 @@ Note: For step 2, please make sure that host page size == TARGET_PAGE_SIZE of qe
42 hg clone http://xenbits.xensource.com/ext/efi-vfirmware.hg 42 hg clone http://xenbits.xensource.com/ext/efi-vfirmware.hg
43 you can get the firmware's binary in the directory of efi-vfirmware.hg/binaries. 43 you can get the firmware's binary in the directory of efi-vfirmware.hg/binaries.
44 44
45 (3) Rename the firware you owned to Flash.fd, and copy it to /usr/local/share/qemu 45 (3) Rename the firmware you owned to Flash.fd, and copy it to /usr/local/share/qemu
46 46
474. Boot up Linux or Windows guests: 474. Boot up Linux or Windows guests:
48 4.1 Create or install an image for guest boot. If you have xen experience, it should be easy. 48 4.1 Create or install an image for guest boot. If you have xen experience, it should be easy.
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index f1d639903325..1f779a25c703 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -122,10 +122,8 @@ Code Seq# Include File Comments
122'c' 00-7F linux/coda.h conflict! 122'c' 00-7F linux/coda.h conflict!
123'c' 80-9F arch/s390/include/asm/chsc.h 123'c' 80-9F arch/s390/include/asm/chsc.h
124'd' 00-FF linux/char/drm/drm/h conflict! 124'd' 00-FF linux/char/drm/drm/h conflict!
125'd' 00-DF linux/video_decoder.h conflict!
126'd' F0-FF linux/digi1.h 125'd' F0-FF linux/digi1.h
127'e' all linux/digi1.h conflict! 126'e' all linux/digi1.h conflict!
128'e' 00-1F linux/video_encoder.h conflict!
129'e' 00-1F net/irda/irtty.h conflict! 127'e' 00-1F net/irda/irtty.h conflict!
130'f' 00-1F linux/ext2_fs.h 128'f' 00-1F linux/ext2_fs.h
131'h' 00-7F Charon filesystem 129'h' 00-7F Charon filesystem
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index d1e2fcb6298b..6172e4360f60 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -50,6 +50,7 @@ parameter is applicable:
50 ISAPNP ISA PnP code is enabled. 50 ISAPNP ISA PnP code is enabled.
51 ISDN Appropriate ISDN support is enabled. 51 ISDN Appropriate ISDN support is enabled.
52 JOY Appropriate joystick support is enabled. 52 JOY Appropriate joystick support is enabled.
53 KMEMTRACE kmemtrace is enabled.
53 LIBATA Libata driver is enabled 54 LIBATA Libata driver is enabled
54 LP Printer support is enabled. 55 LP Printer support is enabled.
55 LOOP Loopback device support is enabled. 56 LOOP Loopback device support is enabled.
@@ -152,60 +153,6 @@ and is between 256 and 4096 characters. It is defined in the file
152 1,0: use 1st APIC table 153 1,0: use 1st APIC table
153 default: 0 154 default: 0
154 155
155 acpi_sleep= [HW,ACPI] Sleep options
156 Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig,
157 old_ordering, s4_nonvs }
158 See Documentation/power/video.txt for information on
159 s3_bios and s3_mode.
160 s3_beep is for debugging; it makes the PC's speaker beep
161 as soon as the kernel's real-mode entry point is called.
162 s4_nohwsig prevents ACPI hardware signature from being
163 used during resume from hibernation.
164 old_ordering causes the ACPI 1.0 ordering of the _PTS
165 control method, with respect to putting devices into
166 low power states, to be enforced (the ACPI 2.0 ordering
167 of _PTS is used by default).
168 s4_nonvs prevents the kernel from saving/restoring the
169 ACPI NVS memory during hibernation.
170
171 acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode
172 Format: { level | edge | high | low }
173
174 acpi_irq_balance [HW,ACPI]
175 ACPI will balance active IRQs
176 default in APIC mode
177
178 acpi_irq_nobalance [HW,ACPI]
179 ACPI will not move active IRQs (default)
180 default in PIC mode
181
182 acpi_irq_pci= [HW,ACPI] If irq_balance, clear listed IRQs for
183 use by PCI
184 Format: <irq>,<irq>...
185
186 acpi_irq_isa= [HW,ACPI] If irq_balance, mark listed IRQs used by ISA
187 Format: <irq>,<irq>...
188
189 acpi_no_auto_ssdt [HW,ACPI] Disable automatic loading of SSDT
190
191 acpi_os_name= [HW,ACPI] Tell ACPI BIOS the name of the OS
192 Format: To spoof as Windows 98: ="Microsoft Windows"
193
194 acpi_osi= [HW,ACPI] Modify list of supported OS interface strings
195 acpi_osi="string1" # add string1 -- only one string
196 acpi_osi="!string2" # remove built-in string2
197 acpi_osi= # disable all strings
198
199 acpi_serialize [HW,ACPI] force serialization of AML methods
200
201 acpi_skip_timer_override [HW,ACPI]
202 Recognize and ignore IRQ0/pin2 Interrupt Override.
203 For broken nForce2 BIOS resulting in XT-PIC timer.
204 acpi_use_timer_override [HW,ACPI]
205 Use timer override. For some broken Nvidia NF5 boards
206 that require a timer override, but don't have
207 HPET
208
209 acpi_backlight= [HW,ACPI] 156 acpi_backlight= [HW,ACPI]
210 acpi_backlight=vendor 157 acpi_backlight=vendor
211 acpi_backlight=video 158 acpi_backlight=video
@@ -213,11 +160,6 @@ and is between 256 and 4096 characters. It is defined in the file
213 (e.g. thinkpad_acpi, sony_acpi, etc.) instead 160 (e.g. thinkpad_acpi, sony_acpi, etc.) instead
214 of the ACPI video.ko driver. 161 of the ACPI video.ko driver.
215 162
216 acpi_display_output= [HW,ACPI]
217 acpi_display_output=vendor
218 acpi_display_output=video
219 See above.
220
221 acpi.debug_layer= [HW,ACPI,ACPI_DEBUG] 163 acpi.debug_layer= [HW,ACPI,ACPI_DEBUG]
222 acpi.debug_level= [HW,ACPI,ACPI_DEBUG] 164 acpi.debug_level= [HW,ACPI,ACPI_DEBUG]
223 Format: <int> 165 Format: <int>
@@ -246,6 +188,41 @@ and is between 256 and 4096 characters. It is defined in the file
246 unusable. The "log_buf_len" parameter may be useful 188 unusable. The "log_buf_len" parameter may be useful
247 if you need to capture more output. 189 if you need to capture more output.
248 190
191 acpi_display_output= [HW,ACPI]
192 acpi_display_output=vendor
193 acpi_display_output=video
194 See above.
195
196 acpi_irq_balance [HW,ACPI]
197 ACPI will balance active IRQs
198 default in APIC mode
199
200 acpi_irq_nobalance [HW,ACPI]
201 ACPI will not move active IRQs (default)
202 default in PIC mode
203
204 acpi_irq_isa= [HW,ACPI] If irq_balance, mark listed IRQs used by ISA
205 Format: <irq>,<irq>...
206
207 acpi_irq_pci= [HW,ACPI] If irq_balance, clear listed IRQs for
208 use by PCI
209 Format: <irq>,<irq>...
210
211 acpi_no_auto_ssdt [HW,ACPI] Disable automatic loading of SSDT
212
213 acpi_os_name= [HW,ACPI] Tell ACPI BIOS the name of the OS
214 Format: To spoof as Windows 98: ="Microsoft Windows"
215
216 acpi_osi= [HW,ACPI] Modify list of supported OS interface strings
217 acpi_osi="string1" # add string1 -- only one string
218 acpi_osi="!string2" # remove built-in string2
219 acpi_osi= # disable all strings
220
221 acpi_pm_good [X86-32,X86-64]
222 Override the pmtimer bug detection: force the kernel
223 to assume that this machine's pmtimer latches its value
224 and always returns good values.
225
249 acpi.power_nocheck= [HW,ACPI] 226 acpi.power_nocheck= [HW,ACPI]
250 Format: 1/0 enable/disable the check of power state. 227 Format: 1/0 enable/disable the check of power state.
251 On some bogus BIOS the _PSC object/_STA object of 228 On some bogus BIOS the _PSC object/_STA object of
@@ -254,26 +231,21 @@ and is between 256 and 4096 characters. It is defined in the file
254 power state again in power transition. 231 power state again in power transition.
255 1 : disable the power state check 232 1 : disable the power state check
256 233
257 acpi_pm_good [X86-32,X86-64] 234 acpi_enforce_resources= [ACPI]
258 Override the pmtimer bug detection: force the kernel 235 { strict | lax | no }
259 to assume that this machine's pmtimer latches its value 236 Check for resource conflicts between native drivers
260 and always returns good values. 237 and ACPI OperationRegions (SystemIO and SystemMemory
261 238 only). IO ports and memory declared in ACPI might be
262 agp= [AGP] 239 used by the ACPI subsystem in arbitrary AML code and
263 { off | try_unsupported } 240 can interfere with legacy drivers.
264 off: disable AGP support 241 strict (default): access to resources claimed by ACPI
265 try_unsupported: try to drive unsupported chipsets 242 is denied; legacy drivers trying to access reserved
266 (may crash computer or cause data corruption) 243 resources will fail to bind to device using them.
267 244 lax: access to resources claimed by ACPI is allowed;
268 enable_timer_pin_1 [i386,x86-64] 245 legacy drivers trying to access reserved resources
269 Enable PIN 1 of APIC timer 246 will bind successfully but a warning message is logged.
270 Can be useful to work around chipset bugs 247 no: ACPI OperationRegions are not marked as reserved,
271 (in particular on some ATI chipsets). 248 no further checks are performed.
272 The kernel tries to set a reasonable default.
273
274 disable_timer_pin_1 [i386,x86-64]
275 Disable PIN 1 of APIC timer
276 Can be useful to work around chipset bugs.
277 249
278 ad1848= [HW,OSS] 250 ad1848= [HW,OSS]
279 Format: <io>,<irq>,<dma>,<dma2>,<type> 251 Format: <io>,<irq>,<dma>,<dma2>,<type>
@@ -288,6 +260,12 @@ and is between 256 and 4096 characters. It is defined in the file
288 Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq> 260 Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq>
289 See also header of sound/oss/aedsp16.c. 261 See also header of sound/oss/aedsp16.c.
290 262
263 agp= [AGP]
264 { off | try_unsupported }
265 off: disable AGP support
266 try_unsupported: try to drive unsupported chipsets
267 (may crash computer or cause data corruption)
268
291 aha152x= [HW,SCSI] 269 aha152x= [HW,SCSI]
292 See Documentation/scsi/aha152x.txt. 270 See Documentation/scsi/aha152x.txt.
293 271
@@ -415,12 +393,6 @@ and is between 256 and 4096 characters. It is defined in the file
415 possible to determine what the correct size should be. 393 possible to determine what the correct size should be.
416 This option provides an override for these situations. 394 This option provides an override for these situations.
417 395
418 security= [SECURITY] Choose a security module to enable at boot.
419 If this boot parameter is not specified, only the first
420 security module asking for security registration will be
421 loaded. An invalid security module name will be treated
422 as if no module has been chosen.
423
424 capability.disable= 396 capability.disable=
425 [SECURITY] Disable capabilities. This would normally 397 [SECURITY] Disable capabilities. This would normally
426 be used only if an alternative security model is to be 398 be used only if an alternative security model is to be
@@ -492,14 +464,6 @@ and is between 256 and 4096 characters. It is defined in the file
492 Range: 0 - 8192 464 Range: 0 - 8192
493 Default: 64 465 Default: 64
494 466
495 hpet= [X86-32,HPET] option to control HPET usage
496 Format: { enable (default) | disable | force |
497 verbose }
498 disable: disable HPET and use PIT instead
499 force: allow force enabled of undocumented chips (ICH4,
500 VIA, nVidia)
501 verbose: show contents of HPET registers during setup
502
503 com20020= [HW,NET] ARCnet - COM20020 chipset 467 com20020= [HW,NET] ARCnet - COM20020 chipset
504 Format: 468 Format:
505 <io>[,<irq>[,<nodeID>[,<backplane>[,<ckp>[,<timeout>]]]]] 469 <io>[,<irq>[,<nodeID>[,<backplane>[,<ckp>[,<timeout>]]]]]
@@ -543,23 +507,6 @@ and is between 256 and 4096 characters. It is defined in the file
543 console=brl,ttyS0 507 console=brl,ttyS0
544 For now, only VisioBraille is supported. 508 For now, only VisioBraille is supported.
545 509
546 earlycon= [KNL] Output early console device and options.
547 uart[8250],io,<addr>[,options]
548 uart[8250],mmio,<addr>[,options]
549 Start an early, polled-mode console on the 8250/16550
550 UART at the specified I/O port or MMIO address.
551 The options are the same as for ttyS, above.
552
553 no_console_suspend
554 [HW] Never suspend the console
555 Disable suspending of consoles during suspend and
556 hibernate operations. Once disabled, debugging
557 messages can reach various consoles while the rest
558 of the system is being put to sleep (ie, while
559 debugging driver suspend/resume hooks). This may
560 not work reliably with all consoles, but is known
561 to work with serial and VGA consoles.
562
563 coredump_filter= 510 coredump_filter=
564 [KNL] Change the default value for 511 [KNL] Change the default value for
565 /proc/<pid>/coredump_filter. 512 /proc/<pid>/coredump_filter.
@@ -607,36 +554,22 @@ and is between 256 and 4096 characters. It is defined in the file
607 554
608 debug_objects [KNL] Enable object debugging 555 debug_objects [KNL] Enable object debugging
609 556
557 no_debug_objects
558 [KNL] Disable object debugging
559
610 debugpat [X86] Enable PAT debugging 560 debugpat [X86] Enable PAT debugging
611 561
612 decnet.addr= [HW,NET] 562 decnet.addr= [HW,NET]
613 Format: <area>[,<node>] 563 Format: <area>[,<node>]
614 See also Documentation/networking/decnet.txt. 564 See also Documentation/networking/decnet.txt.
615 565
616 vt.default_blu= [VT] 566 default_hugepagesz=
617 Format: <blue0>,<blue1>,<blue2>,...,<blue15> 567 [same as hugepagesz=] The size of the default
618 Change the default blue palette of the console. 568 HugeTLB page size. This is the size represented by
619 This is a 16-member array composed of values 569 the legacy /proc/ hugepages APIs, used for SHM, and
620 ranging from 0-255. 570 default size when mounting hugetlbfs filesystems.
621 571 Defaults to the default architecture's huge page size
622 vt.default_grn= [VT] 572 if not specified.
623 Format: <green0>,<green1>,<green2>,...,<green15>
624 Change the default green palette of the console.
625 This is a 16-member array composed of values
626 ranging from 0-255.
627
628 vt.default_red= [VT]
629 Format: <red0>,<red1>,<red2>,...,<red15>
630 Change the default red palette of the console.
631 This is a 16-member array composed of values
632 ranging from 0-255.
633
634 vt.default_utf8=
635 [VT]
636 Format=<0|1>
637 Set system-wide default UTF-8 mode for all tty's.
638 Default is 1, i.e. UTF-8 mode is enabled for all
639 newly opened terminals.
640 573
641 dhash_entries= [KNL] 574 dhash_entries= [KNL]
642 Set number of hash buckets for dentry cache. 575 Set number of hash buckets for dentry cache.
@@ -649,27 +582,9 @@ and is between 256 and 4096 characters. It is defined in the file
649 Documentation/serial/digiepca.txt. 582 Documentation/serial/digiepca.txt.
650 583
651 disable_mtrr_cleanup [X86] 584 disable_mtrr_cleanup [X86]
652 enable_mtrr_cleanup [X86]
653 The kernel tries to adjust MTRR layout from continuous 585 The kernel tries to adjust MTRR layout from continuous
654 to discrete, to make X server driver able to add WB 586 to discrete, to make X server driver able to add WB
655 entry later. This parameter enables/disables that. 587 entry later. This parameter disables that.
656
657 mtrr_chunk_size=nn[KMG] [X86]
658 used for mtrr cleanup. It is largest continous chunk
659 that could hold holes aka. UC entries.
660
661 mtrr_gran_size=nn[KMG] [X86]
662 Used for mtrr cleanup. It is granularity of mtrr block.
663 Default is 1.
664 Large value could prevent small alignment from
665 using up MTRRs.
666
667 mtrr_spare_reg_nr=n [X86]
668 Format: <integer>
669 Range: 0,7 : spare reg number
670 Default : 1
671 Used for mtrr cleanup. It is spare mtrr entries number.
672 Set to 2 or more if your graphical card needs more.
673 588
674 disable_mtrr_trim [X86, Intel and AMD only] 589 disable_mtrr_trim [X86, Intel and AMD only]
675 By default the kernel will trim any uncacheable 590 By default the kernel will trim any uncacheable
@@ -677,12 +592,38 @@ and is between 256 and 4096 characters. It is defined in the file
677 MTRR settings. This parameter disables that behavior, 592 MTRR settings. This parameter disables that behavior,
678 possibly causing your machine to run very slowly. 593 possibly causing your machine to run very slowly.
679 594
595 disable_timer_pin_1 [i386,x86-64]
596 Disable PIN 1 of APIC timer
597 Can be useful to work around chipset bugs.
598
680 dmasound= [HW,OSS] Sound subsystem buffers 599 dmasound= [HW,OSS] Sound subsystem buffers
681 600
601 dma_debug=off If the kernel is compiled with DMA_API_DEBUG support,
602 this option disables the debugging code at boot.
603
604 dma_debug_entries=<number>
605 This option allows tuning the number of preallocated
606 entries for DMA-API debugging code. One entry is
607 required per DMA-API allocation. Use this if the
608 DMA-API debugging code disables itself because the
609 architectural default is too low.
610
682 dscc4.setup= [NET] 611 dscc4.setup= [NET]
683 612
684 dtc3181e= [HW,SCSI] 613 dtc3181e= [HW,SCSI]
685 614
615 dynamic_printk Enables pr_debug()/dev_dbg() calls if
616 CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled.
617 These can also be switched on/off via
618 <debugfs>/dynamic_printk/modules
619
620 earlycon= [KNL] Output early console device and options.
621 uart[8250],io,<addr>[,options]
622 uart[8250],mmio,<addr>[,options]
623 Start an early, polled-mode console on the 8250/16550
624 UART at the specified I/O port or MMIO address.
625 The options are the same as for ttyS, above.
626
686 earlyprintk= [X86-32,X86-64,SH,BLACKFIN] 627 earlyprintk= [X86-32,X86-64,SH,BLACKFIN]
687 earlyprintk=vga 628 earlyprintk=vga
688 earlyprintk=serial[,ttySn[,baudrate]] 629 earlyprintk=serial[,ttySn[,baudrate]]
@@ -724,6 +665,17 @@ and is between 256 and 4096 characters. It is defined in the file
724 pass this option to capture kernel. 665 pass this option to capture kernel.
725 See Documentation/kdump/kdump.txt for details. 666 See Documentation/kdump/kdump.txt for details.
726 667
668 enable_mtrr_cleanup [X86]
669 The kernel tries to adjust MTRR layout from continuous
670 to discrete, to make X server driver able to add WB
671 entry later. This parameter enables that.
672
673 enable_timer_pin_1 [i386,x86-64]
674 Enable PIN 1 of APIC timer
675 Can be useful to work around chipset bugs
676 (in particular on some ATI chipsets).
677 The kernel tries to set a reasonable default.
678
727 enforcing [SELINUX] Set initial enforcing status. 679 enforcing [SELINUX] Set initial enforcing status.
728 Format: {"0" | "1"} 680 Format: {"0" | "1"}
729 See security/selinux/Kconfig help text. 681 See security/selinux/Kconfig help text.
@@ -811,6 +763,16 @@ and is between 256 and 4096 characters. It is defined in the file
811 hisax= [HW,ISDN] 763 hisax= [HW,ISDN]
812 See Documentation/isdn/README.HiSax. 764 See Documentation/isdn/README.HiSax.
813 765
766 hlt [BUGS=ARM,SH]
767
768 hpet= [X86-32,HPET] option to control HPET usage
769 Format: { enable (default) | disable | force |
770 verbose }
771 disable: disable HPET and use PIT instead
772 force: allow force-enabling of undocumented chips (ICH4,
773 VIA, nVidia)
774 verbose: show contents of HPET registers during setup
775
814 hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot. 776 hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot.
815 hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages. 777 hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages.
816 On x86-64 and powerpc, this option can be specified 778 On x86-64 and powerpc, this option can be specified
@@ -820,15 +782,6 @@ and is between 256 and 4096 characters. It is defined in the file
820 (when the CPU supports the "pdpe1gb" cpuinfo flag) 782 (when the CPU supports the "pdpe1gb" cpuinfo flag)
821 Note that 1GB pages can only be allocated at boot time 783 Note that 1GB pages can only be allocated at boot time
822 using hugepages= and not freed afterwards. 784 using hugepages= and not freed afterwards.
823 default_hugepagesz=
824 [same as hugepagesz=] The size of the default
825 HugeTLB page size. This is the size represented by
826 the legacy /proc/ hugepages APIs, used for SHM, and
827 default size when mounting hugetlbfs filesystems.
828 Defaults to the default architecture's huge page size
829 if not specified.
830
831 hlt [BUGS=ARM,SH]
832 785
833 hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) 786 hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC)
834 terminal devices. Valid values: 0..8 787 terminal devices. Valid values: 0..8
@@ -889,6 +842,9 @@ and is between 256 and 4096 characters. It is defined in the file
889 idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed 842 idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed
890 See Documentation/ide/ide.txt. 843 See Documentation/ide/ide.txt.
891 844
845 ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
846 Claim all unknown PCI IDE storage controllers.
847
892 idle= [X86] 848 idle= [X86]
893 Format: idle=poll, idle=mwait, idle=halt, idle=nomwait 849 Format: idle=poll, idle=mwait, idle=halt, idle=nomwait
894 Poll forces a polling idle loop that can slightly 850 Poll forces a polling idle loop that can slightly
@@ -904,9 +860,6 @@ and is between 256 and 4096 characters. It is defined in the file
904 In such case C2/C3 won't be used again. 860 In such case C2/C3 won't be used again.
905 idle=nomwait: Disable mwait for CPU C-states 861 idle=nomwait: Disable mwait for CPU C-states
906 862
907 ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
908 Claim all unknown PCI IDE storage controllers.
909
910 ignore_loglevel [KNL] 863 ignore_loglevel [KNL]
911 Ignore loglevel setting - this will print /all/ 864 Ignore loglevel setting - this will print /all/
912 kernel messages to the console. Useful for debugging. 865 kernel messages to the console. Useful for debugging.
@@ -940,25 +893,6 @@ and is between 256 and 4096 characters. It is defined in the file
940 inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver 893 inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver
941 Format: <irq> 894 Format: <irq>
942 895
943 inttest= [IA64]
944
945 iomem= Disable strict checking of access to MMIO memory
946 strict regions from userspace.
947 relaxed
948
949 iommu= [x86]
950 off
951 force
952 noforce
953 biomerge
954 panic
955 nopanic
956 merge
957 nomerge
958 forcesac
959 soft
960
961
962 intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option 896 intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option
963 on 897 on
964 Enable intel iommu driver. 898 Enable intel iommu driver.
@@ -982,6 +916,28 @@ and is between 256 and 4096 characters. It is defined in the file
982 result in a hardware IOTLB flush operation as opposed 916 result in a hardware IOTLB flush operation as opposed
983 to batching them for performance. 917 to batching them for performance.
984 918
919 inttest= [IA64]
920
921 iomem= Disable strict checking of access to MMIO memory
922 strict regions from userspace.
923 relaxed
924
925 iommu= [x86]
926 off
927 force
928 noforce
929 biomerge
930 panic
931 nopanic
932 merge
933 nomerge
934 forcesac
935 soft
936
937 io7= [HW] IO7 for Marvel based alpha systems
938 See comment before marvel_specify_io7 in
939 arch/alpha/kernel/core_marvel.c.
940
985 io_delay= [X86-32,X86-64] I/O delay method 941 io_delay= [X86-32,X86-64] I/O delay method
986 0x80 942 0x80
987 Standard port 0x80 based delay 943 Standard port 0x80 based delay
@@ -992,10 +948,6 @@ and is between 256 and 4096 characters. It is defined in the file
992 none 948 none
993 No delay 949 No delay
994 950
995 io7= [HW] IO7 for Marvel based alpha systems
996 See comment before marvel_specify_io7 in
997 arch/alpha/kernel/core_marvel.c.
998
999 ip= [IP_PNP] 951 ip= [IP_PNP]
1000 See Documentation/filesystems/nfsroot.txt. 952 See Documentation/filesystems/nfsroot.txt.
1001 953
@@ -1006,12 +958,6 @@ and is between 256 and 4096 characters. It is defined in the file
1006 ips= [HW,SCSI] Adaptec / IBM ServeRAID controller 958 ips= [HW,SCSI] Adaptec / IBM ServeRAID controller
1007 See header of drivers/scsi/ips.c. 959 See header of drivers/scsi/ips.c.
1008 960
1009 ports= [IP_VS_FTP] IPVS ftp helper module
1010 Default is 21.
1011 Up to 8 (IP_VS_APP_MAX_PORTS) ports
1012 may be specified.
1013 Format: <port>,<port>....
1014
1015 irqfixup [HW] 961 irqfixup [HW]
1016 When an interrupt is not handled search all handlers 962 When an interrupt is not handled search all handlers
1017 for it. Intended to get systems with badly broken 963 for it. Intended to get systems with badly broken
@@ -1052,6 +998,8 @@ and is between 256 and 4096 characters. It is defined in the file
1052 js= [HW,JOY] Analog joystick 998 js= [HW,JOY] Analog joystick
1053 See Documentation/input/joystick.txt. 999 See Documentation/input/joystick.txt.
1054 1000
1001 keepinitrd [HW,ARM]
1002
1055 kernelcore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter 1003 kernelcore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter
1056 specifies the amount of memory usable by the kernel 1004 specifies the amount of memory usable by the kernel
1057 for non-movable allocations. The requested amount is 1005 for non-movable allocations. The requested amount is
@@ -1068,20 +1016,14 @@ and is between 256 and 4096 characters. It is defined in the file
1068 use the HighMem zone if it exists, and the Normal 1016 use the HighMem zone if it exists, and the Normal
1069 zone if it does not. 1017 zone if it does not.
1070 1018
1071 movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter 1019 kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no }
1072 is similar to kernelcore except it specifies the 1020 Controls whether kmemtrace is enabled
1073 amount of memory used for migratable allocations. 1021 at boot-time.
1074 If both kernelcore and movablecore is specified,
1075 then kernelcore will be at *least* the specified
1076 value but may be more. If movablecore on its own
1077 is specified, the administrator must be careful
1078 that the amount of memory usable for all allocations
1079 is not too small.
1080
1081 keepinitrd [HW,ARM]
1082 1022
1083 kstack=N [X86-32,X86-64] Print N words from the kernel stack 1023 kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of
1084 in oops dumps. 1024 subbufs kmemtrace's relay channel has. Set this
1025 higher than default (KMEMTRACE_N_SUBBUFS in code) if
1026 you experience buffer overruns.
1085 1027
1086 kgdboc= [HW] kgdb over consoles. 1028 kgdboc= [HW] kgdb over consoles.
1087 Requires a tty driver that supports console polling. 1029 Requires a tty driver that supports console polling.
@@ -1092,6 +1034,9 @@ and is between 256 and 4096 characters. It is defined in the file
1092 Configure the RouterBoard 532 series on-chip 1034 Configure the RouterBoard 532 series on-chip
1093 Ethernet adapter MAC address. 1035 Ethernet adapter MAC address.
1094 1036
1037 kstack=N [X86-32,X86-64] Print N words from the kernel stack
1038 in oops dumps.
1039
1095 l2cr= [PPC] 1040 l2cr= [PPC]
1096 1041
1097 l3cr= [PPC] 1042 l3cr= [PPC]
@@ -1237,9 +1182,8 @@ and is between 256 and 4096 characters. It is defined in the file
1237 (machvec) in a generic kernel. 1182 (machvec) in a generic kernel.
1238 Example: machvec=hpzx1_swiotlb 1183 Example: machvec=hpzx1_swiotlb
1239 1184
1240 max_loop= [LOOP] Maximum number of loopback devices that can 1185 max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater
1241 be mounted 1186 than or equal to this physical address is ignored.
1242 Format: <1-256>
1243 1187
1244 maxcpus= [SMP] Maximum number of processors that an SMP kernel 1188 maxcpus= [SMP] Maximum number of processors that an SMP kernel
1245 should make use of. maxcpus=n : n >= 0 limits the 1189 should make use of. maxcpus=n : n >= 0 limits the
@@ -1247,8 +1191,9 @@ and is between 256 and 4096 characters. It is defined in the file
1247 it is equivalent to "nosmp", which also disables 1191 it is equivalent to "nosmp", which also disables
1248 the IO APIC. 1192 the IO APIC.
1249 1193
1250 max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater than 1194 max_loop= [LOOP] Maximum number of loopback devices that can
1251 or equal to this physical address is ignored. 1195 be mounted
1196 Format: <1-256>
1252 1197
1253 max_luns= [SCSI] Maximum number of LUNs to probe. 1198 max_luns= [SCSI] Maximum number of LUNs to probe.
1254 Should be between 1 and 2^32-1. 1199 Should be between 1 and 2^32-1.
@@ -1375,6 +1320,16 @@ and is between 256 and 4096 characters. It is defined in the file
1375 mousedev.yres= [MOUSE] Vertical screen resolution, used for devices 1320 mousedev.yres= [MOUSE] Vertical screen resolution, used for devices
1376 reporting absolute coordinates, such as tablets 1321 reporting absolute coordinates, such as tablets
1377 1322
1323 movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter
1324 is similar to kernelcore except it specifies the
1325 amount of memory used for migratable allocations.
1326 If both kernelcore and movablecore are specified,
1327 then kernelcore will be at *least* the specified
1328 value but may be more. If movablecore on its own
1329 is specified, the administrator must be careful
1330 that the amount of memory usable for all allocations
1331 is not too small.
1332
1378 mpu401= [HW,OSS] 1333 mpu401= [HW,OSS]
1379 Format: <io>,<irq> 1334 Format: <io>,<irq>
1380 1335
@@ -1396,6 +1351,23 @@ and is between 256 and 4096 characters. It is defined in the file
1396 [HW] Make the MicroTouch USB driver use raw coordinates 1351 [HW] Make the MicroTouch USB driver use raw coordinates
1397 ('y', default) or cooked coordinates ('n') 1352 ('y', default) or cooked coordinates ('n')
1398 1353
1354 mtrr_chunk_size=nn[KMG] [X86]
1355 Used for mtrr cleanup. It is the largest continuous chunk
1356 that could hold holes aka. UC entries.
1357
1358 mtrr_gran_size=nn[KMG] [X86]
1359 Used for mtrr cleanup. It is granularity of mtrr block.
1360 Default is 1.
1361 Large value could prevent small alignment from
1362 using up MTRRs.
1363
1364 mtrr_spare_reg_nr=n [X86]
1365 Format: <integer>
1366 Range: 0,7 : spare reg number
1367 Default : 1
1368 Used for mtrr cleanup. It is the number of spare mtrr entries.
1369 Set to 2 or more if your graphical card needs more.
1370
1399 n2= [NET] SDL Inc. RISCom/N2 synchronous serial card 1371 n2= [NET] SDL Inc. RISCom/N2 synchronous serial card
1400 1372
1401 NCR_D700= [HW,SCSI] 1373 NCR_D700= [HW,SCSI]
@@ -1456,11 +1428,13 @@ and is between 256 and 4096 characters. It is defined in the file
1456 0 - turn nmi_watchdog off 1428 0 - turn nmi_watchdog off
1457 1 - use the IO-APIC timer for the NMI watchdog 1429 1 - use the IO-APIC timer for the NMI watchdog
1458 2 - use the local APIC for the NMI watchdog using 1430 2 - use the local APIC for the NMI watchdog using
1459 a performance counter. Note: This will use one performance 1431 a performance counter. Note: This will use one
1460 counter and the local APIC's performance vector. 1432 performance counter and the local APIC's performance
1461 When panic is specified panic when an NMI watchdog timeout occurs. 1433 vector.
1462 This is useful when you use a panic=... timeout and need the box 1434 When panic is specified, panic when an NMI watchdog
1463 quickly up again. 1435 timeout occurs.
1436 This is useful when you use a panic=... timeout and
1437 need the box quickly up again.
1464 Instead of 1 and 2 it is possible to use the following 1438 Instead of 1 and 2 it is possible to use the following
1465 symbolic names: lapic and ioapic 1439 symbolic names: lapic and ioapic
1466 Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic 1440 Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic
@@ -1469,6 +1443,16 @@ and is between 256 and 4096 characters. It is defined in the file
1469 emulation library even if a 387 maths coprocessor 1443 emulation library even if a 387 maths coprocessor
1470 is present. 1444 is present.
1471 1445
1446 no_console_suspend
1447 [HW] Never suspend the console
1448 Disable suspending of consoles during suspend and
1449 hibernate operations. Once disabled, debugging
1450 messages can reach various consoles while the rest
1451 of the system is being put to sleep (i.e., while
1452 debugging driver suspend/resume hooks). This may
1453 not work reliably with all consoles, but is known
1454 to work with serial and VGA consoles.
1455
1472 noaliencache [MM, NUMA, SLAB] Disables the allocation of alien 1456 noaliencache [MM, NUMA, SLAB] Disables the allocation of alien
1473 caches in the slab allocator. Saves per-node memory, 1457 caches in the slab allocator. Saves per-node memory,
1474 but will impact performance. 1458 but will impact performance.
@@ -1483,6 +1467,8 @@ and is between 256 and 4096 characters. It is defined in the file
1483 1467
1484 nocache [ARM] 1468 nocache [ARM]
1485 1469
1470 noclflush [BUGS=X86] Don't use the CLFLUSH instruction
1471
1486 nodelayacct [KNL] Disable per-task delay accounting 1472 nodelayacct [KNL] Disable per-task delay accounting
1487 1473
1488 nodisconnect [HW,SCSI,M68K] Disables SCSI disconnects. 1474 nodisconnect [HW,SCSI,M68K] Disables SCSI disconnects.
@@ -1511,9 +1497,9 @@ and is between 256 and 4096 characters. It is defined in the file
1511 register save and restore. The kernel will only save 1497 register save and restore. The kernel will only save
1512 legacy floating-point registers on task switch. 1498 legacy floating-point registers on task switch.
1513 1499
1514 noclflush [BUGS=X86] Don't use the CLFLUSH instruction 1500 nohlt [BUGS=ARM,SH] Tells the kernel that the sleep(SH) or
1515 1501 wfi(ARM) instruction doesn't work correctly and not to
1516 nohlt [BUGS=ARM,SH] 1502 use it. This is also useful when using JTAG debugger.
1517 1503
1518 no-hlt [BUGS=X86-32] Tells the kernel that the hlt 1504 no-hlt [BUGS=X86-32] Tells the kernel that the hlt
1519 instruction doesn't work correctly and not to 1505 instruction doesn't work correctly and not to
@@ -1534,6 +1520,8 @@ and is between 256 and 4096 characters. It is defined in the file
1534 Valid arguments: on, off 1520 Valid arguments: on, off
1535 Default: on 1521 Default: on
1536 1522
1523 noiotrap [SH] Disables trapped I/O port accesses.
1524
1537 noirqdebug [X86-32] Disables the code which attempts to detect and 1525 noirqdebug [X86-32] Disables the code which attempts to detect and
1538 disable unhandled interrupt sources. 1526 disable unhandled interrupt sources.
1539 1527
@@ -1553,12 +1541,6 @@ and is between 256 and 4096 characters. It is defined in the file
1553 1541
1554 nolapic_timer [X86-32,APIC] Do not use the local APIC timer. 1542 nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
1555 1543
1556 nox2apic [X86-64,APIC] Do not enable x2APIC mode.
1557
1558 x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
1559 default x2apic cluster mode on platforms
1560 supporting x2apic.
1561
1562 noltlbs [PPC] Do not use large page/tlb entries for kernel 1544 noltlbs [PPC] Do not use large page/tlb entries for kernel
1563 lowmem mapping on PPC40x. 1545 lowmem mapping on PPC40x.
1564 1546
@@ -1569,6 +1551,9 @@ and is between 256 and 4096 characters. It is defined in the file
1569 nomfgpt [X86-32] Disable Multi-Function General Purpose 1551 nomfgpt [X86-32] Disable Multi-Function General Purpose
1570 Timer usage (for AMD Geode machines). 1552 Timer usage (for AMD Geode machines).
1571 1553
1554 norandmaps Don't use address space randomization. Equivalent to
1555 echo 0 > /proc/sys/kernel/randomize_va_space
1556
1572 noreplace-paravirt [X86-32,PV_OPS] Don't patch paravirt_ops 1557 noreplace-paravirt [X86-32,PV_OPS] Don't patch paravirt_ops
1573 1558
1574 noreplace-smp [X86-32,SMP] Don't replace SMP instructions 1559 noreplace-smp [X86-32,SMP] Don't replace SMP instructions
@@ -1593,7 +1578,7 @@ and is between 256 and 4096 characters. It is defined in the file
1593 nosoftlockup [KNL] Disable the soft-lockup detector. 1578 nosoftlockup [KNL] Disable the soft-lockup detector.
1594 1579
1595 noswapaccount [KNL] Disable accounting of swap in memory resource 1580 noswapaccount [KNL] Disable accounting of swap in memory resource
1596 controller. (See Documentation/controllers/memory.txt) 1581 controller. (See Documentation/cgroups/memory.txt)
1597 1582
1598 nosync [HW,M68K] Disables sync negotiation for all devices. 1583 nosync [HW,M68K] Disables sync negotiation for all devices.
1599 1584
@@ -1607,13 +1592,13 @@ and is between 256 and 4096 characters. It is defined in the file
1607 purges which is reported from either PAL_VM_SUMMARY or 1592 purges which is reported from either PAL_VM_SUMMARY or
1608 SAL PALO. 1593 SAL PALO.
1609 1594
1595 nr_uarts= [SERIAL] maximum number of UARTs to be registered.
1596
1610 numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. 1597 numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
1611 one of ['zone', 'node', 'default'] can be specified 1598 one of ['zone', 'node', 'default'] can be specified
1612 This can be set from sysctl after boot. 1599 This can be set from sysctl after boot.
1613 See Documentation/sysctl/vm.txt for details. 1600 See Documentation/sysctl/vm.txt for details.
1614 1601
1615 nr_uarts= [SERIAL] maximum number of UARTs to be registered.
1616
1617 ohci1394_dma=early [HW] enable debugging via the ohci1394 driver. 1602 ohci1394_dma=early [HW] enable debugging via the ohci1394 driver.
1618 See Documentation/debugging-via-ohci1394.txt for more 1603 See Documentation/debugging-via-ohci1394.txt for more
1619 info. 1604 info.
@@ -1685,6 +1670,8 @@ and is between 256 and 4096 characters. It is defined in the file
1685 See also Documentation/blockdev/paride.txt. 1670 See also Documentation/blockdev/paride.txt.
1686 1671
1687 pci=option[,option...] [PCI] various PCI subsystem options: 1672 pci=option[,option...] [PCI] various PCI subsystem options:
1673 earlydump [X86] dump PCI config space before the kernel
1674 changes anything
1688 off [X86] don't probe for the PCI bus 1675 off [X86] don't probe for the PCI bus
1689 bios [X86-32] force use of PCI BIOS, don't access 1676 bios [X86-32] force use of PCI BIOS, don't access
1690 the hardware directly. Use this if your machine 1677 the hardware directly. Use this if your machine
@@ -1784,6 +1771,15 @@ and is between 256 and 4096 characters. It is defined in the file
1784 cbmemsize=nn[KMG] The fixed amount of bus space which is 1771 cbmemsize=nn[KMG] The fixed amount of bus space which is
1785 reserved for the CardBus bridge's memory 1772 reserved for the CardBus bridge's memory
1786 window. The default value is 64 megabytes. 1773 window. The default value is 64 megabytes.
1774 resource_alignment=
1775 Format:
1776 [<order of align>@][<domain>:]<bus>:<slot>.<func>[; ...]
1777 Specifies alignment and device to reassign
1778 aligned memory resources.
1779 If <order of align> is not specified,
1780 PAGE_SIZE is used as alignment.
1781 PCI-PCI bridge can be specified, if resource
1782 windows need to be expanded.
1787 1783
1788 pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power 1784 pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power
1789 Management. 1785 Management.
@@ -1851,6 +1847,14 @@ and is between 256 and 4096 characters. It is defined in the file
1851 printk.time= Show timing data prefixed to each printk message line 1847 printk.time= Show timing data prefixed to each printk message line
1852 Format: <bool> (1/Y/y=enable, 0/N/n=disable) 1848 Format: <bool> (1/Y/y=enable, 0/N/n=disable)
1853 1849
1850 processor.max_cstate= [HW,ACPI]
1851 Limit processor to maximum C-state
1852 max_cstate=9 overrides any DMI blacklist limit.
1853
1854 processor.nocst [HW,ACPI]
1855 Ignore the _CST method to determine C-states,
1856 instead using the legacy FADT method
1857
1854 profile= [KNL] Enable kernel profiling via /proc/profile 1858 profile= [KNL] Enable kernel profiling via /proc/profile
1855 Format: [schedule,]<number> 1859 Format: [schedule,]<number>
1856 Param: "schedule" - profile schedule points. 1860 Param: "schedule" - profile schedule points.
@@ -1860,14 +1864,6 @@ and is between 256 and 4096 characters. It is defined in the file
1860 Requires CONFIG_SCHEDSTATS 1864 Requires CONFIG_SCHEDSTATS
1861 Param: "kvm" - profile VM exits. 1865 Param: "kvm" - profile VM exits.
1862 1866
1863 processor.max_cstate= [HW,ACPI]
1864 Limit processor to maximum C-state
1865 max_cstate=9 overrides any DMI blacklist limit.
1866
1867 processor.nocst [HW,ACPI]
1868 Ignore the _CST method to determine C-states,
1869 instead using the legacy FADT method
1870
1871 prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk 1867 prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk
1872 before loading. 1868 before loading.
1873 See Documentation/blockdev/ramdisk.txt. 1869 See Documentation/blockdev/ramdisk.txt.
@@ -1932,7 +1928,7 @@ and is between 256 and 4096 characters. It is defined in the file
1932 1928
1933 relax_domain_level= 1929 relax_domain_level=
1934 [KNL, SMP] Set scheduler's default relax_domain_level. 1930 [KNL, SMP] Set scheduler's default relax_domain_level.
1935 See Documentation/cpusets.txt. 1931 See Documentation/cgroups/cpusets.txt.
1936 1932
1937 reserve= [KNL,BUGS] Force the kernel to ignore some iomem area 1933 reserve= [KNL,BUGS] Force the kernel to ignore some iomem area
1938 1934
@@ -2021,7 +2017,13 @@ and is between 256 and 4096 characters. It is defined in the file
2021 allowing boot to proceed. none ignores them, expecting 2017 allowing boot to proceed. none ignores them, expecting
2022 user space to do the scan. 2018 user space to do the scan.
2023 2019
2024 selinux [SELINUX] Disable or enable SELinux at boot time. 2020 security= [SECURITY] Choose a security module to enable at boot.
2021 If this boot parameter is not specified, only the first
2022 security module asking for security registration will be
2023 loaded. An invalid security module name will be treated
2024 as if no module has been chosen.
2025
2026 selinux= [SELINUX] Disable or enable SELinux at boot time.
2025 Format: { "0" | "1" } 2027 Format: { "0" | "1" }
2026 See security/selinux/Kconfig help text. 2028 See security/selinux/Kconfig help text.
2027 0 -- disable. 2029 0 -- disable.
@@ -2030,15 +2032,6 @@ and is between 256 and 4096 characters. It is defined in the file
2030 If enabled at boot time, /selinux/disable can be used 2032 If enabled at boot time, /selinux/disable can be used
2031 later to disable prior to initial policy load. 2033 later to disable prior to initial policy load.
2032 2034
2033 selinux_compat_net =
2034 [SELINUX] Set initial selinux_compat_net flag value.
2035 Format: { "0" | "1" }
2036 0 -- use new secmark-based packet controls
2037 1 -- use legacy packet controls
2038 Default value is 0 (preferred).
2039 Value can be changed at runtime via
2040 /selinux/compat_net.
2041
2042 serialnumber [BUGS=X86-32] 2035 serialnumber [BUGS=X86-32]
2043 2036
2044 shapers= [NET] 2037 shapers= [NET]
@@ -2350,6 +2343,8 @@ and is between 256 and 4096 characters. It is defined in the file
2350 2343
2351 tp720= [HW,PS2] 2344 tp720= [HW,PS2]
2352 2345
2346 trace_buf_size=nn[KMG] [ftrace] will set tracing buffer size.
2347
2353 trix= [HW,OSS] MediaTrix AudioTrix Pro 2348 trix= [HW,OSS] MediaTrix AudioTrix Pro
2354 Format: 2349 Format:
2355 <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq> 2350 <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
@@ -2452,9 +2447,6 @@ and is between 256 and 4096 characters. It is defined in the file
2452 medium is write-protected). 2447 medium is write-protected).
2453 Example: quirks=0419:aaf5:rl,0421:0433:rc 2448 Example: quirks=0419:aaf5:rl,0421:0433:rc
2454 2449
2455 add_efi_memmap [EFI; x86-32,X86-64] Include EFI memory map in
2456 kernel's map of available physical RAM.
2457
2458 vdso= [X86-32,SH,x86-64] 2450 vdso= [X86-32,SH,x86-64]
2459 vdso=2: enable compat VDSO (default with COMPAT_VDSO) 2451 vdso=2: enable compat VDSO (default with COMPAT_VDSO)
2460 vdso=1: enable VDSO (default) 2452 vdso=1: enable VDSO (default)
@@ -2493,6 +2485,31 @@ and is between 256 and 4096 characters. It is defined in the file
2493 vmpoff= [KNL,S390] Perform z/VM CP command after power off. 2485 vmpoff= [KNL,S390] Perform z/VM CP command after power off.
2494 Format: <command> 2486 Format: <command>
2495 2487
2488 vt.default_blu= [VT]
2489 Format: <blue0>,<blue1>,<blue2>,...,<blue15>
2490 Change the default blue palette of the console.
2491 This is a 16-member array composed of values
2492 ranging from 0-255.
2493
2494 vt.default_grn= [VT]
2495 Format: <green0>,<green1>,<green2>,...,<green15>
2496 Change the default green palette of the console.
2497 This is a 16-member array composed of values
2498 ranging from 0-255.
2499
2500 vt.default_red= [VT]
2501 Format: <red0>,<red1>,<red2>,...,<red15>
2502 Change the default red palette of the console.
2503 This is a 16-member array composed of values
2504 ranging from 0-255.
2505
2506 vt.default_utf8=
2507 [VT]
2508 Format: <0|1>
2509 Set system-wide default UTF-8 mode for all tty's.
2510 Default is 1, i.e. UTF-8 mode is enabled for all
2511 newly opened terminals.
2512
2496 waveartist= [HW,OSS] 2513 waveartist= [HW,OSS]
2497 Format: <io>,<irq>,<dma>,<dma2> 2514 Format: <io>,<irq>,<dma>,<dma2>
2498 2515
@@ -2505,6 +2522,10 @@ and is between 256 and 4096 characters. It is defined in the file
2505 wdt= [WDT] Watchdog 2522 wdt= [WDT] Watchdog
2506 See Documentation/watchdog/wdt.txt. 2523 See Documentation/watchdog/wdt.txt.
2507 2524
2525 x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
2526 default x2apic cluster mode on platforms
2527 supporting x2apic.
2528
2508 xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. 2529 xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks.
2509 xd_geo= See header of drivers/block/xd.c. 2530 xd_geo= See header of drivers/block/xd.c.
2510 2531
@@ -2512,9 +2533,6 @@ and is between 256 and 4096 characters. It is defined in the file
2512 Format: 2533 Format:
2513 <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]] 2534 <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
2514 2535
2515 norandmaps Don't use address space randomization. Equivalent to
2516 echo 0 > /proc/sys/kernel/randomize_va_space
2517
2518______________________________________________________________________ 2536______________________________________________________________________
2519 2537
2520TODO: 2538TODO:
diff --git a/Documentation/laptops/acer-wmi.txt b/Documentation/laptops/acer-wmi.txt
index 2b3a6b5260bf..5ee2a02b3b40 100644
--- a/Documentation/laptops/acer-wmi.txt
+++ b/Documentation/laptops/acer-wmi.txt
@@ -1,9 +1,9 @@
1Acer Laptop WMI Extras Driver 1Acer Laptop WMI Extras Driver
2http://code.google.com/p/aceracpi 2http://code.google.com/p/aceracpi
3Version 0.2 3Version 0.3
418th August 2008 44th April 2009
5 5
6Copyright 2007-2008 Carlos Corbacho <carlos@strangeworlds.co.uk> 6Copyright 2007-2009 Carlos Corbacho <carlos@strangeworlds.co.uk>
7 7
8acer-wmi is a driver to allow you to control various parts of your Acer laptop 8acer-wmi is a driver to allow you to control various parts of your Acer laptop
9hardware under Linux which are exposed via ACPI-WMI. 9hardware under Linux which are exposed via ACPI-WMI.
@@ -36,6 +36,10 @@ not possible in kernel space from a 64 bit OS.
36Supported Hardware 36Supported Hardware
37****************** 37******************
38 38
39NOTE: The Acer Aspire One is not supported hardware. It cannot work with
40acer-wmi until Acer fixes the ACPI-WMI implementation on it, so it has been
41blacklisted until that happens.
42
39Please see the website for the current list of known working hardware: 43Please see the website for the current list of known working hardware:
40 44
41http://code.google.com/p/aceracpi/wiki/SupportedHardware 45http://code.google.com/p/aceracpi/wiki/SupportedHardware
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt
index 41bc99fa1884..3d7650768bb5 100644
--- a/Documentation/laptops/thinkpad-acpi.txt
+++ b/Documentation/laptops/thinkpad-acpi.txt
@@ -20,7 +20,8 @@ moved to the drivers/misc tree and renamed to thinkpad-acpi for kernel
20kernel 2.6.29 and release 0.22. 20kernel 2.6.29 and release 0.22.
21 21
22The driver is named "thinkpad-acpi". In some places, like module 22The driver is named "thinkpad-acpi". In some places, like module
23names, "thinkpad_acpi" is used because of userspace issues. 23names and log messages, "thinkpad_acpi" is used because of userspace
24issues.
24 25
25"tpacpi" is used as a shorthand where "thinkpad-acpi" would be too 26"tpacpi" is used as a shorthand where "thinkpad-acpi" would be too
26long due to length limitations on some Linux kernel versions. 27long due to length limitations on some Linux kernel versions.
@@ -37,7 +38,7 @@ detailed description):
37 - ThinkLight on and off 38 - ThinkLight on and off
38 - limited docking and undocking 39 - limited docking and undocking
39 - UltraBay eject 40 - UltraBay eject
40 - CMOS control 41 - CMOS/UCMS control
41 - LED control 42 - LED control
42 - ACPI sounds 43 - ACPI sounds
43 - temperature sensors 44 - temperature sensors
@@ -46,6 +47,7 @@ detailed description):
46 - Volume control 47 - Volume control
47 - Fan control and monitoring: fan speed, fan enable/disable 48 - Fan control and monitoring: fan speed, fan enable/disable
48 - WAN enable and disable 49 - WAN enable and disable
50 - UWB enable and disable
49 51
50A compatibility table by model and feature is maintained on the web 52A compatibility table by model and feature is maintained on the web
51site, http://ibm-acpi.sf.net/. I appreciate any success or failure 53site, http://ibm-acpi.sf.net/. I appreciate any success or failure
@@ -53,7 +55,7 @@ reports, especially if they add to or correct the compatibility table.
53Please include the following information in your report: 55Please include the following information in your report:
54 56
55 - ThinkPad model name 57 - ThinkPad model name
56 - a copy of your DSDT, from /proc/acpi/dsdt 58 - a copy of your ACPI tables, using the "acpidump" utility
57 - a copy of the output of dmidecode, with serial numbers 59 - a copy of the output of dmidecode, with serial numbers
58 and UUIDs masked off 60 and UUIDs masked off
59 - which driver features work and which don't 61 - which driver features work and which don't
@@ -66,17 +68,18 @@ Installation
66------------ 68------------
67 69
68If you are compiling this driver as included in the Linux kernel 70If you are compiling this driver as included in the Linux kernel
69sources, simply enable the CONFIG_THINKPAD_ACPI option, and optionally 71sources, look for the CONFIG_THINKPAD_ACPI Kconfig option.
70enable the CONFIG_THINKPAD_ACPI_BAY option if you want the 72It is located on the menu path: "Device Drivers" -> "X86 Platform
71thinkpad-specific bay functionality. 73Specific Device Drivers" -> "ThinkPad ACPI Laptop Extras".
74
72 75
73Features 76Features
74-------- 77--------
75 78
76The driver exports two different interfaces to userspace, which can be 79The driver exports two different interfaces to userspace, which can be
77used to access the features it provides. One is a legacy procfs-based 80used to access the features it provides. One is a legacy procfs-based
78interface, which will be removed at some time in the distant future. 81interface, which will be removed at some time in the future. The other
79The other is a new sysfs-based interface which is not complete yet. 82is a new sysfs-based interface which is not complete yet.
80 83
81The procfs interface creates the /proc/acpi/ibm directory. There is a 84The procfs interface creates the /proc/acpi/ibm directory. There is a
82file under that directory for each feature it supports. The procfs 85file under that directory for each feature it supports. The procfs
@@ -111,15 +114,17 @@ The version of thinkpad-acpi's sysfs interface is exported by the driver
111as a driver attribute (see below). 114as a driver attribute (see below).
112 115
113Sysfs driver attributes are on the driver's sysfs attribute space, 116Sysfs driver attributes are on the driver's sysfs attribute space,
114for 2.6.23 this is /sys/bus/platform/drivers/thinkpad_acpi/ and 117for 2.6.23+ this is /sys/bus/platform/drivers/thinkpad_acpi/ and
115/sys/bus/platform/drivers/thinkpad_hwmon/ 118/sys/bus/platform/drivers/thinkpad_hwmon/
116 119
117Sysfs device attributes are on the thinkpad_acpi device sysfs attribute 120Sysfs device attributes are on the thinkpad_acpi device sysfs attribute
118space, for 2.6.23 this is /sys/devices/platform/thinkpad_acpi/. 121space, for 2.6.23+ this is /sys/devices/platform/thinkpad_acpi/.
119 122
120Sysfs device attributes for the sensors and fan are on the 123Sysfs device attributes for the sensors and fan are on the
121thinkpad_hwmon device's sysfs attribute space, but you should locate it 124thinkpad_hwmon device's sysfs attribute space, but you should locate it
122looking for a hwmon device with the name attribute of "thinkpad". 125looking for a hwmon device with the name attribute of "thinkpad", or
126better yet, through libsensors.
127
123 128
124Driver version 129Driver version
125-------------- 130--------------
@@ -129,6 +134,7 @@ sysfs driver attribute: version
129 134
130The driver name and version. No commands can be written to this file. 135The driver name and version. No commands can be written to this file.
131 136
137
132Sysfs interface version 138Sysfs interface version
133----------------------- 139-----------------------
134 140
@@ -160,6 +166,7 @@ expect that an attribute might not be there, and deal with it properly
160(an attribute not being there *is* a valid way to make it clear that a 166(an attribute not being there *is* a valid way to make it clear that a
161feature is not available in sysfs). 167feature is not available in sysfs).
162 168
169
163Hot keys 170Hot keys
164-------- 171--------
165 172
@@ -172,17 +179,14 @@ system. Enabling the hotkey functionality of thinkpad-acpi signals the
172firmware that such a driver is present, and modifies how the ThinkPad 179firmware that such a driver is present, and modifies how the ThinkPad
173firmware will behave in many situations. 180firmware will behave in many situations.
174 181
175The driver enables the hot key feature automatically when loaded. The 182The driver enables the HKEY ("hot key") event reporting automatically
176feature can later be disabled and enabled back at runtime. The driver 183when loaded, and disables it when it is removed.
177will also restore the hot key feature to its previous state and mask
178when it is unloaded.
179 184
180When the hotkey feature is enabled and the hot key mask is set (see 185The driver will report HKEY events in the following format:
181below), the driver will report HKEY events in the following format:
182 186
183 ibm/hotkey HKEY 00000080 0000xxxx 187 ibm/hotkey HKEY 00000080 0000xxxx
184 188
185Some of these events refer to hot key presses, but not all. 189Some of these events refer to hot key presses, but not all of them.
186 190
187The driver will generate events over the input layer for hot keys and 191The driver will generate events over the input layer for hot keys and
188radio switches, and over the ACPI netlink layer for other events. The 192radio switches, and over the ACPI netlink layer for other events. The
@@ -214,13 +218,17 @@ procfs notes:
214 218
215The following commands can be written to the /proc/acpi/ibm/hotkey file: 219The following commands can be written to the /proc/acpi/ibm/hotkey file:
216 220
217 echo enable > /proc/acpi/ibm/hotkey -- enable the hot keys feature
218 echo disable > /proc/acpi/ibm/hotkey -- disable the hot keys feature
219 echo 0xffffffff > /proc/acpi/ibm/hotkey -- enable all hot keys 221 echo 0xffffffff > /proc/acpi/ibm/hotkey -- enable all hot keys
220 echo 0 > /proc/acpi/ibm/hotkey -- disable all possible hot keys 222 echo 0 > /proc/acpi/ibm/hotkey -- disable all possible hot keys
221 ... any other 8-hex-digit mask ... 223 ... any other 8-hex-digit mask ...
222 echo reset > /proc/acpi/ibm/hotkey -- restore the original mask 224 echo reset > /proc/acpi/ibm/hotkey -- restore the original mask
223 225
226The following commands have been deprecated and will cause the kernel
227to log a warning:
228
229 echo enable > /proc/acpi/ibm/hotkey -- does nothing
230 echo disable > /proc/acpi/ibm/hotkey -- returns an error
231
224The procfs interface does not support NVRAM polling control. So as to 232The procfs interface does not support NVRAM polling control. So as to
225maintain maximum bug-to-bug compatibility, it does not report any masks, 233maintain maximum bug-to-bug compatibility, it does not report any masks,
226nor does it allow one to manipulate the hot key mask when the firmware 234nor does it allow one to manipulate the hot key mask when the firmware
@@ -229,12 +237,9 @@ does not support masks at all, even if NVRAM polling is in use.
229sysfs notes: 237sysfs notes:
230 238
231 hotkey_bios_enabled: 239 hotkey_bios_enabled:
232 Returns the status of the hot keys feature when 240 DEPRECATED, WILL BE REMOVED SOON.
233 thinkpad-acpi was loaded. Upon module unload, the hot
234 key feature status will be restored to this value.
235 241
236 0: hot keys were disabled 242 Returns 0.
237 1: hot keys were enabled (unusual)
238 243
239 hotkey_bios_mask: 244 hotkey_bios_mask:
240 Returns the hot keys mask when thinkpad-acpi was loaded. 245 Returns the hot keys mask when thinkpad-acpi was loaded.
@@ -242,13 +247,10 @@ sysfs notes:
242 to this value. 247 to this value.
243 248
244 hotkey_enable: 249 hotkey_enable:
245 Enables/disables the hot keys feature in the ACPI 250 DEPRECATED, WILL BE REMOVED SOON.
246 firmware, and reports current status of the hot keys
247 feature. Has no effect on the NVRAM hot key polling
248 functionality.
249 251
250 0: disables the hot keys feature / feature disabled 252 0: returns -EPERM
251 1: enables the hot keys feature / feature enabled 253 1: does nothing
252 254
253 hotkey_mask: 255 hotkey_mask:
254 bit mask to enable driver-handling (and depending on 256 bit mask to enable driver-handling (and depending on
@@ -618,6 +620,7 @@ For Lenovo models *with* ACPI backlight control:
618 and map them to KEY_BRIGHTNESS_UP and KEY_BRIGHTNESS_DOWN. Process 620 and map them to KEY_BRIGHTNESS_UP and KEY_BRIGHTNESS_DOWN. Process
619 these keys on userspace somehow (e.g. by calling xbacklight). 621 these keys on userspace somehow (e.g. by calling xbacklight).
620 622
623
621Bluetooth 624Bluetooth
622--------- 625---------
623 626
@@ -628,6 +631,9 @@ sysfs rfkill class: switch "tpacpi_bluetooth_sw"
628This feature shows the presence and current state of a ThinkPad 631This feature shows the presence and current state of a ThinkPad
629Bluetooth device in the internal ThinkPad CDC slot. 632Bluetooth device in the internal ThinkPad CDC slot.
630 633
634If the ThinkPad supports it, the Bluetooth state is stored in NVRAM,
635so it is kept across reboots and power-off.
636
631Procfs notes: 637Procfs notes:
632 638
633If Bluetooth is installed, the following commands can be used: 639If Bluetooth is installed, the following commands can be used:
@@ -652,6 +658,7 @@ Sysfs notes:
652 rfkill controller switch "tpacpi_bluetooth_sw": refer to 658 rfkill controller switch "tpacpi_bluetooth_sw": refer to
653 Documentation/rfkill.txt for details. 659 Documentation/rfkill.txt for details.
654 660
661
655Video output control -- /proc/acpi/ibm/video 662Video output control -- /proc/acpi/ibm/video
656-------------------------------------------- 663--------------------------------------------
657 664
@@ -693,11 +700,8 @@ Fn-F7 from working. This also disables the video output switching
693features of this driver, as it uses the same ACPI methods as 700features of this driver, as it uses the same ACPI methods as
694Fn-F7. Video switching on the console should still work. 701Fn-F7. Video switching on the console should still work.
695 702
696UPDATE: There's now a patch for the X.org Radeon driver which 703UPDATE: refer to https://bugs.freedesktop.org/show_bug.cgi?id=2000
697addresses this issue. Some people are reporting success with the patch
698while others are still having problems. For more information:
699 704
700https://bugs.freedesktop.org/show_bug.cgi?id=2000
701 705
702ThinkLight control 706ThinkLight control
703------------------ 707------------------
@@ -720,10 +724,11 @@ The ThinkLight sysfs interface is documented by the LED class
720documentation, in Documentation/leds-class.txt. The ThinkLight LED name 724documentation, in Documentation/leds-class.txt. The ThinkLight LED name
721is "tpacpi::thinklight". 725is "tpacpi::thinklight".
722 726
723Due to limitations in the sysfs LED class, if the status of the thinklight 727Due to limitations in the sysfs LED class, if the status of the ThinkLight
724cannot be read or if it is unknown, thinkpad-acpi will report it as "off". 728cannot be read or if it is unknown, thinkpad-acpi will report it as "off".
725It is impossible to know if the status returned through sysfs is valid. 729It is impossible to know if the status returned through sysfs is valid.
726 730
731
727Docking / undocking -- /proc/acpi/ibm/dock 732Docking / undocking -- /proc/acpi/ibm/dock
728------------------------------------------ 733------------------------------------------
729 734
@@ -784,6 +789,7 @@ the only docking stations currently supported are the X-series
784UltraBase docks and "dumb" port replicators like the Mini Dock (the 789UltraBase docks and "dumb" port replicators like the Mini Dock (the
785latter don't need any ACPI support, actually). 790latter don't need any ACPI support, actually).
786 791
792
787UltraBay eject -- /proc/acpi/ibm/bay 793UltraBay eject -- /proc/acpi/ibm/bay
788------------------------------------ 794------------------------------------
789 795
@@ -847,8 +853,9 @@ supported. Use "eject2" instead of "eject" for the second bay.
847Note: the UltraBay eject support on the 600e/x, A22p and A3x is 853Note: the UltraBay eject support on the 600e/x, A22p and A3x is
848EXPERIMENTAL and may not work as expected. USE WITH CAUTION! 854EXPERIMENTAL and may not work as expected. USE WITH CAUTION!
849 855
850CMOS control 856
851------------ 857CMOS/UCMS control
858-----------------
852 859
853procfs: /proc/acpi/ibm/cmos 860procfs: /proc/acpi/ibm/cmos
854sysfs device attribute: cmos_command 861sysfs device attribute: cmos_command
@@ -882,6 +889,7 @@ The cmos command interface is prone to firmware split-brain problems, as
882in newer ThinkPads it is just a compatibility layer. Do not use it, it is 889in newer ThinkPads it is just a compatibility layer. Do not use it, it is
883exported just as a debug tool. 890exported just as a debug tool.
884 891
892
885LED control 893LED control
886----------- 894-----------
887 895
@@ -893,6 +901,17 @@ some older ThinkPad models, it is possible to query the status of the
893LED indicators as well. Newer ThinkPads cannot query the real status 901LED indicators as well. Newer ThinkPads cannot query the real status
894of the LED indicators. 902of the LED indicators.
895 903
904Because misuse of the LEDs could induce an unaware user to perform
905dangerous actions (like undocking or ejecting a bay device while the
906buses are still active), or mask an important alarm (such as a nearly
907empty battery, or a broken battery), access to most LEDs is
908restricted.
909
910Unrestricted access to all LEDs requires that thinkpad-acpi be
911compiled with the CONFIG_THINKPAD_ACPI_UNSAFE_LEDS option enabled.
912Distributions must never enable this option. Individual users that
913are aware of the consequences are welcome to enable it.
914
896procfs notes: 915procfs notes:
897 916
898The available commands are: 917The available commands are:
@@ -939,6 +958,7 @@ ThinkPad indicator LED should blink in hardware accelerated mode, use the
939"timer" trigger, and leave the delay_on and delay_off parameters set to 958"timer" trigger, and leave the delay_on and delay_off parameters set to
940zero (to request hardware acceleration autodetection). 959zero (to request hardware acceleration autodetection).
941 960
961
942ACPI sounds -- /proc/acpi/ibm/beep 962ACPI sounds -- /proc/acpi/ibm/beep
943---------------------------------- 963----------------------------------
944 964
@@ -968,6 +988,7 @@ X40:
968 16 - one medium-pitched beep repeating constantly, stop with 17 988 16 - one medium-pitched beep repeating constantly, stop with 17
969 17 - stop 16 989 17 - stop 16
970 990
991
971Temperature sensors 992Temperature sensors
972------------------- 993-------------------
973 994
@@ -1115,6 +1136,7 @@ registers contain the current battery capacity, etc. If you experiment
1115with this, do send me your results (including some complete dumps with 1136with this, do send me your results (including some complete dumps with
1116a description of the conditions when they were taken.) 1137a description of the conditions when they were taken.)
1117 1138
1139
1118LCD brightness control 1140LCD brightness control
1119---------------------- 1141----------------------
1120 1142
@@ -1124,10 +1146,9 @@ sysfs backlight device "thinkpad_screen"
1124This feature allows software control of the LCD brightness on ThinkPad 1146This feature allows software control of the LCD brightness on ThinkPad
1125models which don't have a hardware brightness slider. 1147models which don't have a hardware brightness slider.
1126 1148
1127It has some limitations: the LCD backlight cannot be actually turned on or 1149It has some limitations: the LCD backlight cannot be actually turned
1128off by this interface, and in many ThinkPad models, the "dim while on 1150on or off by this interface, it just controls the backlight brightness
1129battery" functionality will be enabled by the BIOS when this interface is 1151level.
1130used, and cannot be controlled.
1131 1152
1132On IBM (and some of the earlier Lenovo) ThinkPads, the backlight control 1153On IBM (and some of the earlier Lenovo) ThinkPads, the backlight control
1133has eight brightness levels, ranging from 0 to 7. Some of the levels 1154has eight brightness levels, ranging from 0 to 7. Some of the levels
@@ -1136,10 +1157,15 @@ display backlight brightness control methods have 16 levels, ranging
1136from 0 to 15. 1157from 0 to 15.
1137 1158
1138There are two interfaces to the firmware for direct brightness control, 1159There are two interfaces to the firmware for direct brightness control,
1139EC and CMOS. To select which one should be used, use the 1160EC and UCMS (or CMOS). To select which one should be used, use the
1140brightness_mode module parameter: brightness_mode=1 selects EC mode, 1161brightness_mode module parameter: brightness_mode=1 selects EC mode,
1141brightness_mode=2 selects CMOS mode, brightness_mode=3 selects both EC 1162brightness_mode=2 selects UCMS mode, brightness_mode=3 selects EC
1142and CMOS. The driver tries to auto-detect which interface to use. 1163mode with NVRAM backing (so that brightness changes are remembered
1164across shutdown/reboot).
1165
1166The driver tries to select which interface to use from a table of
1167defaults for each ThinkPad model. If it makes a wrong choice, please
1168report this as a bug, so that we can fix it.
1143 1169
1144When display backlight brightness controls are available through the 1170When display backlight brightness controls are available through the
1145standard ACPI interface, it is best to use it instead of this direct 1171standard ACPI interface, it is best to use it instead of this direct
@@ -1201,6 +1227,7 @@ WARNING:
1201 and maybe reduce the life of the backlight lamps by needlessly kicking 1227 and maybe reduce the life of the backlight lamps by needlessly kicking
1202 its level up and down at every change. 1228 its level up and down at every change.
1203 1229
1230
1204Volume control -- /proc/acpi/ibm/volume 1231Volume control -- /proc/acpi/ibm/volume
1205--------------------------------------- 1232---------------------------------------
1206 1233
@@ -1217,6 +1244,11 @@ distinct. To unmute the volume after the mute command, use either the
1217up or down command (the level command will not unmute the volume). 1244up or down command (the level command will not unmute the volume).
1218The current volume level and mute state is shown in the file. 1245The current volume level and mute state is shown in the file.
1219 1246
1247The ALSA mixer interface to this feature is still missing, but patches
1248to add it exist. That problem should be addressed in the not so
1249distant future.
1250
1251
1220Fan control and monitoring: fan speed, fan enable/disable 1252Fan control and monitoring: fan speed, fan enable/disable
1221--------------------------------------------------------- 1253---------------------------------------------------------
1222 1254
@@ -1383,8 +1415,11 @@ procfs: /proc/acpi/ibm/wan
1383sysfs device attribute: wwan_enable (deprecated) 1415sysfs device attribute: wwan_enable (deprecated)
1384sysfs rfkill class: switch "tpacpi_wwan_sw" 1416sysfs rfkill class: switch "tpacpi_wwan_sw"
1385 1417
1386This feature shows the presence and current state of a W-WAN (Sierra 1418This feature shows the presence and current state of the built-in
1387Wireless EV-DO) device. 1419Wireless WAN device.
1420
1421If the ThinkPad supports it, the WWAN state is stored in NVRAM,
1422so it is kept across reboots and power-off.
1388 1423
1389It was tested on a Lenovo ThinkPad X60. It should probably work on other 1424It was tested on a Lenovo ThinkPad X60. It should probably work on other
1390ThinkPad models which come with this module installed. 1425ThinkPad models which come with this module installed.
@@ -1413,6 +1448,7 @@ Sysfs notes:
1413 rfkill controller switch "tpacpi_wwan_sw": refer to 1448 rfkill controller switch "tpacpi_wwan_sw": refer to
1414 Documentation/rfkill.txt for details. 1449 Documentation/rfkill.txt for details.
1415 1450
1451
1416EXPERIMENTAL: UWB 1452EXPERIMENTAL: UWB
1417----------------- 1453-----------------
1418 1454
@@ -1431,6 +1467,7 @@ Sysfs notes:
1431 rfkill controller switch "tpacpi_uwb_sw": refer to 1467 rfkill controller switch "tpacpi_uwb_sw": refer to
1432 Documentation/rfkill.txt for details. 1468 Documentation/rfkill.txt for details.
1433 1469
1470
1434Multiple Commands, Module Parameters 1471Multiple Commands, Module Parameters
1435------------------------------------ 1472------------------------------------
1436 1473
@@ -1445,6 +1482,7 @@ for example:
1445 1482
1446 modprobe thinkpad_acpi hotkey=enable,0xffff video=auto_disable 1483 modprobe thinkpad_acpi hotkey=enable,0xffff video=auto_disable
1447 1484
1485
1448Enabling debugging output 1486Enabling debugging output
1449------------------------- 1487-------------------------
1450 1488
@@ -1457,8 +1495,15 @@ will enable all debugging output classes. It takes a bitmask, so
1457to enable more than one output class, just add their values. 1495to enable more than one output class, just add their values.
1458 1496
1459 Debug bitmask Description 1497 Debug bitmask Description
1498 0x8000 Disclose PID of userspace programs
1499 accessing some functions of the driver
1460 0x0001 Initialization and probing 1500 0x0001 Initialization and probing
1461 0x0002 Removal 1501 0x0002 Removal
1502 0x0004 RF Transmitter control (RFKILL)
1503 (bluetooth, WWAN, UWB...)
1504 0x0008 HKEY event interface, hotkeys
1505 0x0010 Fan control
1506 0x0020 Backlight brightness
1462 1507
1463There is also a kernel build option to enable more debugging 1508There is also a kernel build option to enable more debugging
1464information, which may be necessary to debug driver problems. 1509information, which may be necessary to debug driver problems.
@@ -1467,6 +1512,7 @@ The level of debugging information output by the driver can be changed
1467at runtime through sysfs, using the driver attribute debug_level. The 1512at runtime through sysfs, using the driver attribute debug_level. The
1468attribute takes the same bitmask as the debug module parameter above. 1513attribute takes the same bitmask as the debug module parameter above.
1469 1514
1515
1470Force loading of module 1516Force loading of module
1471----------------------- 1517-----------------------
1472 1518
@@ -1505,3 +1551,7 @@ Sysfs interface changelog:
1505 1551
15060x020200: Add poll()/select() support to the following attributes: 15520x020200: Add poll()/select() support to the following attributes:
1507 hotkey_radio_sw, wakeup_hotunplug_complete, wakeup_reason 1553 hotkey_radio_sw, wakeup_hotunplug_complete, wakeup_reason
1554
15550x020300: hotkey enable/disable support removed, attributes
1556 hotkey_bios_enabled and hotkey_enable deprecated and
1557 marked for removal.
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index f2dbbf3bdeab..d36fcc0f2715 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1630,6 +1630,13 @@ static bool service_io(struct device *dev)
1630 } 1630 }
1631 } 1631 }
1632 1632
1633 /* OK, so we noted that it was pretty poor to use an fdatasync as a
1634 * barrier. But Christoph Hellwig points out that we need a sync
1635 * *afterwards* as well: "Barriers specify no reordering to the front
1636 * or the back." And Jens Axboe confirmed it, so here we are: */
1637 if (out->type & VIRTIO_BLK_T_BARRIER)
1638 fdatasync(vblk->fd);
1639
1633 /* We can't trigger an IRQ, because we're not the Launcher. It does 1640 /* We can't trigger an IRQ, because we're not the Launcher. It does
1634 * that when we tell it we're done. */ 1641 * that when we tell it we're done. */
1635 add_used(dev->vq, head, wlen); 1642 add_used(dev->vq, head, wlen);
diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt
index 488773018152..938ea22f2cc0 100644
--- a/Documentation/lockdep-design.txt
+++ b/Documentation/lockdep-design.txt
@@ -27,33 +27,37 @@ lock-class.
27State 27State
28----- 28-----
29 29
30The validator tracks lock-class usage history into 5 separate state bits: 30The validator tracks lock-class usage history into 4n + 1 separate state bits:
31 31
32- 'ever held in hardirq context' [ == hardirq-safe ] 32- 'ever held in STATE context'
33- 'ever held in softirq context' [ == softirq-safe ] 33- 'ever held as readlock in STATE context'
34- 'ever held with hardirqs enabled' [ == hardirq-unsafe ] 34- 'ever held with STATE enabled'
35- 'ever held with softirqs and hardirqs enabled' [ == softirq-unsafe ] 35- 'ever held as readlock with STATE enabled'
36
37Where STATE can be either one of (kernel/lockdep_states.h)
38 - hardirq
39 - softirq
40 - reclaim_fs
36 41
37- 'ever used' [ == !unused ] 42- 'ever used' [ == !unused ]
38 43
39When locking rules are violated, these 4 state bits are presented in the 44When locking rules are violated, these state bits are presented in the
40locking error messages, inside curlies. A contrived example: 45locking error messages, inside curlies. A contrived example:
41 46
42 modprobe/2287 is trying to acquire lock: 47 modprobe/2287 is trying to acquire lock:
43 (&sio_locks[i].lock){--..}, at: [<c02867fd>] mutex_lock+0x21/0x24 48 (&sio_locks[i].lock){-.-...}, at: [<c02867fd>] mutex_lock+0x21/0x24
44 49
45 but task is already holding lock: 50 but task is already holding lock:
46 (&sio_locks[i].lock){--..}, at: [<c02867fd>] mutex_lock+0x21/0x24 51 (&sio_locks[i].lock){-.-...}, at: [<c02867fd>] mutex_lock+0x21/0x24
47 52
48 53
49The bit position indicates hardirq, softirq, hardirq-read, 54The bit position indicates STATE, STATE-read, for each of the states listed
50softirq-read respectively, and the character displayed in each 55above, and the character displayed in each indicates:
51indicates:
52 56
53 '.' acquired while irqs disabled 57 '.' acquired while irqs disabled
54 '+' acquired in irq context 58 '+' acquired in irq context
55 '-' acquired with irqs enabled 59 '-' acquired with irqs enabled
56 '?' read acquired in irq context with irqs enabled. 60 '?' acquired in irq context with irqs enabled.
57 61
58Unused mutexes cannot be part of the cause of an error. 62Unused mutexes cannot be part of the cause of an error.
59 63
diff --git a/Documentation/md.txt b/Documentation/md.txt
index 1da9d1b1793f..4edd39ec7db9 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -164,15 +164,19 @@ All md devices contain:
164 raid_disks 164 raid_disks
165 a text file with a simple number indicating the number of devices 165 a text file with a simple number indicating the number of devices
166 in a fully functional array. If this is not yet known, the file 166 in a fully functional array. If this is not yet known, the file
167 will be empty. If an array is being resized (not currently 167 will be empty. If an array is being resized this will contain
168 possible) this will contain the larger of the old and new sizes. 168 the new number of devices.
169 Some raid level (RAID1) allow this value to be set while the 169 Some raid levels allow this value to be set while the array is
170 array is active. This will reconfigure the array. Otherwise 170 active. This will reconfigure the array. Otherwise it can only
171 it can only be set while assembling an array. 171 be set while assembling an array.
172 A change to this attribute will not be permitted if it would
173 reduce the size of the array. To reduce the number of drives
174 in an e.g. raid5, the array size must first be reduced by
175 setting the 'array_size' attribute.
172 176
173 chunk_size 177 chunk_size
174 This is the size if bytes for 'chunks' and is only relevant to 178 This is the size in bytes for 'chunks' and is only relevant to
175 raid levels that involve striping (1,4,5,6,10). The address space 179 raid levels that involve striping (0,4,5,6,10). The address space
176 of the array is conceptually divided into chunks and consecutive 180 of the array is conceptually divided into chunks and consecutive
177 chunks are striped onto neighbouring devices. 181 chunks are striped onto neighbouring devices.
178 The size should be at least PAGE_SIZE (4k) and should be a power 182 The size should be at least PAGE_SIZE (4k) and should be a power
@@ -183,6 +187,20 @@ All md devices contain:
183 simply a number that is interpreted differently by different 187 simply a number that is interpreted differently by different
184 levels. It can be written while assembling an array. 188 levels. It can be written while assembling an array.
185 189
190 array_size
191 This can be used to artificially constrain the available space in
192 the array to be less than is actually available on the combined
193 devices. Writing a number (in Kilobytes) which is less than
194 the available size will set the size. Any reconfiguration of the
195 array (e.g. adding devices) will not cause the size to change.
196 Writing the word 'default' will cause the effective size of the
197 array to be whatever size is actually available based on
198 'level', 'chunk_size' and 'component_size'.
199
200 This can be used to reduce the size of the array before reducing
201 the number of devices in a raid4/5/6, or to support external
202 metadata formats which mandate such clipping.
203
186 reshape_position 204 reshape_position
187 This is either "none" or a sector number within the devices of 205 This is either "none" or a sector number within the devices of
188 the array where "reshape" is up to. If this is set, the three 206 the array where "reshape" is up to. If this is set, the three
@@ -207,6 +225,11 @@ All md devices contain:
207 about the array. It can be 0.90 (traditional format), 1.0, 1.1, 225 about the array. It can be 0.90 (traditional format), 1.0, 1.1,
208 1.2 (newer format in varying locations) or "none" indicating that 226 1.2 (newer format in varying locations) or "none" indicating that
209 the kernel isn't managing metadata at all. 227 the kernel isn't managing metadata at all.
228 Alternately it can be "external:" followed by a string which
229 is set by user-space. This indicates that metadata is managed
230 by a user-space program. Any device failure or other event that
231 requires a metadata update will cause array activity to be
232 suspended until the event is acknowledged.
210 233
211 resync_start 234 resync_start
212 The point at which resync should start. If no resync is needed, 235 The point at which resync should start. If no resync is needed,
diff --git a/Documentation/misc-devices/isl29003 b/Documentation/misc-devices/isl29003
new file mode 100644
index 000000000000..c4ff5f38e010
--- /dev/null
+++ b/Documentation/misc-devices/isl29003
@@ -0,0 +1,62 @@
1Kernel driver isl29003
2=====================
3
4Supported chips:
5* Intersil ISL29003
6Prefix: 'isl29003'
7Addresses scanned: none
8Datasheet:
9http://www.intersil.com/data/fn/fn7464.pdf
10
11Author: Daniel Mack <daniel@caiaq.de>
12
13
14Description
15-----------
16The ISL29003 is an integrated light sensor with a 16-bit integrating type
17ADC, I2C user programmable lux range select for optimized counts/lux, and
18I2C multi-function control and monitoring capabilities. The internal ADC
19provides 16-bit resolution while rejecting 50Hz and 60Hz flicker caused by
20artificial light sources.
21
22The driver allows setting the lux range, the bit resolution, the operational
23mode (see below) and the power state of device and can read the current lux
24value, of course.
25
26
27Detection
28---------
29
30The ISL29003 does not have an ID register which could be used to identify
31it, so the detection routine will just try to read from the configured I2C
32address and consider the device to be present as soon as it ACKs the
33transfer.
34
35
36Sysfs entries
37-------------
38
39range:
40 0: 0 lux to 1000 lux (default)
41 1: 0 lux to 4000 lux
42 2: 0 lux to 16,000 lux
43 3: 0 lux to 64,000 lux
44
45resolution:
46 0: 2^16 cycles (default)
47 1: 2^12 cycles
48 2: 2^8 cycles
49 3: 2^4 cycles
50
51mode:
52 0: diode1's current (unsigned 16bit) (default)
53 1: diode2's current (unsigned 16bit)
54 2: difference between diodes (l1 - l2, signed 15bit)
55
56power_state:
57 0: device is disabled (default)
58 1: device is enabled
59
60lux (read only):
61 returns the value from the last sensor reading
62
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt
index 6c238f59b2a9..249db3a15d15 100644
--- a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt
@@ -1,6 +1,6 @@
1* Uploaded QE firmware 1* Uploaded QE firmware
2 2
3 If a new firwmare has been uploaded to the QE (usually by the 3 If a new firmware has been uploaded to the QE (usually by the
4 boot loader), then a 'firmware' child node should be added to the QE 4 boot loader), then a 'firmware' child node should be added to the QE
5 node. This node provides information on the uploaded firmware that 5 node. This node provides information on the uploaded firmware that
6 device drivers may need. 6 device drivers may need.
diff --git a/Documentation/powerpc/dts-bindings/fsl/dma.txt b/Documentation/powerpc/dts-bindings/fsl/dma.txt
index cc453110fc46..0732cdd05ba1 100644
--- a/Documentation/powerpc/dts-bindings/fsl/dma.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/dma.txt
@@ -35,30 +35,30 @@ Example:
35 #address-cells = <1>; 35 #address-cells = <1>;
36 #size-cells = <1>; 36 #size-cells = <1>;
37 compatible = "fsl,mpc8349-dma", "fsl,elo-dma"; 37 compatible = "fsl,mpc8349-dma", "fsl,elo-dma";
38 reg = <82a8 4>; 38 reg = <0x82a8 4>;
39 ranges = <0 8100 1a4>; 39 ranges = <0 0x8100 0x1a4>;
40 interrupt-parent = <&ipic>; 40 interrupt-parent = <&ipic>;
41 interrupts = <47 8>; 41 interrupts = <71 8>;
42 cell-index = <0>; 42 cell-index = <0>;
43 dma-channel@0 { 43 dma-channel@0 {
44 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; 44 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
45 cell-index = <0>; 45 cell-index = <0>;
46 reg = <0 80>; 46 reg = <0 0x80>;
47 }; 47 };
48 dma-channel@80 { 48 dma-channel@80 {
49 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; 49 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
50 cell-index = <1>; 50 cell-index = <1>;
51 reg = <80 80>; 51 reg = <0x80 0x80>;
52 }; 52 };
53 dma-channel@100 { 53 dma-channel@100 {
54 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; 54 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
55 cell-index = <2>; 55 cell-index = <2>;
56 reg = <100 80>; 56 reg = <0x100 0x80>;
57 }; 57 };
58 dma-channel@180 { 58 dma-channel@180 {
59 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel"; 59 compatible = "fsl,mpc8349-dma-channel", "fsl,elo-dma-channel";
60 cell-index = <3>; 60 cell-index = <3>;
61 reg = <180 80>; 61 reg = <0x180 0x80>;
62 }; 62 };
63 }; 63 };
64 64
@@ -93,36 +93,36 @@ Example:
93 #address-cells = <1>; 93 #address-cells = <1>;
94 #size-cells = <1>; 94 #size-cells = <1>;
95 compatible = "fsl,mpc8540-dma", "fsl,eloplus-dma"; 95 compatible = "fsl,mpc8540-dma", "fsl,eloplus-dma";
96 reg = <21300 4>; 96 reg = <0x21300 4>;
97 ranges = <0 21100 200>; 97 ranges = <0 0x21100 0x200>;
98 cell-index = <0>; 98 cell-index = <0>;
99 dma-channel@0 { 99 dma-channel@0 {
100 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel"; 100 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel";
101 reg = <0 80>; 101 reg = <0 0x80>;
102 cell-index = <0>; 102 cell-index = <0>;
103 interrupt-parent = <&mpic>; 103 interrupt-parent = <&mpic>;
104 interrupts = <14 2>; 104 interrupts = <20 2>;
105 }; 105 };
106 dma-channel@80 { 106 dma-channel@80 {
107 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel"; 107 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel";
108 reg = <80 80>; 108 reg = <0x80 0x80>;
109 cell-index = <1>; 109 cell-index = <1>;
110 interrupt-parent = <&mpic>; 110 interrupt-parent = <&mpic>;
111 interrupts = <15 2>; 111 interrupts = <21 2>;
112 }; 112 };
113 dma-channel@100 { 113 dma-channel@100 {
114 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel"; 114 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel";
115 reg = <100 80>; 115 reg = <0x100 0x80>;
116 cell-index = <2>; 116 cell-index = <2>;
117 interrupt-parent = <&mpic>; 117 interrupt-parent = <&mpic>;
118 interrupts = <16 2>; 118 interrupts = <22 2>;
119 }; 119 };
120 dma-channel@180 { 120 dma-channel@180 {
121 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel"; 121 compatible = "fsl,mpc8540-dma-channel", "fsl,eloplus-dma-channel";
122 reg = <180 80>; 122 reg = <0x180 0x80>;
123 cell-index = <3>; 123 cell-index = <3>;
124 interrupt-parent = <&mpic>; 124 interrupt-parent = <&mpic>;
125 interrupts = <17 2>; 125 interrupts = <23 2>;
126 }; 126 };
127 }; 127 };
128 128
diff --git a/Documentation/powerpc/dts-bindings/fsl/esdhc.txt b/Documentation/powerpc/dts-bindings/fsl/esdhc.txt
new file mode 100644
index 000000000000..600846557763
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/esdhc.txt
@@ -0,0 +1,24 @@
1* Freescale Enhanced Secure Digital Host Controller (eSDHC)
2
3The Enhanced Secure Digital Host Controller provides an interface
4for MMC, SD, and SDIO types of memory cards.
5
6Required properties:
7 - compatible : should be
8 "fsl,<chip>-esdhc", "fsl,mpc8379-esdhc" for MPC83xx processors.
9 "fsl,<chip>-esdhc", "fsl,mpc8536-esdhc" for MPC85xx processors.
10 - reg : should contain eSDHC registers location and length.
11 - interrupts : should contain eSDHC interrupt.
12 - interrupt-parent : interrupt source phandle.
13 - clock-frequency : specifies eSDHC base clock frequency.
14
15Example:
16
17sdhci@2e000 {
18 compatible = "fsl,mpc8378-esdhc", "fsl,mpc8379-esdhc";
19 reg = <0x2e000 0x1000>;
20 interrupts = <42 0x8>;
21 interrupt-parent = <&ipic>;
22 /* Filled in by U-Boot */
23 clock-frequency = <0>;
24};
diff --git a/Documentation/powerpc/dts-bindings/fsl/ssi.txt b/Documentation/powerpc/dts-bindings/fsl/ssi.txt
index a2d963998a65..5ff76c9c57d2 100644
--- a/Documentation/powerpc/dts-bindings/fsl/ssi.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/ssi.txt
@@ -4,44 +4,56 @@ The SSI is a serial device that communicates with audio codecs. It can
4be programmed in AC97, I2S, left-justified, or right-justified modes. 4be programmed in AC97, I2S, left-justified, or right-justified modes.
5 5
6Required properties: 6Required properties:
7- compatible : compatible list, containing "fsl,ssi" 7- compatible: Compatible list, contains "fsl,ssi".
8- cell-index : the SSI, <0> = SSI1, <1> = SSI2, and so on 8- cell-index: The SSI, <0> = SSI1, <1> = SSI2, and so on.
9- reg : offset and length of the register set for the device 9- reg: Offset and length of the register set for the device.
10- interrupts : <a b> where a is the interrupt number and b is a 10- interrupts: <a b> where a is the interrupt number and b is a
11 field that represents an encoding of the sense and 11 field that represents an encoding of the sense and
12 level information for the interrupt. This should be 12 level information for the interrupt. This should be
13 encoded based on the information in section 2) 13 encoded based on the information in section 2)
14 depending on the type of interrupt controller you 14 depending on the type of interrupt controller you
15 have. 15 have.
16- interrupt-parent : the phandle for the interrupt controller that 16- interrupt-parent: The phandle for the interrupt controller that
17 services interrupts for this device. 17 services interrupts for this device.
18- fsl,mode : the operating mode for the SSI interface 18- fsl,mode: The operating mode for the SSI interface.
19 "i2s-slave" - I2S mode, SSI is clock slave 19 "i2s-slave" - I2S mode, SSI is clock slave
20 "i2s-master" - I2S mode, SSI is clock master 20 "i2s-master" - I2S mode, SSI is clock master
21 "lj-slave" - left-justified mode, SSI is clock slave 21 "lj-slave" - left-justified mode, SSI is clock slave
22 "lj-master" - l.j. mode, SSI is clock master 22 "lj-master" - l.j. mode, SSI is clock master
23 "rj-slave" - right-justified mode, SSI is clock slave 23 "rj-slave" - right-justified mode, SSI is clock slave
24 "rj-master" - r.j., SSI is clock master 24 "rj-master" - r.j., SSI is clock master
25 "ac97-slave" - AC97 mode, SSI is clock slave 25 "ac97-slave" - AC97 mode, SSI is clock slave
26 "ac97-master" - AC97 mode, SSI is clock master 26 "ac97-master" - AC97 mode, SSI is clock master
27- fsl,playback-dma: phandle to a node for the DMA channel to use for 27- fsl,playback-dma: Phandle to a node for the DMA channel to use for
28 playback of audio. This is typically dictated by SOC 28 playback of audio. This is typically dictated by SOC
29 design. See the notes below. 29 design. See the notes below.
30- fsl,capture-dma: phandle to a node for the DMA channel to use for 30- fsl,capture-dma: Phandle to a node for the DMA channel to use for
31 capture (recording) of audio. This is typically dictated 31 capture (recording) of audio. This is typically dictated
32 by SOC design. See the notes below. 32 by SOC design. See the notes below.
33- fsl,fifo-depth: The number of elements in the transmit and receive FIFOs.
34 This number is the maximum allowed value for SFCSR[TFWM0].
35- fsl,ssi-asynchronous:
36 If specified, the SSI is to be programmed in asynchronous
37 mode. In this mode, pins SRCK, STCK, SRFS, and STFS must
38 all be connected to valid signals. In synchronous mode,
39 SRCK and SRFS are ignored. Asynchronous mode allows
40 playback and capture to use different sample sizes and
41 sample rates. Some drivers may require that SRCK and STCK
42 be connected together, and SRFS and STFS be connected
43 together. This would still allow different sample sizes,
44 but not different sample rates.
33 45
34Optional properties: 46Optional properties:
35- codec-handle : phandle to a 'codec' node that defines an audio 47- codec-handle: Phandle to a 'codec' node that defines an audio
36 codec connected to this SSI. This node is typically 48 codec connected to this SSI. This node is typically
37 a child of an I2C or other control node. 49 a child of an I2C or other control node.
38 50
39Child 'codec' node required properties: 51Child 'codec' node required properties:
40- compatible : compatible list, contains the name of the codec 52- compatible: Compatible list, contains the name of the codec
41 53
42Child 'codec' node optional properties: 54Child 'codec' node optional properties:
43- clock-frequency : The frequency of the input clock, which typically 55- clock-frequency: The frequency of the input clock, which typically comes
44 comes from an on-board dedicated oscillator. 56 from an on-board dedicated oscillator.
45 57
46Notes on fsl,playback-dma and fsl,capture-dma: 58Notes on fsl,playback-dma and fsl,capture-dma:
47 59
diff --git a/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt b/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt
index 84a04d5eb8e6..a48b2cadc7f0 100644
--- a/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt
@@ -5,9 +5,21 @@ Required properties:
5- reg : should specify localbus chip select and size used for the chip. 5- reg : should specify localbus chip select and size used for the chip.
6- fsl,upm-addr-offset : UPM pattern offset for the address latch. 6- fsl,upm-addr-offset : UPM pattern offset for the address latch.
7- fsl,upm-cmd-offset : UPM pattern offset for the command latch. 7- fsl,upm-cmd-offset : UPM pattern offset for the command latch.
8- gpios : may specify optional GPIO connected to the Ready-Not-Busy pin.
9 8
10Example: 9Optional properties:
10- fsl,upm-wait-flags : add chip-dependent short delays after running the
11 UPM pattern (0x1), after writing a data byte (0x2) or after
12 writing out a buffer (0x4).
13- fsl,upm-addr-line-cs-offsets : address offsets for multi-chip support.
14 The corresponding address lines are used to select the chip.
15- gpios : may specify optional GPIOs connected to the Ready-Not-Busy pins
16 (R/B#). For multi-chip devices, "n" GPIO definitions are required
17 according to the number of chips.
18- chip-delay : chip dependent delay for transferring data from array to
19 read registers (tR). Required if property "gpios" is not used
20 (R/B# pins not connected).
21
22Examples:
11 23
12upm@1,0 { 24upm@1,0 {
13 compatible = "fsl,upm-nand"; 25 compatible = "fsl,upm-nand";
@@ -26,3 +38,26 @@ upm@1,0 {
26 }; 38 };
27 }; 39 };
28}; 40};
41
42upm@3,0 {
43 #address-cells = <0>;
44 #size-cells = <0>;
45 compatible = "tqc,tqm8548-upm-nand", "fsl,upm-nand";
46 reg = <3 0x0 0x800>;
47 fsl,upm-addr-offset = <0x10>;
48 fsl,upm-cmd-offset = <0x08>;
49 /* Multi-chip NAND device */
50 fsl,upm-addr-line-cs-offsets = <0x0 0x200>;
51 fsl,upm-wait-flags = <0x5>;
52 chip-delay = <25>; // in micro-seconds
53
54 nand@0 {
55 #address-cells = <1>;
56 #size-cells = <1>;
57
58 partition@0 {
59 label = "fs";
60 reg = <0x00000000 0x10000000>;
61 };
62 };
63};
diff --git a/Documentation/powerpc/dts-bindings/gpio/led.txt b/Documentation/powerpc/dts-bindings/gpio/led.txt
index ff51f4c0fa9d..4fe14deedc0a 100644
--- a/Documentation/powerpc/dts-bindings/gpio/led.txt
+++ b/Documentation/powerpc/dts-bindings/gpio/led.txt
@@ -1,15 +1,43 @@
1LED connected to GPIO 1LEDs connected to GPIO lines
2 2
3Required properties: 3Required properties:
4- compatible : should be "gpio-led". 4- compatible : should be "gpio-leds".
5- label : (optional) the label for this LED. If omitted, the label is 5
6Each LED is represented as a sub-node of the gpio-leds device. Each
7node's name represents the name of the corresponding LED.
8
9LED sub-node properties:
10- gpios : Should specify the LED's GPIO, see "Specifying GPIO information
11 for devices" in Documentation/powerpc/booting-without-of.txt. Active
12 low LEDs should be indicated using flags in the GPIO specifier.
13- label : (optional) The label for this LED. If omitted, the label is
6 taken from the node name (excluding the unit address). 14 taken from the node name (excluding the unit address).
7- gpios : should specify LED GPIO. 15- linux,default-trigger : (optional) This parameter, if present, is a
16 string defining the trigger assigned to the LED. Current triggers are:
17 "backlight" - LED will act as a back-light, controlled by the framebuffer
18 system
19 "default-on" - LED will turn on
20 "heartbeat" - LED "double" flashes at a load average based rate
21 "ide-disk" - LED indicates disk activity
22 "timer" - LED flashes at a fixed, configurable rate
8 23
9Example: 24Examples:
10 25
11led@0 { 26leds {
12 compatible = "gpio-led"; 27 compatible = "gpio-leds";
13 label = "hdd"; 28 hdd {
14 gpios = <&mcu_pio 0 1>; 29 label = "IDE Activity";
30 gpios = <&mcu_pio 0 1>; /* Active low */
31 linux,default-trigger = "ide-disk";
32 };
15}; 33};
34
35run-control {
36 compatible = "gpio-leds";
37 red {
38 gpios = <&mpc8572 6 0>;
39 };
40 green {
41 gpios = <&mpc8572 7 0>;
42 };
43};
diff --git a/Documentation/powerpc/dts-bindings/mmc-spi-slot.txt b/Documentation/powerpc/dts-bindings/mmc-spi-slot.txt
new file mode 100644
index 000000000000..c39ac2891951
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/mmc-spi-slot.txt
@@ -0,0 +1,23 @@
1MMC/SD/SDIO slot directly connected to a SPI bus
2
3Required properties:
4- compatible : should be "mmc-spi-slot".
5- reg : should specify SPI address (chip-select number).
6- spi-max-frequency : maximum frequency for this device (Hz).
7- voltage-ranges : two cells are required, first cell specifies minimum
8 slot voltage (mV), second cell specifies maximum slot voltage (mV).
9 Several ranges could be specified.
10- gpios : (optional) may specify GPIOs in this order: Card-Detect GPIO,
11 Write-Protect GPIO.
12
13Example:
14
15 mmc-slot@0 {
16 compatible = "fsl,mpc8323rdb-mmc-slot",
17 "mmc-spi-slot";
18 reg = <0>;
19 gpios = <&qe_pio_d 14 1
20 &qe_pio_d 15 0>;
21 voltage-ranges = <3300 3300>;
22 spi-max-frequency = <50000000>;
23 };
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 3ef339f491e0..5ba4d3fc625a 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -126,7 +126,7 @@ This uses the /cgroup virtual file system and "/cgroup/<cgroup>/cpu.rt_runtime_u
126to control the CPU time reserved for each control group instead. 126to control the CPU time reserved for each control group instead.
127 127
128For more information on working with control groups, you should read 128For more information on working with control groups, you should read
129Documentation/cgroups.txt as well. 129Documentation/cgroups/cgroups.txt as well.
130 130
131Group settings are checked against the following limits in order to keep the configuration 131Group settings are checked against the following limits in order to keep the configuration
132schedulable: 132schedulable:
diff --git a/Documentation/scsi/aacraid.txt b/Documentation/scsi/aacraid.txt
index ddace3afc83b..30f643f611b2 100644
--- a/Documentation/scsi/aacraid.txt
+++ b/Documentation/scsi/aacraid.txt
@@ -60,17 +60,9 @@ Supported Cards/Chipsets
60 9005:0285:9005:02d5 Adaptec ASR-2405 (Voodoo40 Lite) 60 9005:0285:9005:02d5 Adaptec ASR-2405 (Voodoo40 Lite)
61 9005:0285:9005:02d6 Adaptec ASR-2445 (Voodoo44 Lite) 61 9005:0285:9005:02d6 Adaptec ASR-2445 (Voodoo44 Lite)
62 9005:0285:9005:02d7 Adaptec ASR-2805 (Voodoo80 Lite) 62 9005:0285:9005:02d7 Adaptec ASR-2805 (Voodoo80 Lite)
63 9005:0285:9005:02d8 Adaptec 5405G (Voodoo40 PM) 63 9005:0285:9005:02d8 Adaptec 5405Z (Voodoo40 BLBU)
64 9005:0285:9005:02d9 Adaptec 5445G (Voodoo44 PM) 64 9005:0285:9005:02d9 Adaptec 5445Z (Voodoo44 BLBU)
65 9005:0285:9005:02da Adaptec 5805G (Voodoo80 PM) 65 9005:0285:9005:02da Adaptec 5805Z (Voodoo80 BLBU)
66 9005:0285:9005:02db Adaptec 5085G (Voodoo08 PM)
67 9005:0285:9005:02dc Adaptec 51245G (Voodoo124 PM)
68 9005:0285:9005:02dd Adaptec 51645G (Voodoo164 PM)
69 9005:0285:9005:02de Adaptec 52445G (Voodoo244 PM)
70 9005:0285:9005:02df Adaptec ASR-2045G (Voodoo04 Lite PM)
71 9005:0285:9005:02e0 Adaptec ASR-2405G (Voodoo40 Lite PM)
72 9005:0285:9005:02e1 Adaptec ASR-2445G (Voodoo44 Lite PM)
73 9005:0285:9005:02e2 Adaptec ASR-2805G (Voodoo80 Lite PM)
74 1011:0046:9005:0364 Adaptec 5400S (Mustang) 66 1011:0046:9005:0364 Adaptec 5400S (Mustang)
75 1011:0046:9005:0365 Adaptec 5400S (Mustang) 67 1011:0046:9005:0365 Adaptec 5400S (Mustang)
76 9005:0287:9005:0800 Adaptec Themisto (Jupiter) 68 9005:0287:9005:0800 Adaptec Themisto (Jupiter)
@@ -140,6 +132,7 @@ Deanna Bonds (non-DASD support, PAE fibs and 64 bit,
140 where fibs that go to the hardware are consistently called hw_fibs and 132 where fibs that go to the hardware are consistently called hw_fibs and
141 not just fibs like the name of the driver tracking structure) 133 not just fibs like the name of the driver tracking structure)
142Mark Salyzyn <Mark_Salyzyn@adaptec.com> Fixed panic issues and added some new product ids for upcoming hbas. Performance tuning, card failover and bug mitigations. 134Mark Salyzyn <Mark_Salyzyn@adaptec.com> Fixed panic issues and added some new product ids for upcoming hbas. Performance tuning, card failover and bug mitigations.
135Achim Leubner <Achim_Leubner@adaptec.com>
143 136
144Original Driver 137Original Driver
145------------------------- 138-------------------------
diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
new file mode 100644
index 000000000000..ebc50f808ea4
--- /dev/null
+++ b/Documentation/slow-work.txt
@@ -0,0 +1,174 @@
1 ====================================
2 SLOW WORK ITEM EXECUTION THREAD POOL
3 ====================================
4
5By: David Howells <dhowells@redhat.com>
6
7The slow work item execution thread pool is a pool of threads for performing
8things that take a relatively long time, such as making mkdir calls.
9Typically, when processing something, these items will spend a lot of time
10blocking a thread on I/O, thus making that thread unavailable for doing other
11work.
12
13The standard workqueue model is unsuitable for this class of work item as that
14limits the owner to a single thread or a single thread per CPU. For some
15tasks, however, more threads - or fewer - are required.
16
17There is just one pool per system. It contains no threads unless something
18wants to use it - and that something must register its interest first. When
19the pool is active, the number of threads it contains is dynamic, varying
20between a maximum and minimum setting, depending on the load.
21
22
23====================
24CLASSES OF WORK ITEM
25====================
26
27This pool supports two classes of work items:
28
29 (*) Slow work items.
30
31 (*) Very slow work items.
32
33The former are expected to finish much quicker than the latter.
34
35An operation of the very slow class may do a batch combination of several
36lookups, mkdirs, and a create for instance.
37
38An operation of the ordinarily slow class may, for example, write stuff or
39expand files, provided the time taken to do so isn't too long.
40
41Operations of both types may sleep during execution, thus tying up the thread
42loaned to them.
43
44
45THREAD-TO-CLASS ALLOCATION
46--------------------------
47
48Not all the threads in the pool are available to work on very slow work items.
49The number will be between one and one fewer than the number of active threads.
50This is configurable (see the "Pool Configuration" section).
51
52All the threads are available to work on ordinarily slow work items, but a
53percentage of the threads will prefer to work on very slow work items.
54
55The configuration ensures that at least one thread will be available to work on
56very slow work items, and at least one thread will be available that won't work
57on very slow work items at all.
58
59
60=====================
61USING SLOW WORK ITEMS
62=====================
63
64Firstly, a module or subsystem wanting to make use of slow work items must
65register its interest:
66
67 int ret = slow_work_register_user();
68
69This will return 0 if successful, or a -ve error upon failure.
70
71
72Slow work items may then be set up by:
73
74 (1) Declaring a slow_work struct type variable:
75
76 #include <linux/slow-work.h>
77
78 struct slow_work myitem;
79
80 (2) Declaring the operations to be used for this item:
81
82 struct slow_work_ops myitem_ops = {
83 .get_ref = myitem_get_ref,
84 .put_ref = myitem_put_ref,
85 .execute = myitem_execute,
86 };
87
88 [*] For a description of the ops, see section "Item Operations".
89
90 (3) Initialising the item:
91
92 slow_work_init(&myitem, &myitem_ops);
93
94 or:
95
96 vslow_work_init(&myitem, &myitem_ops);
97
98 depending on its class.
99
100A suitably set up work item can then be enqueued for processing:
101
102 int ret = slow_work_enqueue(&myitem);
103
104This will return a -ve error if the thread pool is unable to gain a reference
105on the item, 0 otherwise.
106
107
108The items are reference counted, so there ought to be no need for a flush
109operation. When all a module's slow work items have been processed, and the
110module has no further interest in the facility, it should unregister its
111interest:
112
113 slow_work_unregister_user();
114
115
116===============
117ITEM OPERATIONS
118===============
119
120Each work item requires a table of operations of type struct slow_work_ops.
121All members are required:
122
123 (*) Get a reference on an item:
124
125 int (*get_ref)(struct slow_work *work);
126
127 This allows the thread pool to attempt to pin an item by getting a
128 reference on it. This function should return 0 if the reference was
129 granted, or a -ve error otherwise. If an error is returned,
130 slow_work_enqueue() will fail.
131
132 The reference is held whilst the item is queued and whilst it is being
133 executed. The item may then be requeued with the same reference held, or
134 the reference will be released.
135
136 (*) Release a reference on an item:
137
138 void (*put_ref)(struct slow_work *work);
139
140 This allows the thread pool to unpin an item by releasing the reference on
141 it. The thread pool will not touch the item again once this has been
142 called.
143
144 (*) Execute an item:
145
146 void (*execute)(struct slow_work *work);
147
148 This should perform the work required of the item. It may sleep, it may
149 perform disk I/O and it may wait for locks.
150
151
152==================
153POOL CONFIGURATION
154==================
155
156The slow-work thread pool has a number of configurables:
157
158 (*) /proc/sys/kernel/slow-work/min-threads
159
160 The minimum number of threads that should be in the pool whilst it is in
161 use. This may be anywhere between 2 and max-threads.
162
163 (*) /proc/sys/kernel/slow-work/max-threads
164
165 The maximum number of threads that should be in the pool. This may be
166 anywhere between min-threads and 255 or NR_CPUS * 2, whichever is greater.
167
168 (*) /proc/sys/kernel/slow-work/vslow-percentage
169
170 The percentage of active threads in the pool that may be used to execute
171 very slow work items. This may be between 1 and 99. The resultant number
172 is bounded to between 1 and one fewer than the number of active threads.
173 This ensures there is always at least one thread that can process very
174 slow work items, and always at least one thread that won't.
diff --git a/Documentation/sysctl/00-INDEX b/Documentation/sysctl/00-INDEX
index a20a9066dc4c..1286f455992f 100644
--- a/Documentation/sysctl/00-INDEX
+++ b/Documentation/sysctl/00-INDEX
@@ -10,6 +10,8 @@ fs.txt
10 - documentation for /proc/sys/fs/*. 10 - documentation for /proc/sys/fs/*.
11kernel.txt 11kernel.txt
12 - documentation for /proc/sys/kernel/*. 12 - documentation for /proc/sys/kernel/*.
13net.txt
14 - documentation for /proc/sys/net/*.
13sunrpc.txt 15sunrpc.txt
14 - documentation for /proc/sys/sunrpc/*. 16 - documentation for /proc/sys/sunrpc/*.
15vm.txt 17vm.txt
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index f99254327ae5..1458448436cc 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -1,5 +1,6 @@
1Documentation for /proc/sys/fs/* kernel version 2.2.10 1Documentation for /proc/sys/fs/* kernel version 2.2.10
2 (c) 1998, 1999, Rik van Riel <riel@nl.linux.org> 2 (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
3 (c) 2009, Shen Feng <shen@cn.fujitsu.com>
3 4
4For general info and legal blurb, please look in README. 5For general info and legal blurb, please look in README.
5 6
@@ -14,7 +15,12 @@ kernel. Since some of the files _can_ be used to screw up your
14system, it is advisable to read both documentation and source 15system, it is advisable to read both documentation and source
15before actually making adjustments. 16before actually making adjustments.
16 17
181. /proc/sys/fs
19----------------------------------------------------------
20
17Currently, these files are in /proc/sys/fs: 21Currently, these files are in /proc/sys/fs:
22- aio-max-nr
23- aio-nr
18- dentry-state 24- dentry-state
19- dquot-max 25- dquot-max
20- dquot-nr 26- dquot-nr
@@ -30,8 +36,15 @@ Currently, these files are in /proc/sys/fs:
30- super-max 36- super-max
31- super-nr 37- super-nr
32 38
33Documentation for the files in /proc/sys/fs/binfmt_misc is 39==============================================================
34in Documentation/binfmt_misc.txt. 40
41aio-nr & aio-max-nr:
42
43aio-nr is the running total of the number of events specified on the
44io_setup system call for all currently active aio contexts. If aio-nr
45reaches aio-max-nr then io_setup will fail with EAGAIN. Note that
46raising aio-max-nr does not result in the pre-allocation or re-sizing
47of any kernel data structures.
35 48
36============================================================== 49==============================================================
37 50
@@ -178,3 +191,60 @@ requests. aio-max-nr allows you to change the maximum value
178aio-nr can grow to. 191aio-nr can grow to.
179 192
180============================================================== 193==============================================================
194
195
1962. /proc/sys/fs/binfmt_misc
197----------------------------------------------------------
198
199Documentation for the files in /proc/sys/fs/binfmt_misc is
200in Documentation/binfmt_misc.txt.
201
202
2033. /proc/sys/fs/mqueue - POSIX message queues filesystem
204----------------------------------------------------------
205
206The "mqueue" filesystem provides the necessary kernel features to enable the
207creation of a user space library that implements the POSIX message queues
208API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System
209Interfaces specification.)
210
211The "mqueue" filesystem contains values for determining/setting the amount of
212resources used by the file system.
213
214/proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the
215maximum number of message queues allowed on the system.
216
217/proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the
218maximum number of messages in a queue value. In fact it is the limiting value
219for another (user) limit which is set in mq_open invocation. This attribute of
220a queue must be less than or equal to msg_max.
221
222/proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the
223maximum message size value (it is every message queue's attribute set during
224its creation).
225
226
2274. /proc/sys/fs/epoll - Configuration options for the epoll interface
228--------------------------------------------------------
229
230This directory contains configuration options for the epoll(7) interface.
231
232max_user_instances
233------------------
234
235This is the maximum number of epoll file descriptors that a single user can
236have open at a given time. The default value is 128, and should be enough
237for normal users.
238
239max_user_watches
240----------------
241
242Every epoll file descriptor can store a number of files to be monitored
243for event readiness. Each one of these monitored files constitutes a "watch".
244This configuration option sets the maximum number of "watches" that are
245allowed for each user.
246Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
247on a 64bit one.
248The current default value for max_user_watches is the 1/32 of the available
249low memory, divided for the "watch" cost in bytes.
250
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index a4ccdd1981cf..f11ca7979fa6 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -1,5 +1,6 @@
1Documentation for /proc/sys/kernel/* kernel version 2.2.10 1Documentation for /proc/sys/kernel/* kernel version 2.2.10
2 (c) 1998, 1999, Rik van Riel <riel@nl.linux.org> 2 (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
3 (c) 2009, Shen Feng <shen@cn.fujitsu.com>
3 4
4For general info and legal blurb, please look in README. 5For general info and legal blurb, please look in README.
5 6
@@ -18,6 +19,7 @@ Currently, these files might (depending on your configuration)
18show up in /proc/sys/kernel: 19show up in /proc/sys/kernel:
19- acpi_video_flags 20- acpi_video_flags
20- acct 21- acct
22- auto_msgmni
21- core_pattern 23- core_pattern
22- core_uses_pid 24- core_uses_pid
23- ctrl-alt-del 25- ctrl-alt-del
@@ -33,6 +35,7 @@ show up in /proc/sys/kernel:
33- msgmax 35- msgmax
34- msgmnb 36- msgmnb
35- msgmni 37- msgmni
38- nmi_watchdog
36- osrelease 39- osrelease
37- ostype 40- ostype
38- overflowgid 41- overflowgid
@@ -40,6 +43,7 @@ show up in /proc/sys/kernel:
40- panic 43- panic
41- pid_max 44- pid_max
42- powersave-nap [ PPC only ] 45- powersave-nap [ PPC only ]
46- panic_on_unrecovered_nmi
43- printk 47- printk
44- randomize_va_space 48- randomize_va_space
45- real-root-dev ==> Documentation/initrd.txt 49- real-root-dev ==> Documentation/initrd.txt
@@ -55,6 +59,7 @@ show up in /proc/sys/kernel:
55- sysrq ==> Documentation/sysrq.txt 59- sysrq ==> Documentation/sysrq.txt
56- tainted 60- tainted
57- threads-max 61- threads-max
62- unknown_nmi_panic
58- version 63- version
59 64
60============================================================== 65==============================================================
@@ -381,3 +386,51 @@ can be ORed together:
381 512 - A kernel warning has occurred. 386 512 - A kernel warning has occurred.
3821024 - A module from drivers/staging was loaded. 3871024 - A module from drivers/staging was loaded.
383 388
389==============================================================
390
391auto_msgmni:
392
393Enables/Disables automatic recomputing of msgmni upon memory add/remove or
394upon ipc namespace creation/removal (see the msgmni description above).
395Echoing "1" into this file enables msgmni automatic recomputing.
396Echoing "0" turns it off.
397auto_msgmni default value is 1.
398
399==============================================================
400
401nmi_watchdog:
402
403Enables/Disables the NMI watchdog on x86 systems. When the value is non-zero
404the NMI watchdog is enabled and will continuously test all online cpus to
405determine whether or not they are still functioning properly. Currently,
406passing "nmi_watchdog=" parameter at boot time is required for this function
407to work.
408
409If LAPIC NMI watchdog method is in use (nmi_watchdog=2 kernel parameter), the
410NMI watchdog shares registers with oprofile. By disabling the NMI watchdog,
411oprofile may have more registers to utilize.
412
413==============================================================
414
415unknown_nmi_panic:
416
417The value in this file affects behavior of handling NMI. When the value is
418non-zero, unknown NMI is trapped and then panic occurs. At that time, kernel
419debugging information is displayed on console.
420
421For example, the NMI switch that most IA32 servers have raises an unknown NMI.
422If a system hangs up, try pressing the NMI switch.
423
424==============================================================
425
426panic_on_unrecovered_nmi:
427
428The default Linux behaviour on an NMI of either memory or unknown is to continue
429operation. For many environments such as scientific computing it is preferable
430that the box is taken out and the error dealt with than an uncorrected
431parity/ECC error get propagated.
432
433A small number of systems do generate NMI's for bizarre random reasons such as
434power management so the default is off. That sysctl works like the existing
435panic controls already in that directory.
436
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
new file mode 100644
index 000000000000..a34d55b65441
--- /dev/null
+++ b/Documentation/sysctl/net.txt
@@ -0,0 +1,175 @@
1Documentation for /proc/sys/net/* kernel version 2.4.0-test11-pre4
2 (c) 1999 Terrehon Bowden <terrehon@pacbell.net>
3 Bodo Bauer <bb@ricochet.net>
4 (c) 2000 Jorge Nerin <comandante@zaralinux.com>
5 (c) 2009 Shen Feng <shen@cn.fujitsu.com>
6
7For general info and legal blurb, please look in README.
8
9==============================================================
10
11This file contains the documentation for the sysctl files in
12/proc/sys/net and is valid for Linux kernel version 2.4.0-test11-pre4.
13
14The interface to the networking parts of the kernel is located in
15/proc/sys/net. The following table shows all possible subdirectories. You may
16see only some of them, depending on your kernel's configuration.
17
18
19Table : Subdirectories in /proc/sys/net
20..............................................................................
21 Directory Content Directory Content
22 core General parameter appletalk Appletalk protocol
23 unix Unix domain sockets netrom NET/ROM
24 802 E802 protocol ax25 AX25
25 ethernet Ethernet protocol rose X.25 PLP layer
26 ipv4 IP version 4 x25 X.25 protocol
27 ipx IPX token-ring IBM token ring
28 bridge Bridging decnet DEC net
29 ipv6 IP version 6
30..............................................................................
31
321. /proc/sys/net/core - Network core options
33-------------------------------------------------------
34
35rmem_default
36------------
37
38The default setting of the socket receive buffer in bytes.
39
40rmem_max
41--------
42
43The maximum receive socket buffer size in bytes.
44
45wmem_default
46------------
47
48The default setting (in bytes) of the socket send buffer.
49
50wmem_max
51--------
52
53The maximum send socket buffer size in bytes.
54
55message_burst and message_cost
56------------------------------
57
58These parameters are used to limit the warning messages written to the kernel
59log from the networking code. They enforce a rate limit to make a
60denial-of-service attack impossible. A higher message_cost factor results in
61fewer messages that will be written. Message_burst controls when messages will
62be dropped. The default settings limit warning messages to one every five
63seconds.
64
65warnings
66--------
67
68This controls console messages from the networking stack that can occur because
69of problems on the network like duplicate address or bad checksums. Normally,
70this should be enabled, but if the problem persists the messages can be
71disabled.
72
73netdev_budget
74-------------
75
76Maximum number of packets taken from all interfaces in one polling cycle (NAPI
77poll). In one polling cycle interfaces which are registered to polling are
78probed in a round-robin manner. The limit of packets in one such probe can be
79set per-device via sysfs class/net/<device>/weight .
80
81netdev_max_backlog
82------------------
83
84Maximum number of packets, queued on the INPUT side, when the interface
85receives packets faster than the kernel can process them.
86
87optmem_max
88----------
89
90Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence
91of struct cmsghdr structures with appended data.
92
932. /proc/sys/net/unix - Parameters for Unix domain sockets
94-------------------------------------------------------
95
96There is only one file in this directory.
97unix_dgram_qlen limits the max number of datagrams queued in Unix domain
98socket's buffer. It will not take effect unless the PF_UNIX flag is specified.
99
100
1013. /proc/sys/net/ipv4 - IPV4 settings
102-------------------------------------------------------
103Please see: Documentation/networking/ip-sysctl.txt and ipvs-sysctl.txt for
104descriptions of these entries.
105
106
1074. Appletalk
108-------------------------------------------------------
109
110The /proc/sys/net/appletalk directory holds the Appletalk configuration data
111when Appletalk is loaded. The configurable parameters are:
112
113aarp-expiry-time
114----------------
115
116The amount of time we keep an ARP entry before expiring it. Used to age out
117old hosts.
118
119aarp-resolve-time
120-----------------
121
122The amount of time we will spend trying to resolve an Appletalk address.
123
124aarp-retransmit-limit
125---------------------
126
127The number of times we will retransmit a query before giving up.
128
129aarp-tick-time
130--------------
131
132Controls the rate at which expires are checked.
133
134The directory /proc/net/appletalk holds the list of active Appletalk sockets
135on a machine.
136
137The fields indicate the DDP type, the local address (in network:node format)
138the remote address, the size of the transmit pending queue, the size of the
139received queue (bytes waiting for applications to read) the state and the uid
140owning the socket.
141
142/proc/net/atalk_iface lists all the interfaces configured for appletalk. It
143shows the name of the interface, its Appletalk address, the network range on
144that address (or network number for phase 1 networks), and the status of the
145interface.
146
147/proc/net/atalk_route lists each known network route. It lists the target
148(network) that the route leads to, the router (may be directly connected), the
149route flags, and the device the route is using.
150
151
1525. IPX
153-------------------------------------------------------
154
155The IPX protocol has no tunable values in /proc/sys/net.
156
157The IPX protocol does, however, provide /proc/net/ipx. This lists each IPX
158socket giving the local and remote addresses in Novell format (that is
159network:node:port). In accordance with the strange Novell tradition,
160everything but the port is in hex. Not_Connected is displayed for sockets that
161are not tied to a specific remote address. The Tx and Rx queue sizes indicate
162the number of bytes pending for transmission and reception. The state
163indicates the state the socket is in and the uid is the owning uid of the
164socket.
165
166The /proc/net/ipx_interface file lists all IPX interfaces. For each interface
167it gives the network number, the node number, and indicates if the network is
168the primary network. It also indicates which device it is bound to (or
169Internal for internal networks) and the Frame Type if appropriate. Linux
170supports 802.3, 802.2, 802.2 SNAP and DIX (Blue Book) ethernet framing for
171IPX.
172
173The /proc/net/ipx_route table holds a list of IPX routes. For each route it
174gives the destination network, the router node (or Directly) and the network
175address of the router (or Connected) for internal networks.
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index 9e592c718afb..cf42b820ff9d 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -81,6 +81,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
81 81
82'i' - Send a SIGKILL to all processes, except for init. 82'i' - Send a SIGKILL to all processes, except for init.
83 83
84'j' - Forcibly "Just thaw it" - filesystems frozen by the FIFREEZE ioctl.
85
84'k' - Secure Access Key (SAK) Kills all programs on the current virtual 86'k' - Secure Access Key (SAK) Kills all programs on the current virtual
85 console. NOTE: See important comments below in SAK section. 87 console. NOTE: See important comments below in SAK section.
86 88
@@ -113,6 +115,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
113 115
114'x' - Used by xmon interface on ppc/powerpc platforms. 116'x' - Used by xmon interface on ppc/powerpc platforms.
115 117
118'z' - Dump the ftrace buffer
119
116'0'-'9' - Sets the console log level, controlling which kernel messages 120'0'-'9' - Sets the console log level, controlling which kernel messages
117 will be printed to your console. ('0', for example would make 121 will be printed to your console. ('0', for example would make
118 it so that only emergency messages like PANICs or OOPSes would 122 it so that only emergency messages like PANICs or OOPSes would
@@ -160,6 +164,9 @@ t'E'rm and k'I'll are useful if you have some sort of runaway process you
160are unable to kill any other way, especially if it's spawning other 164are unable to kill any other way, especially if it's spawning other
161processes. 165processes.
162 166
167"'J'ust thaw it" is useful if your system becomes unresponsive due to a frozen
168(probably root) filesystem via the FIFREEZE ioctl.
169
163* Sometimes SysRq seems to get 'stuck' after using it, what can I do? 170* Sometimes SysRq seems to get 'stuck' after using it, what can I do?
164~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 171~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
165That happens to me, also. I've found that tapping shift, alt, and control 172That happens to me, also. I've found that tapping shift, alt, and control
diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt
index 6f0a044f5b5e..c0e1ceed75a4 100644
--- a/Documentation/tracepoints.txt
+++ b/Documentation/tracepoints.txt
@@ -45,8 +45,8 @@ In include/trace/subsys.h :
45#include <linux/tracepoint.h> 45#include <linux/tracepoint.h>
46 46
47DECLARE_TRACE(subsys_eventname, 47DECLARE_TRACE(subsys_eventname,
48 TPPROTO(int firstarg, struct task_struct *p), 48 TP_PROTO(int firstarg, struct task_struct *p),
49 TPARGS(firstarg, p)); 49 TP_ARGS(firstarg, p));
50 50
51In subsys/file.c (where the tracing statement must be added) : 51In subsys/file.c (where the tracing statement must be added) :
52 52
@@ -66,10 +66,10 @@ Where :
66 - subsys is the name of your subsystem. 66 - subsys is the name of your subsystem.
67 - eventname is the name of the event to trace. 67 - eventname is the name of the event to trace.
68 68
69- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the 69- TP_PROTO(int firstarg, struct task_struct *p) is the prototype of the
70 function called by this tracepoint. 70 function called by this tracepoint.
71 71
72- TPARGS(firstarg, p) are the parameters names, same as found in the 72- TP_ARGS(firstarg, p) are the parameters names, same as found in the
73 prototype. 73 prototype.
74 74
75Connecting a function (probe) to a tracepoint is done by providing a 75Connecting a function (probe) to a tracepoint is done by providing a
@@ -103,13 +103,14 @@ used to export the defined tracepoints.
103 103
104* Probe / tracepoint example 104* Probe / tracepoint example
105 105
106See the example provided in samples/tracepoints/src 106See the example provided in samples/tracepoints
107 107
108Compile them with your kernel. 108Compile them with your kernel. They are built during 'make' (not
109'make modules') when CONFIG_SAMPLE_TRACEPOINTS=m.
109 110
110Run, as root : 111Run, as root :
111modprobe tracepoint-example (insmod order is not important) 112modprobe tracepoint-sample (insmod order is not important)
112modprobe tracepoint-probe-example 113modprobe tracepoint-probe-sample
113cat /proc/tracepoint-example (returns an expected error) 114cat /proc/tracepoint-sample (returns an expected error)
114rmmod tracepoint-example tracepoint-probe-example 115rmmod tracepoint-sample tracepoint-probe-sample
115dmesg 116dmesg
diff --git a/Documentation/video4linux/CARDLIST.bttv b/Documentation/video4linux/CARDLIST.bttv
index 0d93fa1ac25e..f11c583295e9 100644
--- a/Documentation/video4linux/CARDLIST.bttv
+++ b/Documentation/video4linux/CARDLIST.bttv
@@ -135,7 +135,7 @@
135134 -> Adlink RTV24 135134 -> Adlink RTV24
136135 -> DViCO FusionHDTV 5 Lite [18ac:d500] 136135 -> DViCO FusionHDTV 5 Lite [18ac:d500]
137136 -> Acorp Y878F [9511:1540] 137136 -> Acorp Y878F [9511:1540]
138137 -> Conceptronic CTVFMi v2 138137 -> Conceptronic CTVFMi v2 [036e:109e]
139138 -> Prolink Pixelview PV-BT878P+ (Rev.2E) 139138 -> Prolink Pixelview PV-BT878P+ (Rev.2E)
140139 -> Prolink PixelView PlayTV MPEG2 PV-M4900 140139 -> Prolink PixelView PlayTV MPEG2 PV-M4900
141140 -> Osprey 440 [0070:ff07] 141140 -> Osprey 440 [0070:ff07]
@@ -154,3 +154,7 @@
154153 -> PHYTEC VD-012 (bt878) 154153 -> PHYTEC VD-012 (bt878)
155154 -> PHYTEC VD-012-X1 (bt878) 155154 -> PHYTEC VD-012-X1 (bt878)
156155 -> PHYTEC VD-012-X2 (bt878) 156155 -> PHYTEC VD-012-X2 (bt878)
157156 -> IVCE-8784 [0000:f050,0001:f050,0002:f050,0003:f050]
158157 -> Geovision GV-800(S) (master) [800a:763d]
159158 -> Geovision GV-800(S) (slave) [800b:763d,800c:763d,800d:763d]
160159 -> ProVideo PV183 [1830:1540,1831:1540,1832:1540,1833:1540,1834:1540,1835:1540,1836:1540,1837:1540]
diff --git a/Documentation/video4linux/CARDLIST.cx23885 b/Documentation/video4linux/CARDLIST.cx23885
index 35ea130e9898..91aa3c0f0dd2 100644
--- a/Documentation/video4linux/CARDLIST.cx23885
+++ b/Documentation/video4linux/CARDLIST.cx23885
@@ -12,3 +12,7 @@
12 11 -> DViCO FusionHDTV DVB-T Dual Express [18ac:db78] 12 11 -> DViCO FusionHDTV DVB-T Dual Express [18ac:db78]
13 12 -> Leadtek Winfast PxDVR3200 H [107d:6681] 13 12 -> Leadtek Winfast PxDVR3200 H [107d:6681]
14 13 -> Compro VideoMate E650F [185b:e800] 14 13 -> Compro VideoMate E650F [185b:e800]
15 14 -> TurboSight TBS 6920 [6920:8888]
16 15 -> TeVii S470 [d470:9022]
17 16 -> DVBWorld DVB-S2 2005 [0001:2005]
18 17 -> NetUP Dual DVB-S2 CI [1b55:2a2c]
diff --git a/Documentation/video4linux/CARDLIST.cx88 b/Documentation/video4linux/CARDLIST.cx88
index 0d08f1edcf6d..71e9db0b26f7 100644
--- a/Documentation/video4linux/CARDLIST.cx88
+++ b/Documentation/video4linux/CARDLIST.cx88
@@ -77,3 +77,4 @@
77 76 -> SATTRADE ST4200 DVB-S/S2 [b200:4200] 77 76 -> SATTRADE ST4200 DVB-S/S2 [b200:4200]
78 77 -> TBS 8910 DVB-S [8910:8888] 78 77 -> TBS 8910 DVB-S [8910:8888]
79 78 -> Prof 6200 DVB-S [b022:3022] 79 78 -> Prof 6200 DVB-S [b022:3022]
80 79 -> Terratec Cinergy HT PCI MKII [153b:1177]
diff --git a/Documentation/video4linux/CARDLIST.em28xx b/Documentation/video4linux/CARDLIST.em28xx
index 75bded8a4aa2..78d0a6eed571 100644
--- a/Documentation/video4linux/CARDLIST.em28xx
+++ b/Documentation/video4linux/CARDLIST.em28xx
@@ -7,12 +7,12 @@
7 6 -> Terratec Cinergy 200 USB (em2800) 7 6 -> Terratec Cinergy 200 USB (em2800)
8 7 -> Leadtek Winfast USB II (em2800) [0413:6023] 8 7 -> Leadtek Winfast USB II (em2800) [0413:6023]
9 8 -> Kworld USB2800 (em2800) 9 8 -> Kworld USB2800 (em2800)
10 9 -> Pinnacle Dazzle DVC 90/DVC 100 (em2820/em2840) [2304:0207,2304:021a] 10 9 -> Pinnacle Dazzle DVC 90/100/101/107 / Kaiser Baas Video to DVD maker (em2820/em2840) [1b80:e302,2304:0207,2304:021a]
11 10 -> Hauppauge WinTV HVR 900 (em2880) [2040:6500] 11 10 -> Hauppauge WinTV HVR 900 (em2880) [2040:6500]
12 11 -> Terratec Hybrid XS (em2880) [0ccd:0042] 12 11 -> Terratec Hybrid XS (em2880) [0ccd:0042]
13 12 -> Kworld PVR TV 2800 RF (em2820/em2840) 13 12 -> Kworld PVR TV 2800 RF (em2820/em2840)
14 13 -> Terratec Prodigy XS (em2880) [0ccd:0047] 14 13 -> Terratec Prodigy XS (em2880) [0ccd:0047]
15 14 -> Pixelview Prolink PlayTV USB 2.0 (em2820/em2840) 15 14 -> SIIG AVTuner-PVR / Pixelview Prolink PlayTV USB 2.0 (em2820/em2840)
16 15 -> V-Gear PocketTV (em2800) 16 15 -> V-Gear PocketTV (em2800)
17 16 -> Hauppauge WinTV HVR 950 (em2883) [2040:6513,2040:6517,2040:651b] 17 16 -> Hauppauge WinTV HVR 950 (em2883) [2040:6513,2040:6517,2040:651b]
18 17 -> Pinnacle PCTV HD Pro Stick (em2880) [2304:0227] 18 17 -> Pinnacle PCTV HD Pro Stick (em2880) [2304:0227]
@@ -30,7 +30,6 @@
30 30 -> Videology 20K14XUSB USB2.0 (em2820/em2840) 30 30 -> Videology 20K14XUSB USB2.0 (em2820/em2840)
31 31 -> Usbgear VD204v9 (em2821) 31 31 -> Usbgear VD204v9 (em2821)
32 32 -> Supercomp USB 2.0 TV (em2821) 32 32 -> Supercomp USB 2.0 TV (em2821)
33 33 -> SIIG AVTuner-PVR/Prolink PlayTV USB 2.0 (em2821)
34 34 -> Terratec Cinergy A Hybrid XS (em2860) [0ccd:004f] 33 34 -> Terratec Cinergy A Hybrid XS (em2860) [0ccd:004f]
35 35 -> Typhoon DVD Maker (em2860) 34 35 -> Typhoon DVD Maker (em2860)
36 36 -> NetGMBH Cam (em2860) 35 36 -> NetGMBH Cam (em2860)
@@ -58,3 +57,7 @@
58 58 -> Compro VideoMate ForYou/Stereo (em2820/em2840) [185b:2041] 57 58 -> Compro VideoMate ForYou/Stereo (em2820/em2840) [185b:2041]
59 60 -> Hauppauge WinTV HVR 850 (em2883) [2040:651f] 58 60 -> Hauppauge WinTV HVR 850 (em2883) [2040:651f]
60 61 -> Pixelview PlayTV Box 4 USB 2.0 (em2820/em2840) 59 61 -> Pixelview PlayTV Box 4 USB 2.0 (em2820/em2840)
60 62 -> Gadmei TVR200 (em2820/em2840)
61 63 -> Kaiomy TVnPC U2 (em2860) [eb1a:e303]
62 64 -> Easy Cap Capture DC-60 (em2860)
63 65 -> IO-DATA GV-MVP/SZ (em2820/em2840) [04bb:0515]
diff --git a/Documentation/video4linux/CARDLIST.saa7134 b/Documentation/video4linux/CARDLIST.saa7134
index b8d470596b0c..6dacf2825259 100644
--- a/Documentation/video4linux/CARDLIST.saa7134
+++ b/Documentation/video4linux/CARDLIST.saa7134
@@ -153,3 +153,5 @@
153152 -> Asus Tiger Rev:1.00 [1043:4857] 153152 -> Asus Tiger Rev:1.00 [1043:4857]
154153 -> Kworld Plus TV Analog Lite PCI [17de:7128] 154153 -> Kworld Plus TV Analog Lite PCI [17de:7128]
155154 -> Avermedia AVerTV GO 007 FM Plus [1461:f31d] 155154 -> Avermedia AVerTV GO 007 FM Plus [1461:f31d]
156155 -> Hauppauge WinTV-HVR1120 ATSC/QAM-Hybrid [0070:6706,0070:6708]
157156 -> Hauppauge WinTV-HVR1110r3 [0070:6707,0070:6709,0070:670a]
diff --git a/Documentation/video4linux/Zoran b/Documentation/video4linux/Zoran
index 295462b2317a..0e89e7676298 100644
--- a/Documentation/video4linux/Zoran
+++ b/Documentation/video4linux/Zoran
@@ -401,8 +401,7 @@ Additional notes for software developers:
401 first set the correct norm. Well, it seems logically correct: TV 401 first set the correct norm. Well, it seems logically correct: TV
402 standard is "more constant" for current country than geometry 402 standard is "more constant" for current country than geometry
403 settings of a variety of TV capture cards which may work in ITU or 403 settings of a variety of TV capture cards which may work in ITU or
404 square pixel format. Remember that users now can lock the norm to 404 square pixel format.
405 avoid any ambiguity.
406-- 405--
407Please note that lavplay/lavrec are also included in the MJPEG-tools 406Please note that lavplay/lavrec are also included in the MJPEG-tools
408(http://mjpeg.sf.net/). 407(http://mjpeg.sf.net/).
diff --git a/Documentation/video4linux/bttv/Insmod-options b/Documentation/video4linux/bttv/Insmod-options
index 5ef75787f83a..bbe3ed667d91 100644
--- a/Documentation/video4linux/bttv/Insmod-options
+++ b/Documentation/video4linux/bttv/Insmod-options
@@ -81,16 +81,6 @@ tuner.o
81 pal=[bdgil] select PAL variant (used for some tuners 81 pal=[bdgil] select PAL variant (used for some tuners
82 only, important for the audio carrier). 82 only, important for the audio carrier).
83 83
84tvmixer.o
85 registers a mixer device for the TV card's volume/bass/treble
86 controls (requires a i2c audio control chip like the msp3400).
87
88 insmod args:
89 debug=1 print some debug info to the syslog.
90 devnr=n allocate device #n (0 == /dev/mixer,
91 1 = /dev/mixer1, ...), default is to
92 use the first free one.
93
94tvaudio.o 84tvaudio.o
95 new, experimental module which is supposed to provide a single 85 new, experimental module which is supposed to provide a single
96 driver for all simple i2c audio control chips (tda/tea*). 86 driver for all simple i2c audio control chips (tda/tea*).
diff --git a/Documentation/video4linux/bttv/README b/Documentation/video4linux/bttv/README
index 7ca2154c2bf5..3a367cdb664e 100644
--- a/Documentation/video4linux/bttv/README
+++ b/Documentation/video4linux/bttv/README
@@ -63,8 +63,8 @@ If you have some knowledge and spare time, please try to fix this
63yourself (patches very welcome of course...) You know: The linux 63yourself (patches very welcome of course...) You know: The linux
64slogan is "Do it yourself". 64slogan is "Do it yourself".
65 65
66There is a mailing list: video4linux-list@redhat.com. 66There is a mailing list: linux-media@vger.kernel.org
67https://listman.redhat.com/mailman/listinfo/video4linux-list 67http://vger.kernel.org/vger-lists.html#linux-media
68 68
69If you have trouble with some specific TV card, try to ask there 69If you have trouble with some specific TV card, try to ask there
70instead of mailing me directly. The chance that someone with the 70instead of mailing me directly. The chance that someone with the
diff --git a/Documentation/video4linux/cx2341x/README.hm12 b/Documentation/video4linux/cx2341x/README.hm12
index 0e213ed095e6..b36148ea0750 100644
--- a/Documentation/video4linux/cx2341x/README.hm12
+++ b/Documentation/video4linux/cx2341x/README.hm12
@@ -32,6 +32,10 @@ Y, U and V planes. This code assumes frames of 720x576 (PAL) pixels.
32The width of a frame is always 720 pixels, regardless of the actual specified 32The width of a frame is always 720 pixels, regardless of the actual specified
33width. 33width.
34 34
35If the height is not a multiple of 32 lines, then the captured video is
36missing macroblocks at the end and is unusable. So the height must be a
37multiple of 32.
38
35-------------------------------------------------------------------------- 39--------------------------------------------------------------------------
36 40
37#include <stdio.h> 41#include <stdio.h>
diff --git a/Documentation/video4linux/gspca.txt b/Documentation/video4linux/gspca.txt
index 1c58a7630146..98529e03a46e 100644
--- a/Documentation/video4linux/gspca.txt
+++ b/Documentation/video4linux/gspca.txt
@@ -32,6 +32,7 @@ spca561 041e:403b Creative Webcam Vista (VF0010)
32zc3xx 041e:4051 Creative Live!Cam Notebook Pro (VF0250) 32zc3xx 041e:4051 Creative Live!Cam Notebook Pro (VF0250)
33ov519 041e:4052 Creative Live! VISTA IM 33ov519 041e:4052 Creative Live! VISTA IM
34zc3xx 041e:4053 Creative Live!Cam Video IM 34zc3xx 041e:4053 Creative Live!Cam Video IM
35vc032x 041e:405b Creative Live! Cam Notebook Ultra (VC0130)
35ov519 041e:405f Creative Live! VISTA VF0330 36ov519 041e:405f Creative Live! VISTA VF0330
36ov519 041e:4060 Creative Live! VISTA VF0350 37ov519 041e:4060 Creative Live! VISTA VF0350
37ov519 041e:4061 Creative Live! VISTA VF0400 38ov519 041e:4061 Creative Live! VISTA VF0400
@@ -193,6 +194,7 @@ spca500 084d:0003 D-Link DSC-350
193spca500 08ca:0103 Aiptek PocketDV 194spca500 08ca:0103 Aiptek PocketDV
194sunplus 08ca:0104 Aiptek PocketDVII 1.3 195sunplus 08ca:0104 Aiptek PocketDVII 1.3
195sunplus 08ca:0106 Aiptek Pocket DV3100+ 196sunplus 08ca:0106 Aiptek Pocket DV3100+
197mr97310a 08ca:0111 Aiptek PenCam VGA+
196sunplus 08ca:2008 Aiptek Mini PenCam 2 M 198sunplus 08ca:2008 Aiptek Mini PenCam 2 M
197sunplus 08ca:2010 Aiptek PocketCam 3M 199sunplus 08ca:2010 Aiptek PocketCam 3M
198sunplus 08ca:2016 Aiptek PocketCam 2 Mega 200sunplus 08ca:2016 Aiptek PocketCam 2 Mega
@@ -215,6 +217,7 @@ pac207 093a:2468 PAC207
215pac207 093a:2470 Genius GF112 217pac207 093a:2470 Genius GF112
216pac207 093a:2471 Genius VideoCam ge111 218pac207 093a:2471 Genius VideoCam ge111
217pac207 093a:2472 Genius VideoCam ge110 219pac207 093a:2472 Genius VideoCam ge110
220pac207 093a:2474 Genius iLook 111
218pac207 093a:2476 Genius e-Messenger 112 221pac207 093a:2476 Genius e-Messenger 112
219pac7311 093a:2600 PAC7311 Typhoon 222pac7311 093a:2600 PAC7311 Typhoon
220pac7311 093a:2601 Philips SPC 610 NC 223pac7311 093a:2601 Philips SPC 610 NC
@@ -279,6 +282,7 @@ spca561 10fd:7e50 FlyCam Usb 100
279zc3xx 10fd:8050 Typhoon Webshot II USB 300k 282zc3xx 10fd:8050 Typhoon Webshot II USB 300k
280ov534 1415:2000 Sony HD Eye for PS3 (SLEH 00201) 283ov534 1415:2000 Sony HD Eye for PS3 (SLEH 00201)
281pac207 145f:013a Trust WB-1300N 284pac207 145f:013a Trust WB-1300N
285vc032x 15b8:6001 HP 2.0 Megapixel
282vc032x 15b8:6002 HP 2.0 Megapixel rz406aa 286vc032x 15b8:6002 HP 2.0 Megapixel rz406aa
283spca501 1776:501c Arowana 300K CMOS Camera 287spca501 1776:501c Arowana 300K CMOS Camera
284t613 17a1:0128 TASCORP JPEG Webcam, NGS Cyclops 288t613 17a1:0128 TASCORP JPEG Webcam, NGS Cyclops
diff --git a/Documentation/video4linux/si470x.txt b/Documentation/video4linux/si470x.txt
index 49679e6aaa76..3a7823e01b4d 100644
--- a/Documentation/video4linux/si470x.txt
+++ b/Documentation/video4linux/si470x.txt
@@ -1,6 +1,6 @@
1Driver for USB radios for the Silicon Labs Si470x FM Radio Receivers 1Driver for USB radios for the Silicon Labs Si470x FM Radio Receivers
2 2
3Copyright (c) 2008 Tobias Lorenz <tobias.lorenz@gmx.net> 3Copyright (c) 2009 Tobias Lorenz <tobias.lorenz@gmx.net>
4 4
5 5
6Information from Silicon Labs 6Information from Silicon Labs
@@ -41,7 +41,7 @@ chips are known to work:
41- 10c4:818a: Silicon Labs USB FM Radio Reference Design 41- 10c4:818a: Silicon Labs USB FM Radio Reference Design
42- 06e1:a155: ADS/Tech FM Radio Receiver (formerly Instant FM Music) (RDX-155-EF) 42- 06e1:a155: ADS/Tech FM Radio Receiver (formerly Instant FM Music) (RDX-155-EF)
43- 1b80:d700: KWorld USB FM Radio SnapMusic Mobile 700 (FM700) 43- 1b80:d700: KWorld USB FM Radio SnapMusic Mobile 700 (FM700)
44- 10c5:819a: DealExtreme USB Radio 44- 10c5:819a: Sanei Electric, Inc. FM USB Radio (sold as DealExtreme.com PCear)
45 45
46 46
47Software 47Software
@@ -52,6 +52,7 @@ Testing is usually done with most application under Debian/testing:
52- gradio - GTK FM radio tuner 52- gradio - GTK FM radio tuner
53- kradio - Comfortable Radio Application for KDE 53- kradio - Comfortable Radio Application for KDE
54- radio - ncurses-based radio application 54- radio - ncurses-based radio application
55- mplayer - The Ultimate Movie Player For Linux
55 56
56There is also a library libv4l, which can be used. It's going to have a function 57There is also a library libv4l, which can be used. It's going to have a function
57for frequency seeking, either by using hardware functionality as in radio-si470x 58for frequency seeking, either by using hardware functionality as in radio-si470x
@@ -69,7 +70,7 @@ Audio Listing
69USB Audio is provided by the ALSA snd_usb_audio module. It is recommended to 70USB Audio is provided by the ALSA snd_usb_audio module. It is recommended to
70also select SND_USB_AUDIO, as this is required to get sound from the radio. For 71also select SND_USB_AUDIO, as this is required to get sound from the radio. For
71listening you have to redirect the sound, for example using one of the following 72listening you have to redirect the sound, for example using one of the following
72commands. 73commands. Please adjust the audio devices to your needs (/dev/dsp* and hw:x,x).
73 74
74If you just want to test audio (very poor quality): 75If you just want to test audio (very poor quality):
75cat /dev/dsp1 > /dev/dsp 76cat /dev/dsp1 > /dev/dsp
@@ -80,6 +81,10 @@ sox -2 --endian little -r 96000 -t oss /dev/dsp1 -t oss /dev/dsp
80If you use arts try: 81If you use arts try:
81arecord -D hw:1,0 -r96000 -c2 -f S16_LE | artsdsp aplay -B - 82arecord -D hw:1,0 -r96000 -c2 -f S16_LE | artsdsp aplay -B -
82 83
84If you use mplayer try:
85mplayer -radio adevice=hw=1.0:arate=96000 \
86 -rawaudio rate=96000 \
87 radio://<frequency>/capture
83 88
84Module Parameters 89Module Parameters
85================= 90=================
diff --git a/Documentation/video4linux/v4l2-framework.txt b/Documentation/video4linux/v4l2-framework.txt
index ff124374e9ba..a31177390e55 100644
--- a/Documentation/video4linux/v4l2-framework.txt
+++ b/Documentation/video4linux/v4l2-framework.txt
@@ -47,7 +47,9 @@ All drivers have the following structure:
473) Creating V4L2 device nodes (/dev/videoX, /dev/vbiX, /dev/radioX and 473) Creating V4L2 device nodes (/dev/videoX, /dev/vbiX, /dev/radioX and
48 /dev/vtxX) and keeping track of device-node specific data. 48 /dev/vtxX) and keeping track of device-node specific data.
49 49
504) Filehandle-specific structs containing per-filehandle data. 504) Filehandle-specific structs containing per-filehandle data;
51
525) video buffer handling.
51 53
52This is a rough schematic of how it all relates: 54This is a rough schematic of how it all relates:
53 55
@@ -82,12 +84,20 @@ You must register the device instance:
82 v4l2_device_register(struct device *dev, struct v4l2_device *v4l2_dev); 84 v4l2_device_register(struct device *dev, struct v4l2_device *v4l2_dev);
83 85
84Registration will initialize the v4l2_device struct and link dev->driver_data 86Registration will initialize the v4l2_device struct and link dev->driver_data
85to v4l2_dev. Registration will also set v4l2_dev->name to a value derived from 87to v4l2_dev. If v4l2_dev->name is empty then it will be set to a value derived
86dev (driver name followed by the bus_id, to be precise). You may change the 88from dev (driver name followed by the bus_id, to be precise). If you set it
87name after registration if you want. 89up before calling v4l2_device_register then it will be untouched. If dev is
90NULL, then you *must* setup v4l2_dev->name before calling v4l2_device_register.
88 91
89The first 'dev' argument is normally the struct device pointer of a pci_dev, 92The first 'dev' argument is normally the struct device pointer of a pci_dev,
90usb_device or platform_device. 93usb_device or platform_device. It is rare for dev to be NULL, but it happens
94with ISA devices or when one device creates multiple PCI devices, thus making
95it impossible to associate v4l2_dev with a particular parent.
96
97You can also supply a notify() callback that can be called by sub-devices to
98notify you of events. Whether you need to set this depends on the sub-device.
99Any notifications a sub-device supports must be defined in a header in
100include/media/<subdevice>.h.
91 101
92You unregister with: 102You unregister with:
93 103
@@ -95,6 +105,17 @@ You unregister with:
95 105
96Unregistering will also automatically unregister all subdevs from the device. 106Unregistering will also automatically unregister all subdevs from the device.
97 107
108If you have a hotpluggable device (e.g. a USB device), then when a disconnect
109happens the parent device becomes invalid. Since v4l2_device has a pointer to
110that parent device it has to be cleared as well to mark that the parent is
111gone. To do this call:
112
113 v4l2_device_disconnect(struct v4l2_device *v4l2_dev);
114
115This does *not* unregister the subdevs, so you still need to call the
116v4l2_device_unregister() function for that. If your driver is not hotpluggable,
117then there is no need to call v4l2_device_disconnect().
118
98Sometimes you need to iterate over all devices registered by a specific 119Sometimes you need to iterate over all devices registered by a specific
99driver. This is usually the case if multiple device drivers use the same 120driver. This is usually the case if multiple device drivers use the same
100hardware. E.g. the ivtvfb driver is a framebuffer driver that uses the ivtv 121hardware. E.g. the ivtvfb driver is a framebuffer driver that uses the ivtv
@@ -134,7 +155,7 @@ The recommended approach is as follows:
134 155
135static atomic_t drv_instance = ATOMIC_INIT(0); 156static atomic_t drv_instance = ATOMIC_INIT(0);
136 157
137static int __devinit drv_probe(struct pci_dev *dev, 158static int __devinit drv_probe(struct pci_dev *pdev,
138 const struct pci_device_id *pci_id) 159 const struct pci_device_id *pci_id)
139{ 160{
140 ... 161 ...
@@ -218,7 +239,7 @@ to add new ops and categories.
218 239
219A sub-device driver initializes the v4l2_subdev struct using: 240A sub-device driver initializes the v4l2_subdev struct using:
220 241
221 v4l2_subdev_init(subdev, &ops); 242 v4l2_subdev_init(sd, &ops);
222 243
223Afterwards you need to initialize subdev->name with a unique name and set the 244Afterwards you need to initialize subdev->name with a unique name and set the
224module owner. This is done for you if you use the i2c helper functions. 245module owner. This is done for you if you use the i2c helper functions.
@@ -226,7 +247,7 @@ module owner. This is done for you if you use the i2c helper functions.
226A device (bridge) driver needs to register the v4l2_subdev with the 247A device (bridge) driver needs to register the v4l2_subdev with the
227v4l2_device: 248v4l2_device:
228 249
229 int err = v4l2_device_register_subdev(device, subdev); 250 int err = v4l2_device_register_subdev(v4l2_dev, sd);
230 251
231This can fail if the subdev module disappeared before it could be registered. 252This can fail if the subdev module disappeared before it could be registered.
232After this function was called successfully the subdev->dev field points to 253After this function was called successfully the subdev->dev field points to
@@ -234,17 +255,17 @@ the v4l2_device.
234 255
235You can unregister a sub-device using: 256You can unregister a sub-device using:
236 257
237 v4l2_device_unregister_subdev(subdev); 258 v4l2_device_unregister_subdev(sd);
238 259
239Afterwards the subdev module can be unloaded and subdev->dev == NULL. 260Afterwards the subdev module can be unloaded and sd->dev == NULL.
240 261
241You can call an ops function either directly: 262You can call an ops function either directly:
242 263
243 err = subdev->ops->core->g_chip_ident(subdev, &chip); 264 err = sd->ops->core->g_chip_ident(sd, &chip);
244 265
245but it is better and easier to use this macro: 266but it is better and easier to use this macro:
246 267
247 err = v4l2_subdev_call(subdev, core, g_chip_ident, &chip); 268 err = v4l2_subdev_call(sd, core, g_chip_ident, &chip);
248 269
249The macro will do the right NULL pointer checks and returns -ENODEV if subdev 270The macro will do the right NULL pointer checks and returns -ENODEV if subdev
250is NULL, -ENOIOCTLCMD if either subdev->core or subdev->core->g_chip_ident is 271is NULL, -ENOIOCTLCMD if either subdev->core or subdev->core->g_chip_ident is
@@ -252,19 +273,19 @@ NULL, or the actual result of the subdev->ops->core->g_chip_ident ops.
252 273
253It is also possible to call all or a subset of the sub-devices: 274It is also possible to call all or a subset of the sub-devices:
254 275
255 v4l2_device_call_all(dev, 0, core, g_chip_ident, &chip); 276 v4l2_device_call_all(v4l2_dev, 0, core, g_chip_ident, &chip);
256 277
257Any subdev that does not support this ops is skipped and error results are 278Any subdev that does not support this ops is skipped and error results are
258ignored. If you want to check for errors use this: 279ignored. If you want to check for errors use this:
259 280
260 err = v4l2_device_call_until_err(dev, 0, core, g_chip_ident, &chip); 281 err = v4l2_device_call_until_err(v4l2_dev, 0, core, g_chip_ident, &chip);
261 282
262Any error except -ENOIOCTLCMD will exit the loop with that error. If no 283Any error except -ENOIOCTLCMD will exit the loop with that error. If no
263errors (except -ENOIOCTLCMD) occurred, then 0 is returned. 284errors (except -ENOIOCTLCMD) occurred, then 0 is returned.
264 285
265The second argument to both calls is a group ID. If 0, then all subdevs are 286The second argument to both calls is a group ID. If 0, then all subdevs are
266called. If non-zero, then only those whose group ID match that value will 287called. If non-zero, then only those whose group ID match that value will
267be called. Before a bridge driver registers a subdev it can set subdev->grp_id 288be called. Before a bridge driver registers a subdev it can set sd->grp_id
268to whatever value it wants (it's 0 by default). This value is owned by the 289to whatever value it wants (it's 0 by default). This value is owned by the
269bridge driver and the sub-device driver will never modify or use it. 290bridge driver and the sub-device driver will never modify or use it.
270 291
@@ -276,6 +297,11 @@ e.g. AUDIO_CONTROLLER and specify that as the group ID value when calling
276v4l2_device_call_all(). That ensures that it will only go to the subdev 297v4l2_device_call_all(). That ensures that it will only go to the subdev
277that needs it. 298that needs it.
278 299
300If the sub-device needs to notify its v4l2_device parent of an event, then
301it can call v4l2_subdev_notify(sd, notification, arg). This macro checks
302whether there is a notify() callback defined and returns -ENODEV if not.
303Otherwise the result of the notify() call is returned.
304
279The advantage of using v4l2_subdev is that it is a generic struct and does 305The advantage of using v4l2_subdev is that it is a generic struct and does
280not contain any knowledge about the underlying hardware. So a driver might 306not contain any knowledge about the underlying hardware. So a driver might
281contain several subdevs that use an I2C bus, but also a subdev that is 307contain several subdevs that use an I2C bus, but also a subdev that is
@@ -340,6 +366,12 @@ Make sure to call v4l2_device_unregister_subdev(sd) when the remove() callback
340is called. This will unregister the sub-device from the bridge driver. It is 366is called. This will unregister the sub-device from the bridge driver. It is
341safe to call this even if the sub-device was never registered. 367safe to call this even if the sub-device was never registered.
342 368
369You need to do this because when the bridge driver destroys the i2c adapter
370the remove() callbacks of the i2c devices on that adapter are called.
371After that the corresponding v4l2_subdev structures are invalid, so they
372have to be unregistered first. Calling v4l2_device_unregister_subdev(sd)
373from the remove() callback ensures that this is always done correctly.
374
343 375
344The bridge driver also has some helper functions it can use: 376The bridge driver also has some helper functions it can use:
345 377
@@ -349,8 +381,8 @@ This loads the given module (can be NULL if no module needs to be loaded) and
349calls i2c_new_device() with the given i2c_adapter and chip/address arguments. 381calls i2c_new_device() with the given i2c_adapter and chip/address arguments.
350If all goes well, then it registers the subdev with the v4l2_device. It gets 382If all goes well, then it registers the subdev with the v4l2_device. It gets
351the v4l2_device by calling i2c_get_adapdata(adapter), so you should make sure 383the v4l2_device by calling i2c_get_adapdata(adapter), so you should make sure
352that adapdata is set to v4l2_device when you setup the i2c_adapter in your 384to call i2c_set_adapdata(adapter, v4l2_device) when you setup the i2c_adapter
353driver. 385in your driver.
354 386
355You can also use v4l2_i2c_new_probed_subdev() which is very similar to 387You can also use v4l2_i2c_new_probed_subdev() which is very similar to
356v4l2_i2c_new_subdev(), except that it has an array of possible I2C addresses 388v4l2_i2c_new_subdev(), except that it has an array of possible I2C addresses
@@ -358,6 +390,14 @@ that it should probe. Internally it calls i2c_new_probed_device().
358 390
359Both functions return NULL if something went wrong. 391Both functions return NULL if something went wrong.
360 392
393Note that the chipid you pass to v4l2_i2c_new_(probed_)subdev() is usually
394the same as the module name. It allows you to specify a chip variant, e.g.
395"saa7114" or "saa7115". In general though the i2c driver autodetects this.
396The use of chipid is something that needs to be looked at more closely at a
397later date. It differs between i2c drivers and as such can be confusing.
398To see which chip variants are supported you can look in the i2c driver code
399for the i2c_device_id table. This lists all the possibilities.
400
361 401
362struct video_device 402struct video_device
363------------------- 403-------------------
@@ -396,6 +436,15 @@ You should also set these fields:
396- ioctl_ops: if you use the v4l2_ioctl_ops to simplify ioctl maintenance 436- ioctl_ops: if you use the v4l2_ioctl_ops to simplify ioctl maintenance
397 (highly recommended to use this and it might become compulsory in the 437 (highly recommended to use this and it might become compulsory in the
398 future!), then set this to your v4l2_ioctl_ops struct. 438 future!), then set this to your v4l2_ioctl_ops struct.
439- parent: you only set this if v4l2_device was registered with NULL as
440 the parent device struct. This only happens in cases where one hardware
441 device has multiple PCI devices that all share the same v4l2_device core.
442
443 The cx88 driver is an example of this: one core v4l2_device struct, but
444 it is used by both a raw video PCI device (cx8800) and an MPEG PCI device
445 (cx8802). Since the v4l2_device cannot be associated with a particular
446 PCI device it is setup without a parent device. But when the struct
447 video_device is setup you do know which parent PCI device to use.
399 448
400If you use v4l2_ioctl_ops, then you should set either .unlocked_ioctl or 449If you use v4l2_ioctl_ops, then you should set either .unlocked_ioctl or
401.ioctl to video_ioctl2 in your v4l2_file_operations struct. 450.ioctl to video_ioctl2 in your v4l2_file_operations struct.
@@ -499,8 +548,8 @@ There are a few useful helper functions:
499 548
500You can set/get driver private data in the video_device struct using: 549You can set/get driver private data in the video_device struct using:
501 550
502void *video_get_drvdata(struct video_device *dev); 551void *video_get_drvdata(struct video_device *vdev);
503void video_set_drvdata(struct video_device *dev, void *data); 552void video_set_drvdata(struct video_device *vdev, void *data);
504 553
505Note that you can safely call video_set_drvdata() before calling 554Note that you can safely call video_set_drvdata() before calling
506video_register_device(). 555video_register_device().
@@ -519,3 +568,103 @@ void *video_drvdata(struct file *file);
519You can go from a video_device struct to the v4l2_device struct using: 568You can go from a video_device struct to the v4l2_device struct using:
520 569
521struct v4l2_device *v4l2_dev = vdev->v4l2_dev; 570struct v4l2_device *v4l2_dev = vdev->v4l2_dev;
571
572video buffer helper functions
573-----------------------------
574
575The v4l2 core API provides a standard method for dealing with video
576buffers. Those methods allow a driver to implement read(), mmap() and
577overlay() in a consistent way.
578
579There are currently methods for using video buffers on devices that
580support DMA with scatter/gather method (videobuf-dma-sg), DMA with
581linear access (videobuf-dma-contig), and vmalloced buffers, mostly
582used on USB drivers (videobuf-vmalloc).
583
584Any driver using videobuf should provide operations (callbacks) for
585four handlers:
586
587ops->buf_setup - calculates the size of the video buffers and prevents them
588 from wasting more than some maximum limit of RAM;
589ops->buf_prepare - fills the video buffer structs and calls
590 videobuf_iolock() to alloc and prepare mmaped memory;
591ops->buf_queue - advises the driver that another buffer was
592 requested (by read() or by QBUF);
593ops->buf_release - frees any buffers that were allocated.
594
595In order to use it, the driver needs to have code (generally called in
596interrupt context) that will properly handle the buffer request lists,
597announcing that a new buffer was filled.
598
599The irq handling code should handle the videobuf task lists, in order
600to advise videobuf that a new frame was filled, in order to honor a
601request. The code is generally like this one:
602 if (list_empty(&dma_q->active))
603 return;
604
605 buf = list_entry(dma_q->active.next, struct vbuffer, vb.queue);
606
607 if (!waitqueue_active(&buf->vb.done))
608 return;
609
610 /* Some logic to handle the buf may be needed here */
611
612 list_del(&buf->vb.queue);
613 do_gettimeofday(&buf->vb.ts);
614 wake_up(&buf->vb.done);
615
616These are the videobuf functions used in drivers, implemented in
617videobuf-core:
618
619- Videobuf init functions
620 videobuf_queue_sg_init()
621 Initializes the videobuf infrastructure. This function should be
622 called before any other videobuf function on drivers that use DMA
623 Scatter/Gather buffers.
624
625 videobuf_queue_dma_contig_init()
626 Initializes the videobuf infrastructure. This function should be
627 called before any other videobuf function on drivers that need DMA
628 contiguous buffers.
629
630 videobuf_queue_vmalloc_init()
631 Initializes the videobuf infrastructure. This function should be
632 called before any other videobuf function on USB (and other drivers)
633 that need a vmalloced type of videobuf.
634
635- videobuf_iolock()
636 Prepares the videobuf memory for the proper method (read, mmap, overlay).
637
638- videobuf_queue_is_busy()
639 Checks if a videobuf is streaming.
640
641- videobuf_queue_cancel()
642 Stops video handling.
643
644- videobuf_mmap_free()
645 frees mmap buffers.
646
647- videobuf_stop()
648 Stops video handling, ends mmap and frees mmap and other buffers.
649
650- V4L2 api functions. Those functions correspond to VIDIOC_foo ioctls:
651 videobuf_reqbufs(), videobuf_querybuf(), videobuf_qbuf(),
652 videobuf_dqbuf(), videobuf_streamon(), videobuf_streamoff().
653
654- V4L1 api function (corresponds to VIDIOCMBUF ioctl):
655 videobuf_cgmbuf()
656 This function is used to provide backward compatibility with V4L1
657 API.
658
659- Some help functions for read()/poll() operations:
660 videobuf_read_stream()
661 For continuous stream read()
662 videobuf_read_one()
663 For snapshot read()
664 videobuf_poll_stream()
665 polling help function
666
667The best way to understand it is to take a look at the vivi driver. One
668of the main reasons for vivi is to be a videobuf usage example. The
669vivi_thread_tick() function does the task that the IRQ callback would do on PCI
670drivers (or the irq callback on USB).
diff --git a/Documentation/video4linux/v4lgrab.c b/Documentation/video4linux/v4lgrab.c
index d6e70bef8ad0..05769cff1009 100644
--- a/Documentation/video4linux/v4lgrab.c
+++ b/Documentation/video4linux/v4lgrab.c
@@ -105,8 +105,8 @@ int main(int argc, char ** argv)
105 struct video_picture vpic; 105 struct video_picture vpic;
106 106
107 unsigned char *buffer, *src; 107 unsigned char *buffer, *src;
108 int bpp = 24, r, g, b; 108 int bpp = 24, r = 0, g = 0, b = 0;
109 unsigned int i, src_depth; 109 unsigned int i, src_depth = 16;
110 110
111 if (fd < 0) { 111 if (fd < 0) {
112 perror(VIDEO_DEV); 112 perror(VIDEO_DEV);
diff --git a/Documentation/video4linux/zr364xx.txt b/Documentation/video4linux/zr364xx.txt
index 5c81e3ae6458..7f3d1955d214 100644
--- a/Documentation/video4linux/zr364xx.txt
+++ b/Documentation/video4linux/zr364xx.txt
@@ -65,3 +65,4 @@ Vendor Product Distributor Model
650x06d6 0x003b Trust Powerc@m 970Z 650x06d6 0x003b Trust Powerc@m 970Z
660x0a17 0x004e Pentax Optio 50 660x0a17 0x004e Pentax Optio 50
670x041e 0x405d Creative DiVi CAM 516 670x041e 0x405d Creative DiVi CAM 516
680x08ca 0x2102 Aiptek DV T300
diff --git a/Documentation/vm/kmemtrace.txt b/Documentation/vm/kmemtrace.txt
new file mode 100644
index 000000000000..a956d9b7f943
--- /dev/null
+++ b/Documentation/vm/kmemtrace.txt
@@ -0,0 +1,126 @@
1 kmemtrace - Kernel Memory Tracer
2
3 by Eduard - Gabriel Munteanu
4 <eduard.munteanu@linux360.ro>
5
6I. Introduction
7===============
8
9kmemtrace helps kernel developers figure out two things:
101) how different allocators (SLAB, SLUB etc.) perform
112) how kernel code allocates memory and how much
12
13To do this, we trace every allocation and export information to the userspace
14through the relay interface. We export things such as the number of requested
15bytes, the number of bytes actually allocated (i.e. including internal
16fragmentation), whether this is a slab allocation or a plain kmalloc() and so
17on.
18
19The actual analysis is performed by a userspace tool (see section III for
20details on where to get it from). It logs the data exported by the kernel,
21processes it and (as of writing this) can provide the following information:
22- the total amount of memory allocated and fragmentation per call-site
23- the amount of memory allocated and fragmentation per allocation
24- total memory allocated and fragmentation in the collected dataset
25- number of cross-CPU allocations and frees (makes sense in NUMA environments)
26
27Moreover, it can potentially find inconsistent and erroneous behavior in
28kernel code, such as using slab free functions on kmalloc'ed memory or
29allocating less memory than requested (but not truly failed allocations).
30
31kmemtrace also makes provisions for tracing on some arch and analysing the
32data on another.
33
34II. Design and goals
35====================
36
37kmemtrace was designed to handle rather large amounts of data. Thus, it uses
38the relay interface to export whatever is logged to userspace, which then
39stores it. Analysis and reporting are done asynchronously, that is, after the
40data is collected and stored. By design, it allows one to log and analyse
41on different machines and different arches.
42
43As of writing this, the ABI is not considered stable, though it might not
44change much. However, no guarantees are made about compatibility yet. When
45deemed stable, the ABI should still allow easy extension while maintaining
46backward compatibility. This is described further in Documentation/ABI.
47
48Summary of design goals:
49 - allow logging and analysis to be done across different machines
50 - be fast and anticipate usage in high-load environments (*)
51 - be reasonably extensible
52 - make it possible for GNU/Linux distributions to have kmemtrace
53 included in their repositories
54
55(*) - one of the reasons Pekka Enberg's original userspace data analysis
56 tool's code was rewritten from Perl to C (although this is more than a
57 simple conversion)
58
59
60III. Quick usage guide
61======================
62
631) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable
64CONFIG_KMEMTRACE).
65
662) Get the userspace tool and build it:
67$ git clone git://repo.or.cz/kmemtrace-user.git # current repository
68$ cd kmemtrace-user/
69$ ./autogen.sh
70$ ./configure
71$ make
72
733) Boot the kmemtrace-enabled kernel if you haven't, preferably in the
74'single' runlevel (so that relay buffers don't fill up easily), and run
75kmemtrace:
76# '$' does not mean user, but root here.
77$ mount -t debugfs none /sys/kernel/debug
78$ mount -t proc none /proc
79$ cd path/to/kmemtrace-user/
80$ ./kmemtraced
81Wait a bit, then stop it with CTRL+C.
82$ cat /sys/kernel/debug/kmemtrace/total_overruns # Check if we didn't
83 # overrun, should
84 # be zero.
85$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to
86 check its correctness]
87$ ./kmemtrace-report
88
89Now you should have a nice and short summary of how the allocator performs.
90
91IV. FAQ and known issues
92========================
93
94Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix
95this? Should I worry?
96A: If it's non-zero, this affects kmemtrace's accuracy, depending on how
97large the number is. You can fix it by supplying a higher
98'kmemtrace.subbufs=N' kernel parameter.
99---
100
101Q: kmemtrace_check reports errors, how do I fix this? Should I worry?
102A: This is a bug and should be reported. It can occur for a variety of
103reasons:
104 - possible bugs in relay code
105 - possible misuse of relay by kmemtrace
106 - timestamps being collected out of order
107Or you may fix it yourself and send us a patch.
108---
109
110Q: kmemtrace_report shows many errors, how do I fix this? Should I worry?
111A: This is a known issue and I'm working on it. These might be true errors
112in kernel code, which may have inconsistent behavior (e.g. allocating memory
113with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed
114out this behavior may work with SLAB, but may fail with other allocators.
115
116It may also be due to lack of tracing in some unusual allocator functions.
117
118We don't want bug reports regarding this issue yet.
119---
120
121V. See also
122===========
123
124Documentation/kernel-parameters.txt
125Documentation/ABI/testing/debugfs-kmemtrace
126
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
index 6aaaeb38730c..be45dbb9d7f2 100644
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -8,7 +8,8 @@ The current memory policy support was added to Linux 2.6 around May 2004. This
8document attempts to describe the concepts and APIs of the 2.6 memory policy 8document attempts to describe the concepts and APIs of the 2.6 memory policy
9support. 9support.
10 10
11Memory policies should not be confused with cpusets (Documentation/cpusets.txt) 11Memory policies should not be confused with cpusets
12(Documentation/cgroups/cpusets.txt)
12which is an administrative mechanism for restricting the nodes from which 13which is an administrative mechanism for restricting the nodes from which
13memory may be allocated by a set of processes. Memory policies are a 14memory may be allocated by a set of processes. Memory policies are a
14programming interface that a NUMA-aware application can take advantage of. When 15programming interface that a NUMA-aware application can take advantage of. When
diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration
index d5fdfd34bbaf..6513fe2d90b8 100644
--- a/Documentation/vm/page_migration
+++ b/Documentation/vm/page_migration
@@ -37,7 +37,8 @@ locations.
37 37
38Larger installations usually partition the system using cpusets into 38Larger installations usually partition the system using cpusets into
39sections of nodes. Paul Jackson has equipped cpusets with the ability to 39sections of nodes. Paul Jackson has equipped cpusets with the ability to
40move pages when a task is moved to another cpuset (See ../cpusets.txt). 40move pages when a task is moved to another cpuset (See
41Documentation/cgroups/cpusets.txt).
41Cpusets allows the automation of process locality. If a task is moved to 42Cpusets allows the automation of process locality. If a task is moved to
42a new cpuset then also all its pages are moved with it so that the 43a new cpuset then also all its pages are moved with it so that the
43performance of the process does not sink dramatically. Also the pages 44performance of the process does not sink dramatically. Also the pages
diff --git a/Documentation/x86/earlyprintk.txt b/Documentation/x86/earlyprintk.txt
new file mode 100644
index 000000000000..607b1a016064
--- /dev/null
+++ b/Documentation/x86/earlyprintk.txt
@@ -0,0 +1,101 @@
1
2Mini-HOWTO for using the earlyprintk=dbgp boot option with a
3USB2 Debug port key and a debug cable, on x86 systems.
4
5You need two computers, the 'USB debug key' special gadget and
6two USB cables, connected like this:
7
8 [host/target] <-------> [USB debug key] <-------> [client/console]
9
101. There are three specific hardware requirements:
11
12 a.) Host/target system needs to have USB debug port capability.
13
14 You can check this capability by looking at a 'Debug port' bit in
15 the lspci -vvv output:
16
17 # lspci -vvv
18 ...
19 00:1d.7 USB Controller: Intel Corporation 82801H (ICH8 Family) USB2 EHCI Controller #1 (rev 03) (prog-if 20 [EHCI])
20 Subsystem: Lenovo ThinkPad T61
21 Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx-
22 Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
23 Latency: 0
24 Interrupt: pin D routed to IRQ 19
25 Region 0: Memory at fe227000 (32-bit, non-prefetchable) [size=1K]
26 Capabilities: [50] Power Management version 2
27 Flags: PMEClk- DSI- D1- D2- AuxCurrent=375mA PME(D0+,D1-,D2-,D3hot+,D3cold+)
28 Status: D0 PME-Enable- DSel=0 DScale=0 PME+
29 Capabilities: [58] Debug port: BAR=1 offset=00a0
30 ^^^^^^^^^^^ <==================== [ HERE ]
31 Kernel driver in use: ehci_hcd
32 Kernel modules: ehci-hcd
33 ...
34
35( If your system does not list a debug port capability then you probably
36 won't be able to use the USB debug key. )
37
38 b.) You also need a Netchip USB debug cable/key:
39
40 http://www.plxtech.com/products/NET2000/NET20DC/default.asp
41
42 This is a small blue plastic connector with two USB connections,
43 it draws power from its USB connections.
44
45 c.) Thirdly, you need a second client/console system with a regular USB port.
46
472. Software requirements:
48
49 a.) On the host/target system:
50
51 You need to enable the following kernel config option:
52
53 CONFIG_EARLY_PRINTK_DBGP=y
54
55 And you need to add the boot command line: "earlyprintk=dbgp".
56 (If you are using Grub, append it to the 'kernel' line in
57 /etc/grub.conf)
58
59 NOTE: normally earlyprintk console gets turned off once the
60 regular console is alive - use "earlyprintk=dbgp,keep" to keep
61 this channel open beyond early bootup. This can be useful for
62 debugging crashes under Xorg, etc.
63
64 b.) On the client/console system:
65
66 You should enable the following kernel config option:
67
68 CONFIG_USB_SERIAL_DEBUG=y
69
70 On the next bootup with the modified kernel you should
71 get one or more /dev/ttyUSBx devices.
72
73 Now this channel of kernel messages is ready to be used: start
74 your favorite terminal emulator (minicom, etc.) and set
75 it up to use /dev/ttyUSB0 - or use a raw 'cat /dev/ttyUSBx' to
76 see the raw output.
77
78 c.) On Nvidia Southbridge based systems: the kernel will try to probe
79 and find out which port has the debug device connected.
80
813. Testing that it works fine:
82
83 You can test the output by using earlyprintk=dbgp,keep and provoking
84 kernel messages on the host/target system. You can provoke a harmless
85 kernel message by for example doing:
86
87 echo h > /proc/sysrq-trigger
88
89 On the host/target system you should see this help line in "dmesg" output:
90
91 SysRq : HELP : loglevel(0-9) reBoot Crashdump terminate-all-tasks(E) memory-full-oom-kill(F) kill-all-tasks(I) saK show-backtrace-all-active-cpus(L) show-memory-usage(M) nice-all-RT-tasks(N) powerOff show-registers(P) show-all-timers(Q) unRaw Sync show-task-states(T) Unmount show-blocked-tasks(W) dump-ftrace-buffer(Z)
92
93 On the client/console system do:
94
95 cat /dev/ttyUSB0
96
97 And you should see the help line above displayed shortly after you've
98 provoked it on the host system.
99
100If it does not work then please ask about it on the linux-kernel@vger.kernel.org
101mailing list or contact the x86 maintainers.
diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets b/Documentation/x86/x86_64/fake-numa-for-cpusets
index 33bb56655991..0f11d9becb0b 100644
--- a/Documentation/x86/x86_64/fake-numa-for-cpusets
+++ b/Documentation/x86/x86_64/fake-numa-for-cpusets
@@ -7,7 +7,8 @@ you can create fake NUMA nodes that represent contiguous chunks of memory and
7assign them to cpusets and their attached tasks. This is a way of limiting the 7assign them to cpusets and their attached tasks. This is a way of limiting the
8amount of system memory that are available to a certain class of tasks. 8amount of system memory that are available to a certain class of tasks.
9 9
10For more information on the features of cpusets, see Documentation/cpusets.txt. 10For more information on the features of cpusets, see
11Documentation/cgroups/cpusets.txt.
11There are a number of different configurations you can use for your needs. For 12There are a number of different configurations you can use for your needs. For
12more information on the numa=fake command line option and its various ways of 13more information on the numa=fake command line option and its various ways of
13configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt. 14configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt.
@@ -32,7 +33,7 @@ A machine may be split as follows with "numa=fake=4*512," as reported by dmesg:
32 On node 3 totalpages: 131072 33 On node 3 totalpages: 131072
33 34
34Now following the instructions for mounting the cpusets filesystem from 35Now following the instructions for mounting the cpusets filesystem from
35Documentation/cpusets.txt, you can assign fake nodes (i.e. contiguous memory 36Documentation/cgroups/cpusets.txt, you can assign fake nodes (i.e. contiguous memory
36address spaces) to individual cpusets: 37address spaces) to individual cpusets:
37 38
38 [root@xroads /]# mkdir exampleset 39 [root@xroads /]# mkdir exampleset