aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/ABI/testing/sysfs-block34
-rw-r--r--Documentation/ABI/testing/sysfs-bus-css35
-rw-r--r--Documentation/ABI/testing/sysfs-firmware-memmap71
-rw-r--r--Documentation/HOWTO2
-rw-r--r--Documentation/IRQ-affinity.txt37
-rw-r--r--Documentation/accounting/taskstats-struct.txt6
-rw-r--r--Documentation/auxdisplay/cfag12864b4
-rw-r--r--Documentation/auxdisplay/cfag12864b-example.c2
-rw-r--r--Documentation/auxdisplay/ks01084
-rw-r--r--Documentation/block/data-integrity.txt327
-rw-r--r--Documentation/cgroups.txt4
-rw-r--r--Documentation/controllers/devices.txt8
-rw-r--r--Documentation/cpusets.txt9
-rw-r--r--Documentation/cputopology.txt26
-rw-r--r--Documentation/feature-removal-schedule.txt16
-rw-r--r--Documentation/filesystems/ext4.txt125
-rw-r--r--Documentation/filesystems/gfs2-glocks.txt114
-rw-r--r--Documentation/filesystems/proc.txt29
-rw-r--r--Documentation/ftrace.txt1361
-rw-r--r--Documentation/i2c/busses/i2c-i81047
-rw-r--r--Documentation/i2c/busses/i2c-prosavage23
-rw-r--r--Documentation/i2c/busses/i2c-savage426
-rw-r--r--Documentation/i2c/fault-codes127
-rw-r--r--Documentation/i2c/smbus-protocol4
-rw-r--r--Documentation/i2c/writing-clients69
-rw-r--r--Documentation/ioctl-number.txt1
-rw-r--r--Documentation/kdump/kdump.txt2
-rw-r--r--Documentation/kernel-parameters.txt59
-rw-r--r--Documentation/networking/ip-sysctl.txt268
-rw-r--r--Documentation/networking/s2io.txt6
-rw-r--r--Documentation/nmi_watchdog.txt16
-rw-r--r--Documentation/scheduler/sched-domains.txt7
-rw-r--r--Documentation/scheduler/sched-rt-group.txt4
-rw-r--r--Documentation/sound/alsa/ALSA-Configuration.txt17
-rw-r--r--Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl4
-rw-r--r--Documentation/tracers/mmiotrace.txt164
-rw-r--r--Documentation/video4linux/CARDLIST.au08282
-rw-r--r--Documentation/vm/slabinfo.c4
-rw-r--r--Documentation/vm/slub.txt2
-rw-r--r--Documentation/x86/i386/IO-APIC.txt (renamed from Documentation/i386/IO-APIC.txt)0
-rw-r--r--Documentation/x86/i386/boot.txt (renamed from Documentation/i386/boot.txt)79
-rw-r--r--Documentation/x86/i386/usb-legacy-support.txt (renamed from Documentation/i386/usb-legacy-support.txt)0
-rw-r--r--Documentation/x86/i386/zero-page.txt (renamed from Documentation/i386/zero-page.txt)0
-rw-r--r--Documentation/x86/x86_64/00-INDEX (renamed from Documentation/x86_64/00-INDEX)0
-rw-r--r--Documentation/x86/x86_64/boot-options.txt (renamed from Documentation/x86_64/boot-options.txt)0
-rw-r--r--Documentation/x86/x86_64/cpu-hotplug-spec (renamed from Documentation/x86_64/cpu-hotplug-spec)0
-rw-r--r--Documentation/x86/x86_64/fake-numa-for-cpusets (renamed from Documentation/x86_64/fake-numa-for-cpusets)0
-rw-r--r--Documentation/x86/x86_64/kernel-stacks (renamed from Documentation/x86_64/kernel-stacks)0
-rw-r--r--Documentation/x86/x86_64/machinecheck (renamed from Documentation/x86_64/machinecheck)0
-rw-r--r--Documentation/x86/x86_64/mm.txt (renamed from Documentation/x86_64/mm.txt)5
-rw-r--r--Documentation/x86/x86_64/uefi.txt (renamed from Documentation/x86_64/uefi.txt)4
51 files changed, 2817 insertions, 337 deletions
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 4bd9ea539129..44f52a4f5903 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -26,3 +26,37 @@ Description:
26 I/O statistics of partition <part>. The format is the 26 I/O statistics of partition <part>. The format is the
27 same as the above-written /sys/block/<disk>/stat 27 same as the above-written /sys/block/<disk>/stat
28 format. 28 format.
29
30
31What: /sys/block/<disk>/integrity/format
32Date: June 2008
33Contact: Martin K. Petersen <martin.petersen@oracle.com>
34Description:
35 Metadata format for integrity capable block device.
36 E.g. T10-DIF-TYPE1-CRC.
37
38
39What: /sys/block/<disk>/integrity/read_verify
40Date: June 2008
41Contact: Martin K. Petersen <martin.petersen@oracle.com>
42Description:
43 Indicates whether the block layer should verify the
44 integrity of read requests serviced by devices that
45 support sending integrity metadata.
46
47
48What: /sys/block/<disk>/integrity/tag_size
49Date: June 2008
50Contact: Martin K. Petersen <martin.petersen@oracle.com>
51Description:
52 Number of bytes of integrity tag space available per
53 512 bytes of data.
54
55
56What: /sys/block/<disk>/integrity/write_generate
57Date: June 2008
58Contact: Martin K. Petersen <martin.petersen@oracle.com>
59Description:
60 Indicates whether the block layer should automatically
61 generate checksums for write requests bound for
62 devices that support receiving integrity metadata.
diff --git a/Documentation/ABI/testing/sysfs-bus-css b/Documentation/ABI/testing/sysfs-bus-css
new file mode 100644
index 000000000000..b585ec258a08
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-css
@@ -0,0 +1,35 @@
1What: /sys/bus/css/devices/.../type
2Date: March 2008
3Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
4 linux-s390@vger.kernel.org
5Description: Contains the subchannel type, as reported by the hardware.
6 This attribute is present for all subchannel types.
7
8What: /sys/bus/css/devices/.../modalias
9Date: March 2008
10Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
11 linux-s390@vger.kernel.org
12Description: Contains the module alias as reported with uevents.
13 It is of the format css:t<type> and present for all
14 subchannel types.
15
16What: /sys/bus/css/drivers/io_subchannel/.../chpids
17Date: December 2002
18Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
19 linux-s390@vger.kernel.org
20Description: Contains the ids of the channel paths used by this
21 subchannel, as reported by the channel subsystem
22 during subchannel recognition.
23 Note: This is an I/O-subchannel specific attribute.
24Users: s390-tools, HAL
25
26What: /sys/bus/css/drivers/io_subchannel/.../pimpampom
27Date: December 2002
28Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
29 linux-s390@vger.kernel.org
30Description: Contains the PIM/PAM/POM values, as reported by the
31 channel subsystem when last queried by the common I/O
32 layer (this implies that this attribute is not necessarily
33 in sync with the values current in the channel subsystem).
34 Note: This is an I/O-subchannel specific attribute.
35Users: s390-tools, HAL
diff --git a/Documentation/ABI/testing/sysfs-firmware-memmap b/Documentation/ABI/testing/sysfs-firmware-memmap
new file mode 100644
index 000000000000..0d99ee6ae02e
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-memmap
@@ -0,0 +1,71 @@
1What: /sys/firmware/memmap/
2Date: June 2008
3Contact: Bernhard Walle <bwalle@suse.de>
4Description:
5 On all platforms, the firmware provides a memory map which the
6 kernel reads. The resources from that memory map are registered
7 in the kernel resource tree and exposed to userspace via
8 /proc/iomem (together with other resources).
9
10 However, on most architectures that firmware-provided memory
11 map is modified afterwards by the kernel itself, either because
12 the kernel merges that memory map with other information or
13 just because the user overwrites that memory map via command
14 line.
15
16 kexec needs the raw firmware-provided memory map to set up the
17 parameter segment of the kernel that should be booted with
18 kexec. Also, the raw memory map is useful for debugging. For
19 that reason, /sys/firmware/memmap is an interface that provides
20 the raw memory map to userspace.
21
22 The structure is as follows: Under /sys/firmware/memmap there
23 are subdirectories with the number of the entry as their name:
24
25 /sys/firmware/memmap/0
26 /sys/firmware/memmap/1
27 /sys/firmware/memmap/2
28 /sys/firmware/memmap/3
29 ...
30
31 The maximum depends on the number of memory map entries provided
32 by the firmware. The order is just the order that the firmware
33 provides.
34
35 Each directory contains three files:
36
37 start : The start address (as hexadecimal number with the
38 '0x' prefix).
39 end : The end address, inclusive (regardless of whether the
40 firmware provides inclusive or exclusive ranges).
41 type : Type of the entry as string. See below for a list of
42 valid types.
43
44 So, for example:
45
46 /sys/firmware/memmap/0/start
47 /sys/firmware/memmap/0/end
48 /sys/firmware/memmap/0/type
49 /sys/firmware/memmap/1/start
50 ...
51
52 Currently following types exist:
53
54 - System RAM
55 - ACPI Tables
56 - ACPI Non-volatile Storage
57 - reserved
58
59 Following shell snippet can be used to display that memory
60 map in a human-readable format:
61
62 -------------------- 8< ----------------------------------------
63 #!/bin/bash
64 cd /sys/firmware/memmap
65 for dir in * ; do
66 start=$(cat $dir/start)
67 end=$(cat $dir/end)
68 type=$(cat $dir/type)
69 printf "%016x-%016x (%s)\n" $start $[ $end +1] "$type"
70 done
71 -------------------- >8 ----------------------------------------
diff --git a/Documentation/HOWTO b/Documentation/HOWTO
index 0291ade44c17..619e8caf30db 100644
--- a/Documentation/HOWTO
+++ b/Documentation/HOWTO
@@ -377,7 +377,7 @@ Bug Reporting
377bugzilla.kernel.org is where the Linux kernel developers track kernel 377bugzilla.kernel.org is where the Linux kernel developers track kernel
378bugs. Users are encouraged to report all bugs that they find in this 378bugs. Users are encouraged to report all bugs that they find in this
379tool. For details on how to use the kernel bugzilla, please see: 379tool. For details on how to use the kernel bugzilla, please see:
380 http://test.kernel.org/bugzilla/faq.html 380 http://bugzilla.kernel.org/page.cgi?id=faq.html
381 381
382The file REPORTING-BUGS in the main kernel source directory has a good 382The file REPORTING-BUGS in the main kernel source directory has a good
383template for how to report a possible kernel bug, and details what kind 383template for how to report a possible kernel bug, and details what kind
diff --git a/Documentation/IRQ-affinity.txt b/Documentation/IRQ-affinity.txt
index 938d7dd05490..b4a615b78403 100644
--- a/Documentation/IRQ-affinity.txt
+++ b/Documentation/IRQ-affinity.txt
@@ -1,17 +1,26 @@
1ChangeLog:
2 Started by Ingo Molnar <mingo@redhat.com>
3 Update by Max Krasnyansky <maxk@qualcomm.com>
1 4
2SMP IRQ affinity, started by Ingo Molnar <mingo@redhat.com> 5SMP IRQ affinity
3
4 6
5/proc/irq/IRQ#/smp_affinity specifies which target CPUs are permitted 7/proc/irq/IRQ#/smp_affinity specifies which target CPUs are permitted
6for a given IRQ source. It's a bitmask of allowed CPUs. It's not allowed 8for a given IRQ source. It's a bitmask of allowed CPUs. It's not allowed
7to turn off all CPUs, and if an IRQ controller does not support IRQ 9to turn off all CPUs, and if an IRQ controller does not support IRQ
8affinity then the value will not change from the default 0xffffffff. 10affinity then the value will not change from the default 0xffffffff.
9 11
12/proc/irq/default_smp_affinity specifies default affinity mask that applies
13to all non-active IRQs. Once IRQ is allocated/activated its affinity bitmask
14will be set to the default mask. It can then be changed as described above.
15Default mask is 0xffffffff.
16
10Here is an example of restricting IRQ44 (eth1) to CPU0-3 then restricting 17Here is an example of restricting IRQ44 (eth1) to CPU0-3 then restricting
11the IRQ to CPU4-7 (this is an 8-CPU SMP box): 18it to CPU4-7 (this is an 8-CPU SMP box):
12 19
20[root@moon 44]# cd /proc/irq/44
13[root@moon 44]# cat smp_affinity 21[root@moon 44]# cat smp_affinity
14ffffffff 22ffffffff
23
15[root@moon 44]# echo 0f > smp_affinity 24[root@moon 44]# echo 0f > smp_affinity
16[root@moon 44]# cat smp_affinity 25[root@moon 44]# cat smp_affinity
170000000f 260000000f
@@ -21,17 +30,27 @@ PING hell (195.4.7.3): 56 data bytes
21--- hell ping statistics --- 30--- hell ping statistics ---
226029 packets transmitted, 6027 packets received, 0% packet loss 316029 packets transmitted, 6027 packets received, 0% packet loss
23round-trip min/avg/max = 0.1/0.1/0.4 ms 32round-trip min/avg/max = 0.1/0.1/0.4 ms
24[root@moon 44]# cat /proc/interrupts | grep 44: 33[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:'
25 44: 0 1785 1785 1783 1783 1 34 CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
261 0 IO-APIC-level eth1 35 44: 1068 1785 1785 1783 0 0 0 0 IO-APIC-level eth1
36
37As can be seen from the line above IRQ44 was delivered only to the first four
38processors (0-3).
39Now let's restrict that IRQ to CPU(4-7).
40
27[root@moon 44]# echo f0 > smp_affinity 41[root@moon 44]# echo f0 > smp_affinity
42[root@moon 44]# cat smp_affinity
43000000f0
28[root@moon 44]# ping -f h 44[root@moon 44]# ping -f h
29PING hell (195.4.7.3): 56 data bytes 45PING hell (195.4.7.3): 56 data bytes
30.. 46..
31--- hell ping statistics --- 47--- hell ping statistics ---
322779 packets transmitted, 2777 packets received, 0% packet loss 482779 packets transmitted, 2777 packets received, 0% packet loss
33round-trip min/avg/max = 0.1/0.5/585.4 ms 49round-trip min/avg/max = 0.1/0.5/585.4 ms
34[root@moon 44]# cat /proc/interrupts | grep 44: 50[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:'
35 44: 1068 1785 1785 1784 1784 1069 1070 1069 IO-APIC-level eth1 51 CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
36[root@moon 44]# 52 44: 1068 1785 1785 1783 1784 1069 1070 1069 IO-APIC-level eth1
53
54This time around IRQ44 was delivered only to the last four processors.
55i.e. counters for the CPU0-3 did not change.
37 56
diff --git a/Documentation/accounting/taskstats-struct.txt b/Documentation/accounting/taskstats-struct.txt
index 8aa7529f8258..cd784f46bf8a 100644
--- a/Documentation/accounting/taskstats-struct.txt
+++ b/Documentation/accounting/taskstats-struct.txt
@@ -24,6 +24,8 @@ There are three different groups of fields in the struct taskstats:
24 24
254) Per-task and per-thread context switch count statistics 254) Per-task and per-thread context switch count statistics
26 26
275) Time accounting for SMT machines
28
27Future extension should add fields to the end of the taskstats struct, and 29Future extension should add fields to the end of the taskstats struct, and
28should not change the relative position of each field within the struct. 30should not change the relative position of each field within the struct.
29 31
@@ -164,4 +166,8 @@ struct taskstats {
164 __u64 nvcsw; /* Context voluntary switch counter */ 166 __u64 nvcsw; /* Context voluntary switch counter */
165 __u64 nivcsw; /* Context involuntary switch counter */ 167 __u64 nivcsw; /* Context involuntary switch counter */
166 168
1695) Time accounting for SMT machines
170 __u64 ac_utimescaled; /* utime scaled on frequency etc */
171 __u64 ac_stimescaled; /* stime scaled on frequency etc */
172 __u64 cpu_scaled_run_real_total; /* scaled cpu_run_real_total */
167} 173}
diff --git a/Documentation/auxdisplay/cfag12864b b/Documentation/auxdisplay/cfag12864b
index b714183d4125..eb7be393a510 100644
--- a/Documentation/auxdisplay/cfag12864b
+++ b/Documentation/auxdisplay/cfag12864b
@@ -3,7 +3,7 @@
3 =================================== 3 ===================================
4 4
5License: GPLv2 5License: GPLv2
6Author & Maintainer: Miguel Ojeda Sandonis <maxextreme@gmail.com> 6Author & Maintainer: Miguel Ojeda Sandonis
7Date: 2006-10-27 7Date: 2006-10-27
8 8
9 9
@@ -22,7 +22,7 @@ Date: 2006-10-27
221. DRIVER INFORMATION 221. DRIVER INFORMATION
23--------------------- 23---------------------
24 24
25This driver support one cfag12864b display at time. 25This driver supports a cfag12864b LCD.
26 26
27 27
28--------------------- 28---------------------
diff --git a/Documentation/auxdisplay/cfag12864b-example.c b/Documentation/auxdisplay/cfag12864b-example.c
index 7bfac354d4c9..2caeea5e4993 100644
--- a/Documentation/auxdisplay/cfag12864b-example.c
+++ b/Documentation/auxdisplay/cfag12864b-example.c
@@ -4,7 +4,7 @@
4 * Description: cfag12864b LCD userspace example program 4 * Description: cfag12864b LCD userspace example program
5 * License: GPLv2 5 * License: GPLv2
6 * 6 *
7 * Author: Copyright (C) Miguel Ojeda Sandonis <maxextreme@gmail.com> 7 * Author: Copyright (C) Miguel Ojeda Sandonis
8 * Date: 2006-10-31 8 * Date: 2006-10-31
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify 10 * This program is free software; you can redistribute it and/or modify
diff --git a/Documentation/auxdisplay/ks0108 b/Documentation/auxdisplay/ks0108
index 92b03b60c613..8ddda0c8ceef 100644
--- a/Documentation/auxdisplay/ks0108
+++ b/Documentation/auxdisplay/ks0108
@@ -3,7 +3,7 @@
3 ========================================== 3 ==========================================
4 4
5License: GPLv2 5License: GPLv2
6Author & Maintainer: Miguel Ojeda Sandonis <maxextreme@gmail.com> 6Author & Maintainer: Miguel Ojeda Sandonis
7Date: 2006-10-27 7Date: 2006-10-27
8 8
9 9
@@ -21,7 +21,7 @@ Date: 2006-10-27
211. DRIVER INFORMATION 211. DRIVER INFORMATION
22--------------------- 22---------------------
23 23
24This driver support the ks0108 LCD controller. 24This driver supports the ks0108 LCD controller.
25 25
26 26
27--------------------- 27---------------------
diff --git a/Documentation/block/data-integrity.txt b/Documentation/block/data-integrity.txt
new file mode 100644
index 000000000000..e9dc8d86adc7
--- /dev/null
+++ b/Documentation/block/data-integrity.txt
@@ -0,0 +1,327 @@
1----------------------------------------------------------------------
21. INTRODUCTION
3
4Modern filesystems feature checksumming of data and metadata to
5protect against data corruption. However, the detection of the
6corruption is done at read time which could potentially be months
7after the data was written. At that point the original data that the
8application tried to write is most likely lost.
9
10The solution is to ensure that the disk is actually storing what the
11application meant it to. Recent additions to both the SCSI family
12protocols (SBC Data Integrity Field, SCC protection proposal) as well
13as SATA/T13 (External Path Protection) try to remedy this by adding
14support for appending integrity metadata to an I/O. The integrity
15metadata (or protection information in SCSI terminology) includes a
16checksum for each sector as well as an incrementing counter that
17ensures the individual sectors are written in the right order. And
18for some protection schemes also that the I/O is written to the right
19place on disk.
20
21Current storage controllers and devices implement various protective
22measures, for instance checksumming and scrubbing. But these
23technologies are working in their own isolated domains or at best
24between adjacent nodes in the I/O path. The interesting thing about
25DIF and the other integrity extensions is that the protection format
26is well defined and every node in the I/O path can verify the
27integrity of the I/O and reject it if corruption is detected. This
28allows not only corruption prevention but also isolation of the point
29of failure.
30
31----------------------------------------------------------------------
322. THE DATA INTEGRITY EXTENSIONS
33
34As written, the protocol extensions only protect the path between
35controller and storage device. However, many controllers actually
36allow the operating system to interact with the integrity metadata
37(IMD). We have been working with several FC/SAS HBA vendors to enable
38the protection information to be transferred to and from their
39controllers.
40
41The SCSI Data Integrity Field works by appending 8 bytes of protection
42information to each sector. The data + integrity metadata is stored
43in 520 byte sectors on disk. Data + IMD are interleaved when
44transferred between the controller and target. The T13 proposal is
45similar.
46
47Because it is highly inconvenient for operating systems to deal with
48520 (and 4104) byte sectors, we approached several HBA vendors and
49encouraged them to allow separation of the data and integrity metadata
50scatter-gather lists.
51
52The controller will interleave the buffers on write and split them on
53read. This means that Linux can DMA the data buffers to and from
54host memory without changes to the page cache.
55
56Also, the 16-bit CRC checksum mandated by both the SCSI and SATA specs
57is somewhat heavy to compute in software. Benchmarks found that
58calculating this checksum had a significant impact on system
59performance for a number of workloads. Some controllers allow a
60lighter-weight checksum to be used when interfacing with the operating
61system. Emulex, for instance, supports the TCP/IP checksum instead.
62The IP checksum received from the OS is converted to the 16-bit CRC
63when writing and vice versa. This allows the integrity metadata to be
64generated by Linux or the application at very low cost (comparable to
65software RAID5).
66
67The IP checksum is weaker than the CRC in terms of detecting bit
68errors. However, the strength is really in the separation of the data
69buffers and the integrity metadata. These two distinct buffers must
70match up for an I/O to complete.
71
72The separation of the data and integrity metadata buffers as well as
73the choice in checksums is referred to as the Data Integrity
74Extensions. As these extensions are outside the scope of the protocol
75bodies (T10, T13), Oracle and its partners are trying to standardize
76them within the Storage Networking Industry Association.
77
78----------------------------------------------------------------------
793. KERNEL CHANGES
80
81The data integrity framework in Linux enables protection information
82to be pinned to I/Os and sent to/received from controllers that
83support it.
84
85The advantage to the integrity extensions in SCSI and SATA is that
86they enable us to protect the entire path from application to storage
87device. However, at the same time this is also the biggest
88disadvantage. It means that the protection information must be in a
89format that can be understood by the disk.
90
91Generally Linux/POSIX applications are agnostic to the intricacies of
92the storage devices they are accessing. The virtual filesystem switch
93and the block layer make things like hardware sector size and
94transport protocols completely transparent to the application.
95
96However, this level of detail is required when preparing the
97protection information to send to a disk. Consequently, the very
98concept of an end-to-end protection scheme is a layering violation.
99It is completely unreasonable for an application to be aware whether
100it is accessing a SCSI or SATA disk.
101
102The data integrity support implemented in Linux attempts to hide this
103from the application. As far as the application (and to some extent
104the kernel) is concerned, the integrity metadata is opaque information
105that's attached to the I/O.
106
107The current implementation allows the block layer to automatically
108generate the protection information for any I/O. Eventually the
109intent is to move the integrity metadata calculation to userspace for
110user data. Metadata and other I/O that originates within the kernel
111will still use the automatic generation interface.
112
113Some storage devices allow each hardware sector to be tagged with a
11416-bit value. The owner of this tag space is the owner of the block
115device. I.e. the filesystem in most cases. The filesystem can use
116this extra space to tag sectors as they see fit. Because the tag
117space is limited, the block interface allows tagging bigger chunks by
118way of interleaving. This way, 8*16 bits of information can be
119attached to a typical 4KB filesystem block.
120
121This also means that applications such as fsck and mkfs will need
122access to manipulate the tags from user space. A passthrough
123interface for this is being worked on.
124
125
126----------------------------------------------------------------------
1274. BLOCK LAYER IMPLEMENTATION DETAILS
128
1294.1 BIO
130
131The data integrity patches add a new field to struct bio when
132CONFIG_BLK_DEV_INTEGRITY is enabled. bio->bi_integrity is a pointer
133to a struct bip which contains the bio integrity payload. Essentially
134a bip is a trimmed down struct bio which holds a bio_vec containing
135the integrity metadata and the required housekeeping information (bvec
136pool, vector count, etc.)
137
138A kernel subsystem can enable data integrity protection on a bio by
139calling bio_integrity_alloc(bio). This will allocate and attach the
140bip to the bio.
141
142Individual pages containing integrity metadata can subsequently be
143attached using bio_integrity_add_page().
144
145bio_free() will automatically free the bip.
146
147
1484.2 BLOCK DEVICE
149
150Because the format of the protection data is tied to the physical
151disk, each block device has been extended with a block integrity
152profile (struct blk_integrity). This optional profile is registered
153with the block layer using blk_integrity_register().
154
155The profile contains callback functions for generating and verifying
156the protection data, as well as getting and setting application tags.
157The profile also contains a few constants to aid in completing,
158merging and splitting the integrity metadata.
159
160Layered block devices will need to pick a profile that's appropriate
161for all subdevices. blk_integrity_compare() can help with that. DM
162and MD linear, RAID0 and RAID1 are currently supported. RAID4/5/6
163will require extra work due to the application tag.
164
165
166----------------------------------------------------------------------
1675.0 BLOCK LAYER INTEGRITY API
168
1695.1 NORMAL FILESYSTEM
170
171 The normal filesystem is unaware that the underlying block device
172 is capable of sending/receiving integrity metadata. The IMD will
173 be automatically generated by the block layer at submit_bio() time
174 in case of a WRITE. A READ request will cause the I/O integrity
175 to be verified upon completion.
176
177 IMD generation and verification can be toggled using the
178
179 /sys/block/<bdev>/integrity/write_generate
180
181 and
182
183 /sys/block/<bdev>/integrity/read_verify
184
185 flags.
186
187
1885.2 INTEGRITY-AWARE FILESYSTEM
189
190 A filesystem that is integrity-aware can prepare I/Os with IMD
191 attached. It can also use the application tag space if this is
192 supported by the block device.
193
194
195 int bdev_integrity_enabled(block_device, int rw);
196
197 bdev_integrity_enabled() will return 1 if the block device
198 supports integrity metadata transfer for the data direction
199 specified in 'rw'.
200
201 bdev_integrity_enabled() honors the write_generate and
202 read_verify flags in sysfs and will respond accordingly.
203
204
205 int bio_integrity_prep(bio);
206
207 To generate IMD for WRITE and to set up buffers for READ, the
208 filesystem must call bio_integrity_prep(bio).
209
210 Prior to calling this function, the bio data direction and start
211 sector must be set, and the bio should have all data pages
212 added. It is up to the caller to ensure that the bio does not
213 change while I/O is in progress.
214
215 bio_integrity_prep() should only be called if
216 bio_integrity_enabled() returned 1.
217
218
219 int bio_integrity_tag_size(bio);
220
221 If the filesystem wants to use the application tag space it will
222 first have to find out how much storage space is available.
223 Because tag space is generally limited (usually 2 bytes per
224 sector regardless of sector size), the integrity framework
225 supports interleaving the information between the sectors in an
226 I/O.
227
228 Filesystems can call bio_integrity_tag_size(bio) to find out how
229 many bytes of storage are available for that particular bio.
230
231 Another option is bdev_get_tag_size(block_device) which will
232 return the number of available bytes per hardware sector.
233
234
235 int bio_integrity_set_tag(bio, void *tag_buf, len);
236
237 After a successful return from bio_integrity_prep(),
238 bio_integrity_set_tag() can be used to attach an opaque tag
239 buffer to a bio. Obviously this only makes sense if the I/O is
240 a WRITE.
241
242
243 int bio_integrity_get_tag(bio, void *tag_buf, len);
244
245 Similarly, at READ I/O completion time the filesystem can
246 retrieve the tag buffer using bio_integrity_get_tag().
247
248
2495.3 PASSING EXISTING INTEGRITY METADATA
250
251 Filesystems that either generate their own integrity metadata or
252 are capable of transferring IMD from user space can use the
253 following calls:
254
255
256 struct bip * bio_integrity_alloc(bio, gfp_mask, nr_pages);
257
258 Allocates the bio integrity payload and hangs it off of the bio.
259 nr_pages indicates how many pages of protection data need to be
260 stored in the integrity bio_vec list (similar to bio_alloc()).
261
262 The integrity payload will be freed at bio_free() time.
263
264
265 int bio_integrity_add_page(bio, page, len, offset);
266
267 Attaches a page containing integrity metadata to an existing
268 bio. The bio must have an existing bip,
269 i.e. bio_integrity_alloc() must have been called. For a WRITE,
270 the integrity metadata in the pages must be in a format
271 understood by the target device with the notable exception that
272 the sector numbers will be remapped as the request traverses the
273 I/O stack. This implies that the pages added using this call
274 will be modified during I/O! The first reference tag in the
275 integrity metadata must have a value of bip->bip_sector.
276
277 Pages can be added using bio_integrity_add_page() as long as
278 there is room in the bip bio_vec array (nr_pages).
279
280 Upon completion of a READ operation, the attached pages will
281 contain the integrity metadata received from the storage device.
282 It is up to the receiver to process them and verify data
283 integrity upon completion.
284
285
2865.4 REGISTERING A BLOCK DEVICE AS CAPABLE OF EXCHANGING INTEGRITY
287 METADATA
288
289 To enable integrity exchange on a block device the gendisk must be
290 registered as capable:
291
292 int blk_integrity_register(gendisk, blk_integrity);
293
294 The blk_integrity struct is a template and should contain the
295 following:
296
297 static struct blk_integrity my_profile = {
298 .name = "STANDARDSBODY-TYPE-VARIANT-CSUM",
299 .generate_fn = my_generate_fn,
300 .verify_fn = my_verify_fn,
301 .get_tag_fn = my_get_tag_fn,
302 .set_tag_fn = my_set_tag_fn,
303 .tuple_size = sizeof(struct my_tuple_size),
304 .tag_size = <tag bytes per hw sector>,
305 };
306
307 'name' is a text string which will be visible in sysfs. This is
308 part of the userland API so choose it carefully and never change
309 it. The format is standards body-type-variant.
310 E.g. T10-DIF-TYPE1-IP or T13-EPP-0-CRC.
311
312 'generate_fn' generates appropriate integrity metadata (for WRITE).
313
314 'verify_fn' verifies that the data buffer matches the integrity
315 metadata.
316
317 'tuple_size' must be set to match the size of the integrity
318 metadata per sector. I.e. 8 for DIF and EPP.
319
320 'tag_size' must be set to identify how many bytes of tag space
321 are available per hardware sector. For DIF this is either 2 or
322 0 depending on the value of the Control Mode Page ATO bit.
323
324 See 5.2 for a description of get_tag_fn and set_tag_fn.
325
326----------------------------------------------------------------------
3272007-12-24 Martin K. Petersen <martin.petersen@oracle.com>
diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt
index 824fc0274471..d9014aa0eb68 100644
--- a/Documentation/cgroups.txt
+++ b/Documentation/cgroups.txt
@@ -390,6 +390,10 @@ If you have several tasks to attach, you have to do it one after another:
390 ... 390 ...
391# /bin/echo PIDn > tasks 391# /bin/echo PIDn > tasks
392 392
393You can attach the current shell task by echoing 0:
394
395# echo 0 > tasks
396
3933. Kernel API 3973. Kernel API
394============= 398=============
395 399
diff --git a/Documentation/controllers/devices.txt b/Documentation/controllers/devices.txt
index 4dcea42432c2..7cc6e6a60672 100644
--- a/Documentation/controllers/devices.txt
+++ b/Documentation/controllers/devices.txt
@@ -13,7 +13,7 @@ either an integer or * for all. Access is a composition of r
13The root device cgroup starts with rwm to 'all'. A child device 13The root device cgroup starts with rwm to 'all'. A child device
14cgroup gets a copy of the parent. Administrators can then remove 14cgroup gets a copy of the parent. Administrators can then remove
15devices from the whitelist or add new entries. A child cgroup can 15devices from the whitelist or add new entries. A child cgroup can
16never receive a device access which is denied its parent. However 16never receive a device access which is denied by its parent. However
17when a device access is removed from a parent it will not also be 17when a device access is removed from a parent it will not also be
18removed from the child(ren). 18removed from the child(ren).
19 19
@@ -29,7 +29,11 @@ allows cgroup 1 to read and mknod the device usually known as
29 29
30 echo a > /cgroups/1/devices.deny 30 echo a > /cgroups/1/devices.deny
31 31
32will remove the default 'a *:* mrw' entry. 32will remove the default 'a *:* rwm' entry. Doing
33
34 echo a > /cgroups/1/devices.allow
35
36will add the 'a *:* rwm' entry to the whitelist.
33 37
343. Security 383. Security
35 39
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index 353504de3084..1f5a924d1e56 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -154,13 +154,15 @@ browsing and modifying the cpusets presently known to the kernel. No
154new system calls are added for cpusets - all support for querying and 154new system calls are added for cpusets - all support for querying and
155modifying cpusets is via this cpuset file system. 155modifying cpusets is via this cpuset file system.
156 156
157The /proc/<pid>/status file for each task has two added lines, 157The /proc/<pid>/status file for each task has four added lines,
158displaying the tasks cpus_allowed (on which CPUs it may be scheduled) 158displaying the tasks cpus_allowed (on which CPUs it may be scheduled)
159and mems_allowed (on which Memory Nodes it may obtain memory), 159and mems_allowed (on which Memory Nodes it may obtain memory),
160in the format seen in the following example: 160in the two formats seen in the following example:
161 161
162 Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff 162 Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff
163 Cpus_allowed_list: 0-127
163 Mems_allowed: ffffffff,ffffffff 164 Mems_allowed: ffffffff,ffffffff
165 Mems_allowed_list: 0-63
164 166
165Each cpuset is represented by a directory in the cgroup file system 167Each cpuset is represented by a directory in the cgroup file system
166containing (on top of the standard cgroup files) the following 168containing (on top of the standard cgroup files) the following
@@ -544,6 +546,9 @@ otherwise initial value -1 that indicates the cpuset has no request.
544 ( 4 : search nodes in a chunk of node [on NUMA system] ) 546 ( 4 : search nodes in a chunk of node [on NUMA system] )
545 ( 5 : search system wide [on NUMA system] ) 547 ( 5 : search system wide [on NUMA system] )
546 548
549The system default is architecture dependent. The system default
550can be changed using the relax_domain_level= boot parameter.
551
547This file is per-cpuset and affect the sched domain where the cpuset 552This file is per-cpuset and affect the sched domain where the cpuset
548belongs to. Therefore if the flag 'sched_load_balance' of a cpuset 553belongs to. Therefore if the flag 'sched_load_balance' of a cpuset
549is disabled, then 'sched_relax_domain_level' have no effect since 554is disabled, then 'sched_relax_domain_level' have no effect since
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index b61cb9564023..bd699da24666 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -14,9 +14,8 @@ represent the thread siblings to cpu X in the same physical package;
14To implement it in an architecture-neutral way, a new source file, 14To implement it in an architecture-neutral way, a new source file,
15drivers/base/topology.c, is to export the 4 attributes. 15drivers/base/topology.c, is to export the 4 attributes.
16 16
17If one architecture wants to support this feature, it just needs to 17For an architecture to support this feature, it must define some of
18implement 4 defines, typically in file include/asm-XXX/topology.h. 18these macros in include/asm-XXX/topology.h:
19The 4 defines are:
20#define topology_physical_package_id(cpu) 19#define topology_physical_package_id(cpu)
21#define topology_core_id(cpu) 20#define topology_core_id(cpu)
22#define topology_thread_siblings(cpu) 21#define topology_thread_siblings(cpu)
@@ -25,17 +24,10 @@ The 4 defines are:
25The type of **_id is int. 24The type of **_id is int.
26The type of siblings is cpumask_t. 25The type of siblings is cpumask_t.
27 26
28To be consistent on all architectures, the 4 attributes should have 27To be consistent on all architectures, include/linux/topology.h
29default values if their values are unavailable. Below is the rule. 28provides default definitions for any of the above macros that are
301) physical_package_id: If cpu has no physical package id, -1 is the 29not defined by include/asm-XXX/topology.h:
31default value. 301) physical_package_id: -1
322) core_id: If cpu doesn't support multi-core, its core id is 0. 312) core_id: 0
333) thread_siblings: Just include itself, if the cpu doesn't support 323) thread_siblings: just the given CPU
34HT/multi-thread. 334) core_siblings: just the given CPU
354) core_siblings: Just include itself, if the cpu doesn't support
36multi-core and HT/Multi-thread.
37
38So be careful when declaring the 4 defines in include/asm-XXX/topology.h.
39
40If an attribute isn't defined on an architecture, it won't be exported.
41
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 5b3f31faed56..65a1482457a8 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -222,13 +222,6 @@ Who: Thomas Gleixner <tglx@linutronix.de>
222 222
223--------------------------- 223---------------------------
224 224
225What: i2c-i810, i2c-prosavage and i2c-savage4
226When: May 2008
227Why: These drivers are superseded by i810fb, intelfb and savagefb.
228Who: Jean Delvare <khali@linux-fr.org>
229
230---------------------------
231
232What (Why): 225What (Why):
233 - include/linux/netfilter_ipv4/ipt_TOS.h ipt_tos.h header files 226 - include/linux/netfilter_ipv4/ipt_TOS.h ipt_tos.h header files
234 (superseded by xt_TOS/xt_tos target & match) 227 (superseded by xt_TOS/xt_tos target & match)
@@ -312,3 +305,12 @@ When: 2.6.26
312Why: Implementation became generic; users should now include 305Why: Implementation became generic; users should now include
313 linux/semaphore.h instead. 306 linux/semaphore.h instead.
314Who: Matthew Wilcox <willy@linux.intel.com> 307Who: Matthew Wilcox <willy@linux.intel.com>
308
309---------------------------
310
311What: CONFIG_THERMAL_HWMON
312When: January 2009
313Why: This option was introduced just to allow older lm-sensors userspace
314 to keep working over the upgrade to 2.6.26. At the scheduled time of
315 removal fixed lm-sensors (2.x or 3.x) should be readily available.
316Who: Rene Herman <rene.herman@gmail.com>
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 0c5086db8352..80e193d82e2e 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -13,72 +13,93 @@ Mailing list: linux-ext4@vger.kernel.org
131. Quick usage instructions: 131. Quick usage instructions:
14=========================== 14===========================
15 15
16 - Grab updated e2fsprogs from 16 - Compile and install the latest version of e2fsprogs (as of this
17 ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/ 17 writing version 1.41) from:
18 This is a patchset on top of e2fsprogs-1.39, which can be found at 18
19 http://sourceforge.net/project/showfiles.php?group_id=2406
20
21 or
22
19 ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ 23 ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
20 24
21 - It's still mke2fs -j /dev/hda1 25 or grab the latest git repository from:
26
27 git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
28
29 - Create a new filesystem using the ext4dev filesystem type:
30
31 # mke2fs -t ext4dev /dev/hda1
32
33 Or configure an existing ext3 filesystem to support extents and set
34 the test_fs flag to indicate that it's ok for an in-development
35 filesystem to touch this filesystem:
22 36
23 - mount /dev/hda1 /wherever -t ext4dev 37 # tune2fs -O extents -E test_fs /dev/hda1
24 38
25 - To enable extents, 39 If the filesystem was created with 128 byte inodes, it can be
40 converted to use 256 byte for greater efficiency via:
26 41
27 mount /dev/hda1 /wherever -t ext4dev -o extents 42 # tune2fs -I 256 /dev/hda1
28 43
29 - The filesystem is compatible with the ext3 driver until you add a file 44 (Note: we currently do not have tools to convert an ext4dev
30 which has extents (ie: `mount -o extents', then create a file). 45 filesystem back to ext3; so please do not try this on production
46 filesystems.)
31 47
32 NOTE: The "extents" mount flag is temporary. It will soon go away and 48 - Mounting:
33 extents will be enabled by the "-o extents" flag to mke2fs or tune2fs 49
50 # mount -t ext4dev /dev/hda1 /wherever
34 51
35 - When comparing performance with other filesystems, remember that 52 - When comparing performance with other filesystems, remember that
36 ext3/4 by default offers higher data integrity guarantees than most. So 53 ext3/4 by default offers higher data integrity guarantees than most.
37 when comparing with a metadata-only journalling filesystem, use `mount -o 54 So when comparing with a metadata-only journalling filesystem, such
38 data=writeback'. And you might as well use `mount -o nobh' too along 55 as ext3, use `mount -o data=writeback'. And you might as well use
39 with it. Making the journal larger than the mke2fs default often helps 56 `mount -o nobh' too along with it. Making the journal larger than
40 performance with metadata-intensive workloads. 57 the mke2fs default often helps performance with metadata-intensive
58 workloads.
41 59
422. Features 602. Features
43=========== 61===========
44 62
452.1 Currently available 632.1 Currently available
46 64
47* ability to use filesystems > 16TB 65* ability to use filesystems > 16TB (e2fsprogs support not available yet)
48* extent format reduces metadata overhead (RAM, IO for access, transactions) 66* extent format reduces metadata overhead (RAM, IO for access, transactions)
49* extent format more robust in face of on-disk corruption due to magics, 67* extent format more robust in face of on-disk corruption due to magics,
50* internal redundancy in tree 68* internal redundancy in tree
51 69* improved file allocation (multi-block alloc)
522.1 Previously available, soon to be enabled by default by "mkefs.ext4": 70* fix 32000 subdirectory limit
53 71* nsec timestamps for mtime, atime, ctime, create time
54* dir_index and resize inode will be on by default 72* inode version field on disk (NFSv4, Lustre)
55* large inodes will be used by default for fast EAs, nsec timestamps, etc 73* reduced e2fsck time via uninit_bg feature
74* journal checksumming for robustness, performance
75* persistent file preallocation (e.g for streaming media, databases)
76* ability to pack bitmaps and inode tables into larger virtual groups via the
77 flex_bg feature
78* large file support
79* Inode allocation using large virtual block groups via flex_bg
80* delayed allocation
81* large block (up to pagesize) support
82* efficient new ordered mode in JBD2 and ext4(avoid using buffer head to force
83 the ordering)
56 84
572.2 Candidate features for future inclusion 852.2 Candidate features for future inclusion
58 86
59There are several under discussion, whether they all make it in is 87* Online defrag (patches available but not well tested)
60partly a function of how much time everyone has to work on them: 88* reduced mke2fs time via lazy itable initialization in conjunction with
89 the uninit_bg feature (capability to do this is available in e2fsprogs
90 but a kernel thread to do lazy zeroing of unused inode table blocks
91 after filesystem is first mounted is required for safety)
61 92
62* improved file allocation (multi-block alloc, delayed alloc; basically done) 93There are several others under discussion, whether they all make it in is
63* fix 32000 subdirectory limit (patch exists, needs some e2fsck work) 94partly a function of how much time everyone has to work on them. Features like
64* nsec timestamps for mtime, atime, ctime, create time (patch exists, 95metadata checksumming have been discussed and planned for a bit but no patches
65 needs some e2fsck work) 96exist yet so I'm not sure they're in the near-term roadmap.
66* inode version field on disk (NFSv4, Lustre; prototype exists)
67* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists)
68* journal checksumming for robustness, performance (prototype exists)
69* persistent file preallocation (e.g for streaming media, databases)
70 97
71Features like metadata checksumming have been discussed and planned for 98The big performance win will come with mballoc, delalloc and flex_bg
72a bit but no patches exist yet so I'm not sure they're in the near-term 99grouping of bitmaps and inode tables. Some test results available here:
73roadmap.
74 100
75The big performance win will come with mballoc and delalloc. CFS has 101 - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html
76been using mballoc for a few years already with Lustre, and IBM + Bull 102 - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html
77did a lot of benchmarking on it. The reason it isn't in the first set of
78patches is partly a manageability issue, and partly because it doesn't
79directly affect the on-disk format (outside of much better allocation)
80so it isn't critical to get into the first round of changes. I believe
81Alex is working on a new set of patches right now.
82 103
833. Options 1043. Options
84========== 105==========
@@ -222,9 +243,11 @@ stripe=n Number of filesystem blocks that mballoc will try
222 to use for allocation size and alignment. For RAID5/6 243 to use for allocation size and alignment. For RAID5/6
223 systems this should be the number of data 244 systems this should be the number of data
224 disks * RAID chunk size in file system blocks. 245 disks * RAID chunk size in file system blocks.
225 246delalloc (*) Deferring block allocation until write-out time.
247nodelalloc Disable delayed allocation. Blocks are allocated
248 when data is copied from user to page cache.
226Data Mode 249Data Mode
227--------- 250=========
228There are 3 different data modes: 251There are 3 different data modes:
229 252
230* writeback mode 253* writeback mode
@@ -236,10 +259,10 @@ typically provide the best ext4 performance.
236 259
237* ordered mode 260* ordered mode
238In data=ordered mode, ext4 only officially journals metadata, but it logically 261In data=ordered mode, ext4 only officially journals metadata, but it logically
239groups metadata and data blocks into a single unit called a transaction. When 262groups metadata information related to data changes with the data blocks into a
240it's time to write the new metadata out to disk, the associated data blocks 263single unit called a transaction. When it's time to write the new metadata
241are written first. In general, this mode performs slightly slower than 264out to disk, the associated data blocks are written first. In general,
242writeback but significantly faster than journal mode. 265this mode performs slightly slower than writeback but significantly faster than journal mode.
243 266
244* journal mode 267* journal mode
245data=journal mode provides full data and metadata journaling. All new data is 268data=journal mode provides full data and metadata journaling. All new data is
@@ -247,7 +270,8 @@ written to the journal first, and then to its final location.
247In the event of a crash, the journal can be replayed, bringing both data and 270In the event of a crash, the journal can be replayed, bringing both data and
248metadata into a consistent state. This mode is the slowest except when data 271metadata into a consistent state. This mode is the slowest except when data
249needs to be read from and written to disk at the same time where it 272needs to be read from and written to disk at the same time where it
250outperforms all others modes. 273outperforms all others modes. Currently ext4 does not have delayed
274allocation support if this data journalling mode is selected.
251 275
252References 276References
253========== 277==========
@@ -256,7 +280,8 @@ kernel source: <file:fs/ext4/>
256 <file:fs/jbd2/> 280 <file:fs/jbd2/>
257 281
258programs: http://e2fsprogs.sourceforge.net/ 282programs: http://e2fsprogs.sourceforge.net/
259 http://ext2resize.sourceforge.net
260 283
261useful links: http://fedoraproject.org/wiki/ext3-devel 284useful links: http://fedoraproject.org/wiki/ext3-devel
262 http://www.bullopensource.org/ext4/ 285 http://www.bullopensource.org/ext4/
286 http://ext4.wiki.kernel.org/index.php/Main_Page
287 http://fedoraproject.org/wiki/Features/Ext4
diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.txt
new file mode 100644
index 000000000000..4dae9a3840bf
--- /dev/null
+++ b/Documentation/filesystems/gfs2-glocks.txt
@@ -0,0 +1,114 @@
1 Glock internal locking rules
2 ------------------------------
3
4This documents the basic principles of the glock state machine
5internals. Each glock (struct gfs2_glock in fs/gfs2/incore.h)
6has two main (internal) locks:
7
8 1. A spinlock (gl_spin) which protects the internal state such
9 as gl_state, gl_target and the list of holders (gl_holders)
10 2. A non-blocking bit lock, GLF_LOCK, which is used to prevent other
11 threads from making calls to the DLM, etc. at the same time. If a
12 thread takes this lock, it must then call run_queue (usually via the
13 workqueue) when it releases it in order to ensure any pending tasks
14 are completed.
15
16The gl_holders list contains all the queued lock requests (not
17just the holders) associated with the glock. If there are any
18held locks, then they will be contiguous entries at the head
19of the list. Locks are granted in strictly the order that they
20are queued, except for those marked LM_FLAG_PRIORITY which are
21used only during recovery, and even then only for journal locks.
22
23There are three lock states that users of the glock layer can request,
24namely shared (SH), deferred (DF) and exclusive (EX). Those translate
25to the following DLM lock modes:
26
27Glock mode | DLM lock mode
28------------------------------
29 UN | IV/NL Unlocked (no DLM lock associated with glock) or NL
30 SH | PR (Protected read)
31 DF | CW (Concurrent write)
32 EX | EX (Exclusive)
33
34Thus DF is basically a shared mode which is incompatible with the "normal"
35shared lock mode, SH. In GFS2 the DF mode is used exclusively for direct I/O
36operations. The glocks are basically a lock plus some routines which deal
37with cache management. The following rules apply for the cache:
38
39Glock mode | Cache data | Cache Metadata | Dirty Data | Dirty Metadata
40--------------------------------------------------------------------------
41 UN | No | No | No | No
42 SH | Yes | Yes | No | No
43 DF | No | Yes | No | No
44 EX | Yes | Yes | Yes | Yes
45
46These rules are implemented using the various glock operations which
47are defined for each type of glock. Not all types of glocks use
48all the modes. Only inode glocks use the DF mode for example.
49
50Table of glock operations and per type constants:
51
52Field | Purpose
53----------------------------------------------------------------------------
54go_xmote_th | Called before remote state change (e.g. to sync dirty data)
55go_xmote_bh | Called after remote state change (e.g. to refill cache)
56go_inval | Called if remote state change requires invalidating the cache
57go_demote_ok | Returns boolean value of whether it's ok to demote a glock
58 | (e.g. checks timeout, and that there is no cached data)
59go_lock | Called for the first local holder of a lock
60go_unlock | Called on the final local unlock of a lock
61go_dump | Called to print content of object for debugfs file, or on
62 | error to dump glock to the log.
63go_type | The type of the glock, LM_TYPE_.....
64go_min_hold_time | The minimum hold time
65
66The minimum hold time for each lock is the time after a remote lock
67grant for which we ignore remote demote requests. This is in order to
68prevent a situation where locks are being bounced around the cluster
69from node to node with none of the nodes making any progress. This
70tends to show up most with shared mmaped files which are being written
71to by multiple nodes. By delaying the demotion in response to a
72remote callback, that gives the userspace program time to make
73some progress before the pages are unmapped.
74
75There is a plan to try and remove the go_lock and go_unlock callbacks
76if possible, in order to try and speed up the fast path through the locking.
77Also, eventually we hope to make the glock "EX" mode locally shared
78such that any local locking will be done with the i_mutex as required
79rather than via the glock.
80
81Locking rules for glock operations:
82
83Operation | GLF_LOCK bit lock held | gl_spin spinlock held
84-----------------------------------------------------------------
85go_xmote_th | Yes | No
86go_xmote_bh | Yes | No
87go_inval | Yes | No
88go_demote_ok | Sometimes | Yes
89go_lock | Yes | No
90go_unlock | Yes | No
91go_dump | Sometimes | Yes
92
93N.B. Operations must not drop either the bit lock or the spinlock
94if it's held on entry. go_dump and go_demote_ok must never block.
95Note that go_dump will only be called if the glock's state
96indicates that it is caching uptodate data.
97
98Glock locking order within GFS2:
99
100 1. i_mutex (if required)
101 2. Rename glock (for rename only)
102 3. Inode glock(s)
103 (Parents before children, inodes at "same level" with same parent in
104 lock number order)
105 4. Rgrp glock(s) (for (de)allocation operations)
106 5. Transaction glock (via gfs2_trans_begin) for non-read operations
107 6. Page lock (always last, very important!)
108
109There are two glocks per inode. One deals with access to the inode
110itself (locking order as above), and the other, known as the iopen
111glock is used in conjunction with the i_nlink field in the inode to
112determine the lifetime of the inode in question. Locking of inodes
113is on a per-inode basis. Locking of rgrps is on a per rgrp basis.
114
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index dbc3c6a3650f..7f268f327d75 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -380,28 +380,35 @@ i386 and x86_64 platforms support the new IRQ vector displays.
380Of some interest is the introduction of the /proc/irq directory to 2.4. 380Of some interest is the introduction of the /proc/irq directory to 2.4.
381It could be used to set IRQ to CPU affinity, this means that you can "hook" an 381It could be used to set IRQ to CPU affinity, this means that you can "hook" an
382IRQ to only one CPU, or to exclude a CPU of handling IRQs. The contents of the 382IRQ to only one CPU, or to exclude a CPU of handling IRQs. The contents of the
383irq subdir is one subdir for each IRQ, and one file; prof_cpu_mask 383irq subdir is one subdir for each IRQ, and two files; default_smp_affinity and
384prof_cpu_mask.
384 385
385For example 386For example
386 > ls /proc/irq/ 387 > ls /proc/irq/
387 0 10 12 14 16 18 2 4 6 8 prof_cpu_mask 388 0 10 12 14 16 18 2 4 6 8 prof_cpu_mask
388 1 11 13 15 17 19 3 5 7 9 389 1 11 13 15 17 19 3 5 7 9 default_smp_affinity
389 > ls /proc/irq/0/ 390 > ls /proc/irq/0/
390 smp_affinity 391 smp_affinity
391 392
392The contents of the prof_cpu_mask file and each smp_affinity file for each IRQ 393smp_affinity is a bitmask, in which you can specify which CPUs can handle the
393is the same by default: 394IRQ, you can set it by doing:
394 395
395 > cat /proc/irq/0/smp_affinity 396 > echo 1 > /proc/irq/10/smp_affinity
396 ffffffff 397
398This means that only the first CPU will handle the IRQ, but you can also echo
3995 which means that only the first and third CPU can handle the IRQ.
397 400
398It's a bitmask, in which you can specify which CPUs can handle the IRQ, you can 401The contents of each smp_affinity file is the same by default:
399set it by doing: 402
403 > cat /proc/irq/0/smp_affinity
404 ffffffff
400 405
401 > echo 1 > /proc/irq/prof_cpu_mask 406The default_smp_affinity mask applies to all non-active IRQs, which are the
407IRQs which have not yet been allocated/activated, and hence which lack a
408/proc/irq/[0-9]* directory.
402 409
403This means that only the first CPU will handle the IRQ, but you can also echo 5 410prof_cpu_mask specifies which CPUs are to be profiled by the system wide
404which means that only the first and fourth CPU can handle the IRQ. 411profiler. Default value is ffffffff (all cpus).
405 412
406The way IRQs are routed is handled by the IO-APIC, and it's Round Robin 413The way IRQs are routed is handled by the IO-APIC, and it's Round Robin
407between all the CPUs which are allowed to handle it. As usual the kernel has 414between all the CPUs which are allowed to handle it. As usual the kernel has
diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt
new file mode 100644
index 000000000000..77d3faa1a611
--- /dev/null
+++ b/Documentation/ftrace.txt
@@ -0,0 +1,1361 @@
1 ftrace - Function Tracer
2 ========================
3
4Copyright 2008 Red Hat Inc.
5 Author: Steven Rostedt <srostedt@redhat.com>
6 License: The GNU Free Documentation License, Version 1.2
7Reviewers: Elias Oltmanns and Randy Dunlap
8
9Written for: 2.6.26-rc8 linux-2.6-tip.git tip/tracing/ftrace branch
10
11Introduction
12------------
13
14Ftrace is an internal tracer designed to help out developers and
15designers of systems to find what is going on inside the kernel.
16It can be used for debugging or analyzing latencies and performance
17issues that take place outside of user-space.
18
19Although ftrace is the function tracer, it also includes an
20infrastructure that allows for other types of tracing. Some of the
21tracers that are currently in ftrace include a tracer to trace
22context switches, the time it takes for a high priority task to
23run after it was woken up, the time interrupts are disabled, and
24more.
25
26
27The File System
28---------------
29
30Ftrace uses the debugfs file system to hold the control files as well
31as the files to display output.
32
33To mount the debugfs system:
34
35 # mkdir /debug
36 # mount -t debugfs nodev /debug
37
38
39That's it! (assuming that you have ftrace configured into your kernel)
40
41After mounting the debugfs, you can see a directory called
42"tracing". This directory contains the control and output files
43of ftrace. Here is a list of some of the key files:
44
45
46 Note: all time values are in microseconds.
47
48 current_tracer : This is used to set or display the current tracer
49 that is configured.
50
51 available_tracers : This holds the different types of tracers that
52 have been compiled into the kernel. The tracers
53 listed here can be configured by echoing in their
54 name into current_tracer.
55
56 tracing_enabled : This sets or displays whether the current_tracer
57 is activated and tracing or not. Echo 0 into this
58 file to disable the tracer or 1 (or non-zero) to
59 enable it.
60
61 trace : This file holds the output of the trace in a human readable
62 format.
63
64 latency_trace : This file shows the same trace but the information
65 is organized more to display possible latencies
66 in the system.
67
68 trace_pipe : The output is the same as the "trace" file but this
69 file is meant to be streamed with live tracing.
70 Reads from this file will block until new data
71 is retrieved. Unlike the "trace" and "latency_trace"
72 files, this file is a consumer. This means reading
73 from this file causes sequential reads to display
74 more current data. Once data is read from this
75 file, it is consumed, and will not be read
76 again with a sequential read. The "trace" and
77 "latency_trace" files are static, and if the
78 tracer isn't adding more data, they will display
79 the same information every time they are read.
80
81 iter_ctrl : This file lets the user control the amount of data
82 that is displayed in one of the above output
83 files.
84
85 tracing_max_latency : Some of the tracers record the max latency.
86 For example, the time interrupts are disabled.
87 This time is saved in this file. The max trace
88 will also be stored, and displayed by either
89 "trace" or "latency_trace". A new max trace will
90 only be recorded if the latency is greater than
91 the value in this file. (in microseconds)
92
93 trace_entries : This sets or displays the number of trace
94 entries each CPU buffer can hold. The tracer buffers
95 are the same size for each CPU, so care must be
96 taken when modifying the trace_entries. The trace
97 buffers are allocated in pages (blocks of memory that
98 the kernel uses for allocation, usually 4 KB in size).
99 Since each entry is smaller than a page, if the last
100 allocated page has room for more entries than were
101 requested, the rest of the page is used to allocate
102 entries.
103
104 This can only be updated when the current_tracer
105 is set to "none".
106
107 NOTE: It is planned on changing the allocated buffers
108 from being the number of possible CPUS to
109 the number of online CPUS.
110
111 tracing_cpumask : This is a mask that lets the user only trace
112 on specified CPUS. The format is a hex string
113 representing the CPUS.
114
115 set_ftrace_filter : When dynamic ftrace is configured in, the
116 code is dynamically modified to disable calling
117 of the function profiler (mcount). This lets
118 tracing be configured in with practically no overhead
119 in performance. This also has a side effect of
120 enabling or disabling specific functions to be
121 traced. Echoing in names of functions into this
122 file will limit the trace to only these functions.
123
124 set_ftrace_notrace: This has the opposite effect that
125 set_ftrace_filter has. Any function that is added
126 here will not be traced. If a function exists
127 in both set_ftrace_filter and set_ftrace_notrace,
128 the function will _not_ be traced.
129
130 available_filter_functions : When a function is encountered the first
131 time by the dynamic tracer, it is recorded and
132 later the call is converted into a nop. This file
133 lists the functions that have been recorded
134 by the dynamic tracer and these functions can
135 be used to set the ftrace filter by the above
136 "set_ftrace_filter" file.
137
138
139The Tracers
140-----------
141
142Here is the list of current tracers that can be configured.
143
144 ftrace - function tracer that uses mcount to trace all functions.
145 It is possible to filter which functions are
146 to be traced when dynamic ftrace is configured in.
147
148 sched_switch - traces the context switches between tasks.
149
150 irqsoff - traces the areas that disable interrupts and saves off
151 the trace with the longest max latency.
152 See tracing_max_latency. When a new max is recorded,
153 it replaces the old trace. It is best to view this
154 trace with the latency_trace file.
155
156 preemptoff - Similar to irqsoff but traces and records the time
157 preemption is disabled.
158
159 preemptirqsoff - Similar to irqsoff and preemptoff, but traces and
160 records the largest time irqs and/or preemption is
161 disabled.
162
163 wakeup - Traces and records the max latency that it takes for
164 the highest priority task to get scheduled after
165 it has been woken up.
166
167 none - This is not a tracer. To remove all tracers from tracing
168 simply echo "none" into current_tracer.
169
170
171Examples of using the tracer
172----------------------------
173
174Here are typical examples of using the tracers when controlling
175them only with the debugfs interface (without using any user-land utilities).
176
177Output format:
178--------------
179
180Here's an example of the output format of the file "trace"
181
182 --------
183# tracer: ftrace
184#
185# TASK-PID CPU# TIMESTAMP FUNCTION
186# | | | | |
187 bash-4251 [01] 10152.583854: path_put <-path_walk
188 bash-4251 [01] 10152.583855: dput <-path_put
189 bash-4251 [01] 10152.583855: _atomic_dec_and_lock <-dput
190 --------
191
192A header is printed with the name of the tracer that is represented. In this case
193the tracer is "ftrace". Then comes a header showing the format. Task name
194"bash", the task PID "4251", the CPU that it was running on
195"01", the timestamp in <secs>.<usecs> format, the function name that was
196traced "path_put" and the parent function that called this function
197"path_walk".
198
199The sched_switch tracer also includes tracing of task wake ups and
200context switches.
201
202 ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 2916:115:S
203 ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 10:115:S
204 ksoftirqd/1-7 [01] 1453.070013: 7:115:R ==> 10:115:R
205 events/1-10 [01] 1453.070013: 10:115:S ==> 2916:115:R
206 kondemand/1-2916 [01] 1453.070013: 2916:115:S ==> 7:115:R
207 ksoftirqd/1-7 [01] 1453.070013: 7:115:S ==> 0:140:R
208
209Wake ups are represented by a "+" and the context switches show
210"==>". The format is:
211
212 Context switches:
213
214 Previous task Next Task
215
216 <pid>:<prio>:<state> ==> <pid>:<prio>:<state>
217
218 Wake ups:
219
220 Current task Task waking up
221
222 <pid>:<prio>:<state> + <pid>:<prio>:<state>
223
224The prio is the internal kernel priority, which is inverse to the
225priority that is usually displayed by user-space tools. Zero represents
226the highest priority (99). Prio 100 starts the "nice" priorities with
227100 being equal to nice -20 and 139 being nice 19. The prio "140" is
228reserved for the idle task which is the lowest priority thread (pid 0).
229
230
231Latency trace format
232--------------------
233
234For traces that display latency times, the latency_trace file gives
235a bit more information to see why a latency happened. Here's a typical
236trace.
237
238# tracer: irqsoff
239#
240irqsoff latency trace v1.1.5 on 2.6.26-rc8
241--------------------------------------------------------------------
242 latency: 97 us, #3/3, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
243 -----------------
244 | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0)
245 -----------------
246 => started at: apic_timer_interrupt
247 => ended at: do_softirq
248
249# _------=> CPU#
250# / _-----=> irqs-off
251# | / _----=> need-resched
252# || / _---=> hardirq/softirq
253# ||| / _--=> preempt-depth
254# |||| /
255# ||||| delay
256# cmd pid ||||| time | caller
257# \ / ||||| \ | /
258 <idle>-0 0d..1 0us+: trace_hardirqs_off_thunk (apic_timer_interrupt)
259 <idle>-0 0d.s. 97us : __do_softirq (do_softirq)
260 <idle>-0 0d.s1 98us : trace_hardirqs_on (do_softirq)
261
262
263vim:ft=help
264
265
266This shows that the current tracer is "irqsoff" tracing the time
267interrupts are disabled. It gives the trace version and the kernel
268this was executed on (2.6.26-rc8). Then it displays the max latency
269in microsecs (97 us). The number of trace entries displayed
270and the total number recorded (both are three: #3/3). The type of
271preemption that was used (PREEMPT). VP, KP, SP, and HP are always zero
272and reserved for later use. #P is the number of online CPUS (#P:2).
273
274The task is the process that was running when the latency happened.
275(swapper pid: 0).
276
277The start and stop that caused the latencies:
278
279 apic_timer_interrupt is where the interrupts were disabled.
280 do_softirq is where they were enabled again.
281
282The next lines after the header are the trace itself. The header
283explains which is which.
284
285 cmd: The name of the process in the trace.
286
287 pid: The PID of that process.
288
289 CPU#: The CPU that the process was running on.
290
291 irqs-off: 'd' interrupts are disabled. '.' otherwise.
292
293 need-resched: 'N' task need_resched is set, '.' otherwise.
294
295 hardirq/softirq:
296 'H' - hard irq happened inside a softirq.
297 'h' - hard irq is running
298 's' - soft irq is running
299 '.' - normal context.
300
301 preempt-depth: The level of preempt_disabled
302
303The above is mostly meaningful for kernel developers.
304
305 time: This differs from the trace file output. The trace file output
306 included an absolute timestamp. The timestamp used by the
307 latency_trace file is relative to the start of the trace.
308
309 delay: This is just to help catch your eye a bit better. And
310 needs to be fixed to be only relative to the same CPU.
311 The marks are determined by the difference between this
312 current trace and the next trace.
313 '!' - greater than preempt_mark_thresh (default 100)
314 '+' - greater than 1 microsecond
315 ' ' - less than or equal to 1 microsecond.
316
317 The rest is the same as the 'trace' file.
318
319
320iter_ctrl
321---------
322
323The iter_ctrl file is used to control what gets printed in the trace
324output. To see what is available, simply cat the file:
325
326 cat /debug/tracing/iter_ctrl
327 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
328 noblock nostacktrace nosched-tree
329
330To disable one of the options, echo in the option prepended with "no".
331
332 echo noprint-parent > /debug/tracing/iter_ctrl
333
334To enable an option, leave off the "no".
335
336 echo sym-offset > /debug/tracing/iter_ctrl
337
338Here are the available options:
339
340 print-parent - On function traces, display the calling function
341 as well as the function being traced.
342
343 print-parent:
344 bash-4000 [01] 1477.606694: simple_strtoul <-strict_strtoul
345
346 noprint-parent:
347 bash-4000 [01] 1477.606694: simple_strtoul
348
349
350 sym-offset - Display not only the function name, but also the offset
351 in the function. For example, instead of seeing just
352 "ktime_get", you will see "ktime_get+0xb/0x20".
353
354 sym-offset:
355 bash-4000 [01] 1477.606694: simple_strtoul+0x6/0xa0
356
357 sym-addr - this will also display the function address as well as
358 the function name.
359
360 sym-addr:
361 bash-4000 [01] 1477.606694: simple_strtoul <c0339346>
362
363 verbose - This deals with the latency_trace file.
364
365 bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \
366 (+0.000ms): simple_strtoul (strict_strtoul)
367
368 raw - This will display raw numbers. This option is best for use with
369 user applications that can translate the raw numbers better than
370 having it done in the kernel.
371
372 hex - Similar to raw, but the numbers will be in a hexadecimal format.
373
374 bin - This will print out the formats in raw binary.
375
376 block - TBD (needs update)
377
378 stacktrace - This is one of the options that changes the trace itself.
379 When a trace is recorded, so is the stack of functions.
380 This allows for back traces of trace sites.
381
382 sched-tree - TBD (any users??)
383
384
385sched_switch
386------------
387
388This tracer simply records schedule switches. Here's an example
389of how to use it.
390
391 # echo sched_switch > /debug/tracing/current_tracer
392 # echo 1 > /debug/tracing/tracing_enabled
393 # sleep 1
394 # echo 0 > /debug/tracing/tracing_enabled
395 # cat /debug/tracing/trace
396
397# tracer: sched_switch
398#
399# TASK-PID CPU# TIMESTAMP FUNCTION
400# | | | | |
401 bash-3997 [01] 240.132281: 3997:120:R + 4055:120:R
402 bash-3997 [01] 240.132284: 3997:120:R ==> 4055:120:R
403 sleep-4055 [01] 240.132371: 4055:120:S ==> 3997:120:R
404 bash-3997 [01] 240.132454: 3997:120:R + 4055:120:S
405 bash-3997 [01] 240.132457: 3997:120:R ==> 4055:120:R
406 sleep-4055 [01] 240.132460: 4055:120:D ==> 3997:120:R
407 bash-3997 [01] 240.132463: 3997:120:R + 4055:120:D
408 bash-3997 [01] 240.132465: 3997:120:R ==> 4055:120:R
409 <idle>-0 [00] 240.132589: 0:140:R + 4:115:S
410 <idle>-0 [00] 240.132591: 0:140:R ==> 4:115:R
411 ksoftirqd/0-4 [00] 240.132595: 4:115:S ==> 0:140:R
412 <idle>-0 [00] 240.132598: 0:140:R + 4:115:S
413 <idle>-0 [00] 240.132599: 0:140:R ==> 4:115:R
414 ksoftirqd/0-4 [00] 240.132603: 4:115:S ==> 0:140:R
415 sleep-4055 [01] 240.133058: 4055:120:S ==> 3997:120:R
416 [...]
417
418
419As discussed previously for this format, the header shows
420the name of the trace and points to the options. The "FUNCTION"
421is a misnomer since here it represents the wake ups and context
422switches.
423
424The sched_switch tracer only lists the wake ups (represented with '+')
425and context switches ('==>') with the previous task or current
426first followed by the next task or task waking up. The format for both
427of these is PID:KERNEL-PRIO:TASK-STATE. Remember that the KERNEL-PRIO
428is the inverse of the actual priority with zero (0) being the highest
429priority and the nice values starting at 100 (nice -20). Below is
430a quick chart to map the kernel priority to user land priorities.
431
432 Kernel priority: 0 to 99 ==> user RT priority 99 to 0
433 Kernel priority: 100 to 139 ==> user nice -20 to 19
434 Kernel priority: 140 ==> idle task priority
435
436The task states are:
437
438 R - running : wants to run, may not actually be running
439 S - sleep : process is waiting to be woken up (handles signals)
440 D - deep sleep : process must be woken up (ignores signals)
441 T - stopped : process suspended
442 t - traced : process is being traced (with something like gdb)
443 Z - zombie : process waiting to be cleaned up
444 X - unknown
445
446
447ftrace_enabled
448--------------
449
450The following tracers give different output depending on whether
451or not the sysctl ftrace_enabled is set. To set ftrace_enabled,
452one can either use the sysctl function or set it via the proc
453file system interface.
454
455 sysctl kernel.ftrace_enabled=1
456
457 or
458
459 echo 1 > /proc/sys/kernel/ftrace_enabled
460
461To disable ftrace_enabled simply replace the '1' with '0' in
462the above commands.
463
464When ftrace_enabled is set the tracers will also record the functions
465that are within the trace. The descriptions of the tracers
466will also show an example with ftrace enabled.
467
468
469irqsoff
470-------
471
472When interrupts are disabled, the CPU can not react to any other
473external event (besides NMIs and SMIs). This prevents the timer
474interrupt from triggering or the mouse interrupt from letting the
475kernel know of a new mouse event. The result is a latency with the
476reaction time.
477
478The irqsoff tracer tracks the time interrupts are disabled to the time
479they are re-enabled. When a new maximum latency is hit, it saves off
480the trace so that it may be retrieved at a later time. Every time a
481new maximum is reached, the old saved trace is discarded and the new
482trace is saved.
483
484To reset the maximum, echo 0 into tracing_max_latency. Here's an
485example:
486
487 # echo irqsoff > /debug/tracing/current_tracer
488 # echo 0 > /debug/tracing/tracing_max_latency
489 # echo 1 > /debug/tracing/tracing_enabled
490 # ls -ltr
491 [...]
492 # echo 0 > /debug/tracing/tracing_enabled
493 # cat /debug/tracing/latency_trace
494# tracer: irqsoff
495#
496irqsoff latency trace v1.1.5 on 2.6.26-rc8
497--------------------------------------------------------------------
498 latency: 6 us, #3/3, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
499 -----------------
500 | task: bash-4269 (uid:0 nice:0 policy:0 rt_prio:0)
501 -----------------
502 => started at: copy_page_range
503 => ended at: copy_page_range
504
505# _------=> CPU#
506# / _-----=> irqs-off
507# | / _----=> need-resched
508# || / _---=> hardirq/softirq
509# ||| / _--=> preempt-depth
510# |||| /
511# ||||| delay
512# cmd pid ||||| time | caller
513# \ / ||||| \ | /
514 bash-4269 1...1 0us+: _spin_lock (copy_page_range)
515 bash-4269 1...1 7us : _spin_unlock (copy_page_range)
516 bash-4269 1...2 7us : trace_preempt_on (copy_page_range)
517
518
519vim:ft=help
520
521Here we see that we had a latency of 6 microsecs (which is
522very good). The spin_lock in copy_page_range disabled interrupts.
523The difference between the 6 and the displayed timestamp 7us is
524because the clock must have incremented between the time of recording
525the max latency and recording the function that had that latency.
526
527Note the above had ftrace_enabled not set. If we set the ftrace_enabled,
528we get a much larger output:
529
530# tracer: irqsoff
531#
532irqsoff latency trace v1.1.5 on 2.6.26-rc8
533--------------------------------------------------------------------
534 latency: 50 us, #101/101, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
535 -----------------
536 | task: ls-4339 (uid:0 nice:0 policy:0 rt_prio:0)
537 -----------------
538 => started at: __alloc_pages_internal
539 => ended at: __alloc_pages_internal
540
541# _------=> CPU#
542# / _-----=> irqs-off
543# | / _----=> need-resched
544# || / _---=> hardirq/softirq
545# ||| / _--=> preempt-depth
546# |||| /
547# ||||| delay
548# cmd pid ||||| time | caller
549# \ / ||||| \ | /
550 ls-4339 0...1 0us+: get_page_from_freelist (__alloc_pages_internal)
551 ls-4339 0d..1 3us : rmqueue_bulk (get_page_from_freelist)
552 ls-4339 0d..1 3us : _spin_lock (rmqueue_bulk)
553 ls-4339 0d..1 4us : add_preempt_count (_spin_lock)
554 ls-4339 0d..2 4us : __rmqueue (rmqueue_bulk)
555 ls-4339 0d..2 5us : __rmqueue_smallest (__rmqueue)
556 ls-4339 0d..2 5us : __mod_zone_page_state (__rmqueue_smallest)
557 ls-4339 0d..2 6us : __rmqueue (rmqueue_bulk)
558 ls-4339 0d..2 6us : __rmqueue_smallest (__rmqueue)
559 ls-4339 0d..2 7us : __mod_zone_page_state (__rmqueue_smallest)
560 ls-4339 0d..2 7us : __rmqueue (rmqueue_bulk)
561 ls-4339 0d..2 8us : __rmqueue_smallest (__rmqueue)
562[...]
563 ls-4339 0d..2 46us : __rmqueue_smallest (__rmqueue)
564 ls-4339 0d..2 47us : __mod_zone_page_state (__rmqueue_smallest)
565 ls-4339 0d..2 47us : __rmqueue (rmqueue_bulk)
566 ls-4339 0d..2 48us : __rmqueue_smallest (__rmqueue)
567 ls-4339 0d..2 48us : __mod_zone_page_state (__rmqueue_smallest)
568 ls-4339 0d..2 49us : _spin_unlock (rmqueue_bulk)
569 ls-4339 0d..2 49us : sub_preempt_count (_spin_unlock)
570 ls-4339 0d..1 50us : get_page_from_freelist (__alloc_pages_internal)
571 ls-4339 0d..2 51us : trace_hardirqs_on (__alloc_pages_internal)
572
573
574vim:ft=help
575
576
577Here we traced a 50 microsecond latency. But we also see all the
578functions that were called during that time. Note that by enabling
579function tracing, we endure an added overhead. This overhead may
580extend the latency times. But nevertheless, this trace has provided
581some very helpful debugging information.
582
583
584preemptoff
585----------
586
587When preemption is disabled, we may be able to receive interrupts but
588the task cannot be preempted and a higher priority task must wait
589for preemption to be enabled again before it can preempt a lower
590priority task.
591
592The preemptoff tracer traces the places that disable preemption.
593Like the irqsoff, it records the maximum latency that preemption
594was disabled. The control of preemptoff is much like the irqsoff.
595
596 # echo preemptoff > /debug/tracing/current_tracer
597 # echo 0 > /debug/tracing/tracing_max_latency
598 # echo 1 > /debug/tracing/tracing_enabled
599 # ls -ltr
600 [...]
601 # echo 0 > /debug/tracing/tracing_enabled
602 # cat /debug/tracing/latency_trace
603# tracer: preemptoff
604#
605preemptoff latency trace v1.1.5 on 2.6.26-rc8
606--------------------------------------------------------------------
607 latency: 29 us, #3/3, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
608 -----------------
609 | task: sshd-4261 (uid:0 nice:0 policy:0 rt_prio:0)
610 -----------------
611 => started at: do_IRQ
612 => ended at: __do_softirq
613
614# _------=> CPU#
615# / _-----=> irqs-off
616# | / _----=> need-resched
617# || / _---=> hardirq/softirq
618# ||| / _--=> preempt-depth
619# |||| /
620# ||||| delay
621# cmd pid ||||| time | caller
622# \ / ||||| \ | /
623 sshd-4261 0d.h. 0us+: irq_enter (do_IRQ)
624 sshd-4261 0d.s. 29us : _local_bh_enable (__do_softirq)
625 sshd-4261 0d.s1 30us : trace_preempt_on (__do_softirq)
626
627
628vim:ft=help
629
630This has some more changes. Preemption was disabled when an interrupt
631came in (notice the 'h'), and was enabled while doing a softirq.
632(notice the 's'). But we also see that interrupts have been disabled
633when entering the preempt off section and leaving it (the 'd').
634We do not know if interrupts were enabled in the mean time.
635
636# tracer: preemptoff
637#
638preemptoff latency trace v1.1.5 on 2.6.26-rc8
639--------------------------------------------------------------------
640 latency: 63 us, #87/87, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
641 -----------------
642 | task: sshd-4261 (uid:0 nice:0 policy:0 rt_prio:0)
643 -----------------
644 => started at: remove_wait_queue
645 => ended at: __do_softirq
646
647# _------=> CPU#
648# / _-----=> irqs-off
649# | / _----=> need-resched
650# || / _---=> hardirq/softirq
651# ||| / _--=> preempt-depth
652# |||| /
653# ||||| delay
654# cmd pid ||||| time | caller
655# \ / ||||| \ | /
656 sshd-4261 0d..1 0us : _spin_lock_irqsave (remove_wait_queue)
657 sshd-4261 0d..1 1us : _spin_unlock_irqrestore (remove_wait_queue)
658 sshd-4261 0d..1 2us : do_IRQ (common_interrupt)
659 sshd-4261 0d..1 2us : irq_enter (do_IRQ)
660 sshd-4261 0d..1 2us : idle_cpu (irq_enter)
661 sshd-4261 0d..1 3us : add_preempt_count (irq_enter)
662 sshd-4261 0d.h1 3us : idle_cpu (irq_enter)
663 sshd-4261 0d.h. 4us : handle_fasteoi_irq (do_IRQ)
664[...]
665 sshd-4261 0d.h. 12us : add_preempt_count (_spin_lock)
666 sshd-4261 0d.h1 12us : ack_ioapic_quirk_irq (handle_fasteoi_irq)
667 sshd-4261 0d.h1 13us : move_native_irq (ack_ioapic_quirk_irq)
668 sshd-4261 0d.h1 13us : _spin_unlock (handle_fasteoi_irq)
669 sshd-4261 0d.h1 14us : sub_preempt_count (_spin_unlock)
670 sshd-4261 0d.h1 14us : irq_exit (do_IRQ)
671 sshd-4261 0d.h1 15us : sub_preempt_count (irq_exit)
672 sshd-4261 0d..2 15us : do_softirq (irq_exit)
673 sshd-4261 0d... 15us : __do_softirq (do_softirq)
674 sshd-4261 0d... 16us : __local_bh_disable (__do_softirq)
675 sshd-4261 0d... 16us+: add_preempt_count (__local_bh_disable)
676 sshd-4261 0d.s4 20us : add_preempt_count (__local_bh_disable)
677 sshd-4261 0d.s4 21us : sub_preempt_count (local_bh_enable)
678 sshd-4261 0d.s5 21us : sub_preempt_count (local_bh_enable)
679[...]
680 sshd-4261 0d.s6 41us : add_preempt_count (__local_bh_disable)
681 sshd-4261 0d.s6 42us : sub_preempt_count (local_bh_enable)
682 sshd-4261 0d.s7 42us : sub_preempt_count (local_bh_enable)
683 sshd-4261 0d.s5 43us : add_preempt_count (__local_bh_disable)
684 sshd-4261 0d.s5 43us : sub_preempt_count (local_bh_enable_ip)
685 sshd-4261 0d.s6 44us : sub_preempt_count (local_bh_enable_ip)
686 sshd-4261 0d.s5 44us : add_preempt_count (__local_bh_disable)
687 sshd-4261 0d.s5 45us : sub_preempt_count (local_bh_enable)
688[...]
689 sshd-4261 0d.s. 63us : _local_bh_enable (__do_softirq)
690 sshd-4261 0d.s1 64us : trace_preempt_on (__do_softirq)
691
692
693The above is an example of the preemptoff trace with ftrace_enabled
694set. Here we see that interrupts were disabled the entire time.
695The irq_enter code lets us know that we entered an interrupt 'h'.
696Before that, the functions being traced still show that it is not
697in an interrupt, but we can see by the functions themselves that
698this is not the case.
699
700Notice that the __do_softirq when called doesn't have a preempt_count.
701It may seem that we missed a preempt enabled. What really happened
702is that the preempt count is held on the thread's stack and we
703switched to the softirq stack (4K stacks in effect). The code
704does not copy the preempt count, but because interrupts are disabled,
705we don't need to worry about it. Having a tracer like this is good
706to let people know what really happens inside the kernel.
707
708
709preemptirqsoff
710--------------
711
712Knowing the locations that have interrupts disabled or preemption
713disabled for the longest times is helpful. But sometimes we would
714like to know when either preemption and/or interrupts are disabled.
715
716The following code:
717
718 local_irq_disable();
719 call_function_with_irqs_off();
720 preempt_disable();
721 call_function_with_irqs_and_preemption_off();
722 local_irq_enable();
723 call_function_with_preemption_off();
724 preempt_enable();
725
726The irqsoff tracer will record the total length of
727call_function_with_irqs_off() and
728call_function_with_irqs_and_preemption_off().
729
730The preemptoff tracer will record the total length of
731call_function_with_irqs_and_preemption_off() and
732call_function_with_preemption_off().
733
734But neither will trace the time that interrupts and/or preemption
735is disabled. This total time is the time that we can not schedule.
736To record this time, use the preemptirqsoff tracer.
737
738Again, using this trace is much like the irqsoff and preemptoff tracers.
739
740 # echo preemptirqsoff > /debug/tracing/current_tracer
741 # echo 0 > /debug/tracing/tracing_max_latency
742 # echo 1 > /debug/tracing/tracing_enabled
743 # ls -ltr
744 [...]
745 # echo 0 > /debug/tracing/tracing_enabled
746 # cat /debug/tracing/latency_trace
747# tracer: preemptirqsoff
748#
749preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
750--------------------------------------------------------------------
751 latency: 293 us, #3/3, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
752 -----------------
753 | task: ls-4860 (uid:0 nice:0 policy:0 rt_prio:0)
754 -----------------
755 => started at: apic_timer_interrupt
756 => ended at: __do_softirq
757
758# _------=> CPU#
759# / _-----=> irqs-off
760# | / _----=> need-resched
761# || / _---=> hardirq/softirq
762# ||| / _--=> preempt-depth
763# |||| /
764# ||||| delay
765# cmd pid ||||| time | caller
766# \ / ||||| \ | /
767 ls-4860 0d... 0us!: trace_hardirqs_off_thunk (apic_timer_interrupt)
768 ls-4860 0d.s. 294us : _local_bh_enable (__do_softirq)
769 ls-4860 0d.s1 294us : trace_preempt_on (__do_softirq)
770
771
772vim:ft=help
773
774
775The trace_hardirqs_off_thunk is called from assembly on x86 when
776interrupts are disabled in the assembly code. Without the function
777tracing, we don't know if interrupts were enabled within the preemption
778points. We do see that it started with preemption enabled.
779
780Here is a trace with ftrace_enabled set:
781
782
783# tracer: preemptirqsoff
784#
785preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
786--------------------------------------------------------------------
787 latency: 105 us, #183/183, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
788 -----------------
789 | task: sshd-4261 (uid:0 nice:0 policy:0 rt_prio:0)
790 -----------------
791 => started at: write_chan
792 => ended at: __do_softirq
793
794# _------=> CPU#
795# / _-----=> irqs-off
796# | / _----=> need-resched
797# || / _---=> hardirq/softirq
798# ||| / _--=> preempt-depth
799# |||| /
800# ||||| delay
801# cmd pid ||||| time | caller
802# \ / ||||| \ | /
803 ls-4473 0.N.. 0us : preempt_schedule (write_chan)
804 ls-4473 0dN.1 1us : _spin_lock (schedule)
805 ls-4473 0dN.1 2us : add_preempt_count (_spin_lock)
806 ls-4473 0d..2 2us : put_prev_task_fair (schedule)
807[...]
808 ls-4473 0d..2 13us : set_normalized_timespec (ktime_get_ts)
809 ls-4473 0d..2 13us : __switch_to (schedule)
810 sshd-4261 0d..2 14us : finish_task_switch (schedule)
811 sshd-4261 0d..2 14us : _spin_unlock_irq (finish_task_switch)
812 sshd-4261 0d..1 15us : add_preempt_count (_spin_lock_irqsave)
813 sshd-4261 0d..2 16us : _spin_unlock_irqrestore (hrtick_set)
814 sshd-4261 0d..2 16us : do_IRQ (common_interrupt)
815 sshd-4261 0d..2 17us : irq_enter (do_IRQ)
816 sshd-4261 0d..2 17us : idle_cpu (irq_enter)
817 sshd-4261 0d..2 18us : add_preempt_count (irq_enter)
818 sshd-4261 0d.h2 18us : idle_cpu (irq_enter)
819 sshd-4261 0d.h. 18us : handle_fasteoi_irq (do_IRQ)
820 sshd-4261 0d.h. 19us : _spin_lock (handle_fasteoi_irq)
821 sshd-4261 0d.h. 19us : add_preempt_count (_spin_lock)
822 sshd-4261 0d.h1 20us : _spin_unlock (handle_fasteoi_irq)
823 sshd-4261 0d.h1 20us : sub_preempt_count (_spin_unlock)
824[...]
825 sshd-4261 0d.h1 28us : _spin_unlock (handle_fasteoi_irq)
826 sshd-4261 0d.h1 29us : sub_preempt_count (_spin_unlock)
827 sshd-4261 0d.h2 29us : irq_exit (do_IRQ)
828 sshd-4261 0d.h2 29us : sub_preempt_count (irq_exit)
829 sshd-4261 0d..3 30us : do_softirq (irq_exit)
830 sshd-4261 0d... 30us : __do_softirq (do_softirq)
831 sshd-4261 0d... 31us : __local_bh_disable (__do_softirq)
832 sshd-4261 0d... 31us+: add_preempt_count (__local_bh_disable)
833 sshd-4261 0d.s4 34us : add_preempt_count (__local_bh_disable)
834[...]
835 sshd-4261 0d.s3 43us : sub_preempt_count (local_bh_enable_ip)
836 sshd-4261 0d.s4 44us : sub_preempt_count (local_bh_enable_ip)
837 sshd-4261 0d.s3 44us : smp_apic_timer_interrupt (apic_timer_interrupt)
838 sshd-4261 0d.s3 45us : irq_enter (smp_apic_timer_interrupt)
839 sshd-4261 0d.s3 45us : idle_cpu (irq_enter)
840 sshd-4261 0d.s3 46us : add_preempt_count (irq_enter)
841 sshd-4261 0d.H3 46us : idle_cpu (irq_enter)
842 sshd-4261 0d.H3 47us : hrtimer_interrupt (smp_apic_timer_interrupt)
843 sshd-4261 0d.H3 47us : ktime_get (hrtimer_interrupt)
844[...]
845 sshd-4261 0d.H3 81us : tick_program_event (hrtimer_interrupt)
846 sshd-4261 0d.H3 82us : ktime_get (tick_program_event)
847 sshd-4261 0d.H3 82us : ktime_get_ts (ktime_get)
848 sshd-4261 0d.H3 83us : getnstimeofday (ktime_get_ts)
849 sshd-4261 0d.H3 83us : set_normalized_timespec (ktime_get_ts)
850 sshd-4261 0d.H3 84us : clockevents_program_event (tick_program_event)
851 sshd-4261 0d.H3 84us : lapic_next_event (clockevents_program_event)
852 sshd-4261 0d.H3 85us : irq_exit (smp_apic_timer_interrupt)
853 sshd-4261 0d.H3 85us : sub_preempt_count (irq_exit)
854 sshd-4261 0d.s4 86us : sub_preempt_count (irq_exit)
855 sshd-4261 0d.s3 86us : add_preempt_count (__local_bh_disable)
856[...]
857 sshd-4261 0d.s1 98us : sub_preempt_count (net_rx_action)
858 sshd-4261 0d.s. 99us : add_preempt_count (_spin_lock_irq)
859 sshd-4261 0d.s1 99us+: _spin_unlock_irq (run_timer_softirq)
860 sshd-4261 0d.s. 104us : _local_bh_enable (__do_softirq)
861 sshd-4261 0d.s. 104us : sub_preempt_count (_local_bh_enable)
862 sshd-4261 0d.s. 105us : _local_bh_enable (__do_softirq)
863 sshd-4261 0d.s1 105us : trace_preempt_on (__do_softirq)
864
865
866This is a very interesting trace. It started with the preemption of
867the ls task. We see that the task had the "need_resched" bit set
868with the 'N' in the trace. Interrupts are disabled in the spin_lock
869and the trace started. We see that a schedule took place to run
870sshd. When the interrupts were enabled, we took an interrupt.
871On return from the interrupt handler, the softirq ran. We took another
872interrupt while running the softirq as we see with the capital 'H'.
873
874
875wakeup
876------
877
878In a Real-Time environment it is very important to know the wakeup
879time it takes for the highest priority task that wakes up to the
880time it executes. This is also known as "schedule latency".
881I stress the point that this is about RT tasks. It is also important
882to know the scheduling latency of non-RT tasks, but the average
883schedule latency is better for non-RT tasks. Tools like
884LatencyTop are more appropriate for such measurements.
885
886Real-Time environments are interested in the worst case latency.
887That is the longest latency it takes for something to happen, and
888not the average. We can have a very fast scheduler that may only
889have a large latency once in a while, but that would not work well
890with Real-Time tasks. The wakeup tracer was designed to record
891the worst case wakeups of RT tasks. Non-RT tasks are not recorded
892because the tracer only records one worst case and tracing non-RT
893tasks that are unpredictable will overwrite the worst case latency
894of RT tasks.
895
896Since this tracer only deals with RT tasks, we will run this slightly
897differently than we did with the previous tracers. Instead of performing
898an 'ls', we will run 'sleep 1' under 'chrt' which changes the
899priority of the task.
900
901 # echo wakeup > /debug/tracing/current_tracer
902 # echo 0 > /debug/tracing/tracing_max_latency
903 # echo 1 > /debug/tracing/tracing_enabled
904 # chrt -f 5 sleep 1
905 # echo 0 > /debug/tracing/tracing_enabled
906 # cat /debug/tracing/latency_trace
907# tracer: wakeup
908#
909wakeup latency trace v1.1.5 on 2.6.26-rc8
910--------------------------------------------------------------------
911 latency: 4 us, #2/2, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
912 -----------------
913 | task: sleep-4901 (uid:0 nice:0 policy:1 rt_prio:5)
914 -----------------
915
916# _------=> CPU#
917# / _-----=> irqs-off
918# | / _----=> need-resched
919# || / _---=> hardirq/softirq
920# ||| / _--=> preempt-depth
921# |||| /
922# ||||| delay
923# cmd pid ||||| time | caller
924# \ / ||||| \ | /
925 <idle>-0 1d.h4 0us+: try_to_wake_up (wake_up_process)
926 <idle>-0 1d..4 4us : schedule (cpu_idle)
927
928
929vim:ft=help
930
931
932Running this on an idle system, we see that it only took 4 microseconds
933to perform the task switch. Note, since the trace marker in the
934schedule is before the actual "switch", we stop the tracing when
935the recorded task is about to schedule in. This may change if
936we add a new marker at the end of the scheduler.
937
938Notice that the recorded task is 'sleep' with the PID of 4901 and it
939has an rt_prio of 5. This priority is user-space priority and not
940the internal kernel priority. The policy is 1 for SCHED_FIFO and 2
941for SCHED_RR.
942
943Doing the same with chrt -r 5 and ftrace_enabled set.
944
945# tracer: wakeup
946#
947wakeup latency trace v1.1.5 on 2.6.26-rc8
948--------------------------------------------------------------------
949 latency: 50 us, #60/60, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
950 -----------------
951 | task: sleep-4068 (uid:0 nice:0 policy:2 rt_prio:5)
952 -----------------
953
954# _------=> CPU#
955# / _-----=> irqs-off
956# | / _----=> need-resched
957# || / _---=> hardirq/softirq
958# ||| / _--=> preempt-depth
959# |||| /
960# ||||| delay
961# cmd pid ||||| time | caller
962# \ / ||||| \ | /
963ksoftirq-7 1d.H3 0us : try_to_wake_up (wake_up_process)
964ksoftirq-7 1d.H4 1us : sub_preempt_count (marker_probe_cb)
965ksoftirq-7 1d.H3 2us : check_preempt_wakeup (try_to_wake_up)
966ksoftirq-7 1d.H3 3us : update_curr (check_preempt_wakeup)
967ksoftirq-7 1d.H3 4us : calc_delta_mine (update_curr)
968ksoftirq-7 1d.H3 5us : __resched_task (check_preempt_wakeup)
969ksoftirq-7 1d.H3 6us : task_wake_up_rt (try_to_wake_up)
970ksoftirq-7 1d.H3 7us : _spin_unlock_irqrestore (try_to_wake_up)
971[...]
972ksoftirq-7 1d.H2 17us : irq_exit (smp_apic_timer_interrupt)
973ksoftirq-7 1d.H2 18us : sub_preempt_count (irq_exit)
974ksoftirq-7 1d.s3 19us : sub_preempt_count (irq_exit)
975ksoftirq-7 1..s2 20us : rcu_process_callbacks (__do_softirq)
976[...]
977ksoftirq-7 1..s2 26us : __rcu_process_callbacks (rcu_process_callbacks)
978ksoftirq-7 1d.s2 27us : _local_bh_enable (__do_softirq)
979ksoftirq-7 1d.s2 28us : sub_preempt_count (_local_bh_enable)
980ksoftirq-7 1.N.3 29us : sub_preempt_count (ksoftirqd)
981ksoftirq-7 1.N.2 30us : _cond_resched (ksoftirqd)
982ksoftirq-7 1.N.2 31us : __cond_resched (_cond_resched)
983ksoftirq-7 1.N.2 32us : add_preempt_count (__cond_resched)
984ksoftirq-7 1.N.2 33us : schedule (__cond_resched)
985ksoftirq-7 1.N.2 33us : add_preempt_count (schedule)
986ksoftirq-7 1.N.3 34us : hrtick_clear (schedule)
987ksoftirq-7 1dN.3 35us : _spin_lock (schedule)
988ksoftirq-7 1dN.3 36us : add_preempt_count (_spin_lock)
989ksoftirq-7 1d..4 37us : put_prev_task_fair (schedule)
990ksoftirq-7 1d..4 38us : update_curr (put_prev_task_fair)
991[...]
992ksoftirq-7 1d..5 47us : _spin_trylock (tracing_record_cmdline)
993ksoftirq-7 1d..5 48us : add_preempt_count (_spin_trylock)
994ksoftirq-7 1d..6 49us : _spin_unlock (tracing_record_cmdline)
995ksoftirq-7 1d..6 49us : sub_preempt_count (_spin_unlock)
996ksoftirq-7 1d..4 50us : schedule (__cond_resched)
997
998The interrupt went off while running ksoftirqd. This task runs at
999SCHED_OTHER. Why didn't we see the 'N' set early? This may be
1000a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K stacks
1001configured, the interrupt and softirq run with their own stack.
1002Some information is held on the top of the task's stack (need_resched
1003and preempt_count are both stored there). The setting of the NEED_RESCHED
1004bit is done directly to the task's stack, but the reading of the
1005NEED_RESCHED is done by looking at the current stack, which in this case
1006is the stack for the hard interrupt. This hides the fact that NEED_RESCHED
1007has been set. We don't see the 'N' until we switch back to the task's
1008assigned stack.
1009
1010ftrace
1011------
1012
1013ftrace is not only the name of the tracing infrastructure, but it
1014is also a name of one of the tracers. The tracer is the function
1015tracer. Enabling the function tracer can be done from the
1016debug file system. Make sure ftrace_enabled is set; otherwise
1017this tracer is a nop.
1018
1019 # sysctl kernel.ftrace_enabled=1
1020 # echo ftrace > /debug/tracing/current_tracer
1021 # echo 1 > /debug/tracing/tracing_enabled
1022 # usleep 1
1023 # echo 0 > /debug/tracing/tracing_enabled
1024 # cat /debug/tracing/trace
1025# tracer: ftrace
1026#
1027# TASK-PID CPU# TIMESTAMP FUNCTION
1028# | | | | |
1029 bash-4003 [00] 123.638713: finish_task_switch <-schedule
1030 bash-4003 [00] 123.638714: _spin_unlock_irq <-finish_task_switch
1031 bash-4003 [00] 123.638714: sub_preempt_count <-_spin_unlock_irq
1032 bash-4003 [00] 123.638715: hrtick_set <-schedule
1033 bash-4003 [00] 123.638715: _spin_lock_irqsave <-hrtick_set
1034 bash-4003 [00] 123.638716: add_preempt_count <-_spin_lock_irqsave
1035 bash-4003 [00] 123.638716: _spin_unlock_irqrestore <-hrtick_set
1036 bash-4003 [00] 123.638717: sub_preempt_count <-_spin_unlock_irqrestore
1037 bash-4003 [00] 123.638717: hrtick_clear <-hrtick_set
1038 bash-4003 [00] 123.638718: sub_preempt_count <-schedule
1039 bash-4003 [00] 123.638718: sub_preempt_count <-preempt_schedule
1040 bash-4003 [00] 123.638719: wait_for_completion <-__stop_machine_run
1041 bash-4003 [00] 123.638719: wait_for_common <-wait_for_completion
1042 bash-4003 [00] 123.638720: _spin_lock_irq <-wait_for_common
1043 bash-4003 [00] 123.638720: add_preempt_count <-_spin_lock_irq
1044[...]
1045
1046
1047Note: It is sometimes better to enable or disable tracing directly from
1048a program, because the buffer may be overflowed by the echo commands
1049before you get to the point you want to trace. It is also easier to
1050stop the tracing at the point that you hit the part that you are
1051interested in. Since the ftrace buffer is a ring buffer with the
1052oldest data being overwritten, usually it is sufficient to start the
1053tracer with an echo command but have your code stop it. Something
1054like the following is usually appropriate for this.
1055
1056int trace_fd;
1057[...]
1058int main(int argc, char *argv[]) {
1059 [...]
1060 trace_fd = open("/debug/tracing/tracing_enabled", O_WRONLY);
1061 [...]
1062 if (condition_hit()) {
1063 write(trace_fd, "0", 1);
1064 }
1065 [...]
1066}
1067
1068
1069dynamic ftrace
1070--------------
1071
1072If CONFIG_DYNAMIC_FTRACE is set, then the system will run with
1073virtually no overhead when function tracing is disabled. The way
1074this works is the mcount function call (placed at the start of
1075every kernel function, produced by the -pg switch in gcc), starts
1076off pointing to a simple return.
1077
1078When dynamic ftrace is initialized, it calls kstop_machine to make
1079the machine act like a uniprocessor so that it can freely modify code
1080without worrying about other processors executing that same code. At
1081initialization, the mcount calls are changed to call a "record_ip"
1082function. After this, the first time a kernel function is called,
1083it has the calling address saved in a hash table.
1084
1085Later on the ftraced kernel thread is awoken and will again call
1086kstop_machine if new functions have been recorded. The ftraced thread
1087will change all calls to mcount to "nop". Just calling mcount
1088and having mcount return has shown a 10% overhead. By converting
1089it to a nop, there is no recordable overhead to the system.
1090
1091One special side-effect to the recording of the functions being
1092traced, is that we can now selectively choose which functions we
1093want to trace and which ones we want the mcount calls to remain as
1094nops.
1095
1096Two files are used, one for enabling and one for disabling the tracing
1097of recorded functions. They are:
1098
1099 set_ftrace_filter
1100
1101and
1102
1103 set_ftrace_notrace
1104
1105A list of available functions that you can add to these files is listed
1106in:
1107
1108 available_filter_functions
1109
1110 # cat /debug/tracing/available_filter_functions
1111put_prev_task_idle
1112kmem_cache_create
1113pick_next_task_rt
1114get_online_cpus
1115pick_next_task_fair
1116mutex_lock
1117[...]
1118
1119If I'm only interested in sys_nanosleep and hrtimer_interrupt:
1120
1121 # echo sys_nanosleep hrtimer_interrupt \
1122 > /debug/tracing/set_ftrace_filter
1123 # echo ftrace > /debug/tracing/current_tracer
1124 # echo 1 > /debug/tracing/tracing_enabled
1125 # usleep 1
1126 # echo 0 > /debug/tracing/tracing_enabled
1127 # cat /debug/tracing/trace
1128# tracer: ftrace
1129#
1130# TASK-PID CPU# TIMESTAMP FUNCTION
1131# | | | | |
1132 usleep-4134 [00] 1317.070017: hrtimer_interrupt <-smp_apic_timer_interrupt
1133 usleep-4134 [00] 1317.070111: sys_nanosleep <-syscall_call
1134 <idle>-0 [00] 1317.070115: hrtimer_interrupt <-smp_apic_timer_interrupt
1135
1136To see what functions are being traced, you can cat the file:
1137
1138 # cat /debug/tracing/set_ftrace_filter
1139hrtimer_interrupt
1140sys_nanosleep
1141
1142
1143Perhaps this isn't enough. The filters also allow simple wild cards.
1144Only the following are currently available
1145
1146 <match>* - will match functions that begin with <match>
1147 *<match> - will match functions that end with <match>
1148 *<match>* - will match functions that have <match> in it
1149
1150That's all the wild cards that are allowed.
1151
1152 <match>*<match> will not work.
1153
1154 # echo hrtimer_* > /debug/tracing/set_ftrace_filter
1155
1156Produces:
1157
1158# tracer: ftrace
1159#
1160# TASK-PID CPU# TIMESTAMP FUNCTION
1161# | | | | |
1162 bash-4003 [00] 1480.611794: hrtimer_init <-copy_process
1163 bash-4003 [00] 1480.611941: hrtimer_start <-hrtick_set
1164 bash-4003 [00] 1480.611956: hrtimer_cancel <-hrtick_clear
1165 bash-4003 [00] 1480.611956: hrtimer_try_to_cancel <-hrtimer_cancel
1166 <idle>-0 [00] 1480.612019: hrtimer_get_next_event <-get_next_timer_interrupt
1167 <idle>-0 [00] 1480.612025: hrtimer_get_next_event <-get_next_timer_interrupt
1168 <idle>-0 [00] 1480.612032: hrtimer_get_next_event <-get_next_timer_interrupt
1169 <idle>-0 [00] 1480.612037: hrtimer_get_next_event <-get_next_timer_interrupt
1170 <idle>-0 [00] 1480.612382: hrtimer_get_next_event <-get_next_timer_interrupt
1171
1172
1173Notice that we lost the sys_nanosleep.
1174
1175 # cat /debug/tracing/set_ftrace_filter
1176hrtimer_run_queues
1177hrtimer_run_pending
1178hrtimer_init
1179hrtimer_cancel
1180hrtimer_try_to_cancel
1181hrtimer_forward
1182hrtimer_start
1183hrtimer_reprogram
1184hrtimer_force_reprogram
1185hrtimer_get_next_event
1186hrtimer_interrupt
1187hrtimer_nanosleep
1188hrtimer_wakeup
1189hrtimer_get_remaining
1190hrtimer_get_res
1191hrtimer_init_sleeper
1192
1193
1194This is because the '>' and '>>' act just like they do in bash.
1195To rewrite the filters, use '>'
1196To append to the filters, use '>>'
1197
1198To clear out a filter so that all functions will be recorded again:
1199
1200 # echo > /debug/tracing/set_ftrace_filter
1201 # cat /debug/tracing/set_ftrace_filter
1202 #
1203
1204Again, now we want to append.
1205
1206 # echo sys_nanosleep > /debug/tracing/set_ftrace_filter
1207 # cat /debug/tracing/set_ftrace_filter
1208sys_nanosleep
1209 # echo hrtimer_* >> /debug/tracing/set_ftrace_filter
1210 # cat /debug/tracing/set_ftrace_filter
1211hrtimer_run_queues
1212hrtimer_run_pending
1213hrtimer_init
1214hrtimer_cancel
1215hrtimer_try_to_cancel
1216hrtimer_forward
1217hrtimer_start
1218hrtimer_reprogram
1219hrtimer_force_reprogram
1220hrtimer_get_next_event
1221hrtimer_interrupt
1222sys_nanosleep
1223hrtimer_nanosleep
1224hrtimer_wakeup
1225hrtimer_get_remaining
1226hrtimer_get_res
1227hrtimer_init_sleeper
1228
1229
1230The set_ftrace_notrace prevents those functions from being traced.
1231
1232 # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace
1233
1234Produces:
1235
1236# tracer: ftrace
1237#
1238# TASK-PID CPU# TIMESTAMP FUNCTION
1239# | | | | |
1240 bash-4043 [01] 115.281644: finish_task_switch <-schedule
1241 bash-4043 [01] 115.281645: hrtick_set <-schedule
1242 bash-4043 [01] 115.281645: hrtick_clear <-hrtick_set
1243 bash-4043 [01] 115.281646: wait_for_completion <-__stop_machine_run
1244 bash-4043 [01] 115.281647: wait_for_common <-wait_for_completion
1245 bash-4043 [01] 115.281647: kthread_stop <-stop_machine_run
1246 bash-4043 [01] 115.281648: init_waitqueue_head <-kthread_stop
1247 bash-4043 [01] 115.281648: wake_up_process <-kthread_stop
1248 bash-4043 [01] 115.281649: try_to_wake_up <-wake_up_process
1249
1250We can see that there's no more lock or preempt tracing.
1251
1252ftraced
1253-------
1254
1255As mentioned above, when dynamic ftrace is configured in, a kernel
1256thread wakes up once a second and checks to see if there are mcount
1257calls that need to be converted into nops. If there are not any, then
1258it simply goes back to sleep. But if there are some, it will call
1259kstop_machine to convert the calls to nops.
1260
1261There may be a case that you do not want this added latency.
1262Perhaps you are doing some audio recording and this activity might
1263cause skips in the playback. There is an interface to disable
1264and enable the ftraced kernel thread.
1265
1266 # echo 0 > /debug/tracing/ftraced_enabled
1267
1268This will disable the calling of the kstop_machine to update the
1269mcount calls to nops. Remember that there's a large overhead
1270to calling mcount. Without this kernel thread, that overhead will
1271exist.
1272
1273If there are recorded calls to mcount, any write to the ftraced_enabled
1274file will cause the kstop_machine to run. This means that a
1275user can manually perform the updates when they want to by simply
1276echoing a '0' into the ftraced_enabled file.
1277
1278The updates are also done at the beginning of enabling a tracer
1279that uses ftrace function recording.
1280
1281
1282trace_pipe
1283----------
1284
1285The trace_pipe outputs the same as trace, but the effect on the
1286tracing is different. Every read from trace_pipe is consumed.
1287This means that subsequent reads will be different. The trace
1288is live.
1289
1290 # echo ftrace > /debug/tracing/current_tracer
1291 # cat /debug/tracing/trace_pipe > /tmp/trace.out &
1292[1] 4153
1293 # echo 1 > /debug/tracing/tracing_enabled
1294 # usleep 1
1295 # echo 0 > /debug/tracing/tracing_enabled
1296 # cat /debug/tracing/trace
1297# tracer: ftrace
1298#
1299# TASK-PID CPU# TIMESTAMP FUNCTION
1300# | | | | |
1301
1302 #
1303 # cat /tmp/trace.out
1304 bash-4043 [00] 41.267106: finish_task_switch <-schedule
1305 bash-4043 [00] 41.267106: hrtick_set <-schedule
1306 bash-4043 [00] 41.267107: hrtick_clear <-hrtick_set
1307 bash-4043 [00] 41.267108: wait_for_completion <-__stop_machine_run
1308 bash-4043 [00] 41.267108: wait_for_common <-wait_for_completion
1309 bash-4043 [00] 41.267109: kthread_stop <-stop_machine_run
1310 bash-4043 [00] 41.267109: init_waitqueue_head <-kthread_stop
1311 bash-4043 [00] 41.267110: wake_up_process <-kthread_stop
1312 bash-4043 [00] 41.267110: try_to_wake_up <-wake_up_process
1313 bash-4043 [00] 41.267111: select_task_rq_rt <-try_to_wake_up
1314
1315
1316Note, reading the trace_pipe will block until more input is added.
1317By changing the tracer, trace_pipe will issue an EOF. We needed
1318to set the ftrace tracer _before_ cating the trace_pipe file.
1319
1320
1321trace entries
1322-------------
1323
1324Having too much or not enough data can be troublesome in diagnosing
1325some issue in the kernel. The file trace_entries is used to modify
1326the size of the internal trace buffers. The number listed
1327is the number of entries that can be recorded per CPU. To know
1328the full size, multiply the number of possible CPUS with the
1329number of entries.
1330
1331 # cat /debug/tracing/trace_entries
133265620
1333
1334Note, to modify this, you must have tracing completely disabled. To do that,
1335echo "none" into the current_tracer.
1336
1337 # echo none > /debug/tracing/current_tracer
1338 # echo 100000 > /debug/tracing/trace_entries
1339 # cat /debug/tracing/trace_entries
1340100045
1341
1342
1343Notice that we echoed in 100,000 but the size is 100,045. The entries
1344are held by individual pages. It allocates the number of pages it takes
1345to fulfill the request. If more entries may fit on the last page
1346it will add them.
1347
1348 # echo 1 > /debug/tracing/trace_entries
1349 # cat /debug/tracing/trace_entries
135085
1351
1352This shows us that 85 entries can fit on a single page.
1353
1354The number of pages that will be allocated is a percentage of available
1355memory. Allocating too much will produce an error.
1356
1357 # echo 1000000000000 > /debug/tracing/trace_entries
1358-bash: echo: write error: Cannot allocate memory
1359 # cat /debug/tracing/trace_entries
136085
1361
diff --git a/Documentation/i2c/busses/i2c-i810 b/Documentation/i2c/busses/i2c-i810
deleted file mode 100644
index 778210ee1583..000000000000
--- a/Documentation/i2c/busses/i2c-i810
+++ /dev/null
@@ -1,47 +0,0 @@
1Kernel driver i2c-i810
2
3Supported adapters:
4 * Intel 82810, 82810-DC100, 82810E, and 82815 (GMCH)
5 * Intel 82845G (GMCH)
6
7Authors:
8 Frodo Looijaard <frodol@dds.nl>,
9 Philip Edelbrock <phil@netroedge.com>,
10 Kyösti Mälkki <kmalkki@cc.hut.fi>,
11 Ralph Metzler <rjkm@thp.uni-koeln.de>,
12 Mark D. Studebaker <mdsxyz123@yahoo.com>
13
14Main contact: Mark Studebaker <mdsxyz123@yahoo.com>
15
16Description
17-----------
18
19WARNING: If you have an '810' or '815' motherboard, your standard I2C
20temperature sensors are most likely on the 801's I2C bus. You want the
21i2c-i801 driver for those, not this driver.
22
23Now for the i2c-i810...
24
25The GMCH chip contains two I2C interfaces.
26
27The first interface is used for DDC (Data Display Channel) which is a
28serial channel through the VGA monitor connector to a DDC-compliant
29monitor. This interface is defined by the Video Electronics Standards
30Association (VESA). The standards are available for purchase at
31http://www.vesa.org .
32
33The second interface is a general-purpose I2C bus. It may be connected to a
34TV-out chip such as the BT869 or possibly to a digital flat-panel display.
35
36Features
37--------
38
39Both busses use the i2c-algo-bit driver for 'bit banging'
40and support for specific transactions is provided by i2c-algo-bit.
41
42Issues
43------
44
45If you enable bus testing in i2c-algo-bit (insmod i2c-algo-bit bit_test=1),
46the test may fail; if so, the i2c-i810 driver won't be inserted. However,
47we think this has been fixed.
diff --git a/Documentation/i2c/busses/i2c-prosavage b/Documentation/i2c/busses/i2c-prosavage
deleted file mode 100644
index 703687902511..000000000000
--- a/Documentation/i2c/busses/i2c-prosavage
+++ /dev/null
@@ -1,23 +0,0 @@
1Kernel driver i2c-prosavage
2
3Supported adapters:
4
5 S3/VIA KM266/VT8375 aka ProSavage8
6 S3/VIA KM133/VT8365 aka Savage4
7
8Author: Henk Vergonet <henk@god.dyndns.org>
9
10Description
11-----------
12
13The Savage4 chips contain two I2C interfaces (aka a I2C 'master' or
14'host').
15
16The first interface is used for DDC (Data Display Channel) which is a
17serial channel through the VGA monitor connector to a DDC-compliant
18monitor. This interface is defined by the Video Electronics Standards
19Association (VESA). The standards are available for purchase at
20http://www.vesa.org . The second interface is a general-purpose I2C bus.
21
22Usefull for gaining access to the TV Encoder chips.
23
diff --git a/Documentation/i2c/busses/i2c-savage4 b/Documentation/i2c/busses/i2c-savage4
deleted file mode 100644
index 6ecceab618d3..000000000000
--- a/Documentation/i2c/busses/i2c-savage4
+++ /dev/null
@@ -1,26 +0,0 @@
1Kernel driver i2c-savage4
2
3Supported adapters:
4 * Savage4
5 * Savage2000
6
7Authors:
8 Alexander Wold <awold@bigfoot.com>,
9 Mark D. Studebaker <mdsxyz123@yahoo.com>
10
11Description
12-----------
13
14The Savage4 chips contain two I2C interfaces (aka a I2C 'master'
15or 'host').
16
17The first interface is used for DDC (Data Display Channel) which is a
18serial channel through the VGA monitor connector to a DDC-compliant
19monitor. This interface is defined by the Video Electronics Standards
20Association (VESA). The standards are available for purchase at
21http://www.vesa.org . The DDC bus is not yet supported because its register
22is not directly memory-mapped.
23
24The second interface is a general-purpose I2C bus. This is the only
25interface supported by the driver at the moment.
26
diff --git a/Documentation/i2c/fault-codes b/Documentation/i2c/fault-codes
new file mode 100644
index 000000000000..045765c0b9b5
--- /dev/null
+++ b/Documentation/i2c/fault-codes
@@ -0,0 +1,127 @@
1This is a summary of the most important conventions for use of fault
2codes in the I2C/SMBus stack.
3
4
5A "Fault" is not always an "Error"
6----------------------------------
7Not all fault reports imply errors; "page faults" should be a familiar
8example. Software often retries idempotent operations after transient
9faults. There may be fancier recovery schemes that are appropriate in
10some cases, such as re-initializing (and maybe resetting). After such
11recovery, triggered by a fault report, there is no error.
12
13In a similar way, sometimes a "fault" code just reports one defined
14result for an operation ... it doesn't indicate that anything is wrong
15at all, just that the outcome wasn't on the "golden path".
16
17In short, your I2C driver code may need to know these codes in order
18to respond correctly. Other code may need to rely on YOUR code reporting
19the right fault code, so that it can (in turn) behave correctly.
20
21
22I2C and SMBus fault codes
23-------------------------
24These are returned as negative numbers from most calls, with zero or
25some positive number indicating a non-fault return. The specific
26numbers associated with these symbols differ between architectures,
27though most Linux systems use <asm-generic/errno*.h> numbering.
28
29Note that the descriptions here are not exhaustive. There are other
30codes that may be returned, and other cases where these codes should
31be returned. However, drivers should not return other codes for these
32cases (unless the hardware doesn't provide unique fault reports).
33
34Also, codes returned by adapter probe methods follow rules which are
35specific to their host bus (such as PCI, or the platform bus).
36
37
38EAGAIN
39 Returned by I2C adapters when they lose arbitration in master
40 transmit mode: some other master was transmitting different
41 data at the same time.
42
43 Also returned when trying to invoke an I2C operation in an
44 atomic context, when some task is already using that I2C bus
45 to execute some other operation.
46
47EBADMSG
48 Returned by SMBus logic when an invalid Packet Error Code byte
49 is received. This code is a CRC covering all bytes in the
50 transaction, and is sent before the terminating STOP. This
51 fault is only reported on read transactions; the SMBus slave
52 may have a way to report PEC mismatches on writes from the
53 host. Note that even if PECs are in use, you should not rely
54 on these as the only way to detect incorrect data transfers.
55
56EBUSY
57 Returned by SMBus adapters when the bus was busy for longer
58 than allowed. This usually indicates some device (maybe the
59 SMBus adapter) needs some fault recovery (such as resetting),
60 or that the reset was attempted but failed.
61
62EINVAL
63 This rather vague error means an invalid parameter has been
64 detected before any I/O operation was started. Use a more
65 specific fault code when you can.
66
67 One example would be a driver trying an SMBus Block Write
68 with block size outside the range of 1-32 bytes.
69
70EIO
71 This rather vague error means something went wrong when
72 performing an I/O operation. Use a more specific fault
73 code when you can.
74
75ENODEV
76 Returned by driver probe() methods. This is a bit more
77 specific than ENXIO, implying the problem isn't with the
78 address, but with the device found there. Driver probes
79 may verify the device returns *correct* responses, and
80 return this as appropriate. (The driver core will warn
81 about probe faults other than ENXIO and ENODEV.)
82
83ENOMEM
84 Returned by any component that can't allocate memory when
85 it needs to do so.
86
87ENXIO
88 Returned by I2C adapters to indicate that the address phase
89 of a transfer didn't get an ACK. While it might just mean
90 an I2C device was temporarily not responding, usually it
91 means there's nothing listening at that address.
92
93 Returned by driver probe() methods to indicate that they
94 found no device to bind to. (ENODEV may also be used.)
95
96EOPNOTSUPP
97 Returned by an adapter when asked to perform an operation
98 that it doesn't, or can't, support.
99
100 For example, this would be returned when an adapter that
101 doesn't support SMBus block transfers is asked to execute
102 one. In that case, the driver making that request should
103 have verified that functionality was supported before it
104 made that block transfer request.
105
106 Similarly, if an I2C adapter can't execute all legal I2C
107 messages, it should return this when asked to perform a
108 transaction it can't. (These limitations can't be seen in
109 the adapter's functionality mask, since the assumption is
110 that if an adapter supports I2C it supports all of I2C.)
111
112EPROTO
113 Returned when slave does not conform to the relevant I2C
114 or SMBus (or chip-specific) protocol specifications. One
115 case is when the length of an SMBus block data response
116 (from the SMBus slave) is outside the range 1-32 bytes.
117
118ETIMEDOUT
119 This is returned by drivers when an operation took too much
120 time, and was aborted before it completed.
121
122 SMBus adapters may return it when an operation took more
123 time than allowed by the SMBus specification; for example,
124 when a slave stretches clocks too far. I2C has no such
125 timeouts, but it's normal for I2C adapters to impose some
126 arbitrary limits (much longer than SMBus!) too.
127
diff --git a/Documentation/i2c/smbus-protocol b/Documentation/i2c/smbus-protocol
index 03f08fb491cc..24bfb65da17d 100644
--- a/Documentation/i2c/smbus-protocol
+++ b/Documentation/i2c/smbus-protocol
@@ -42,8 +42,8 @@ Count (8 bits): A data byte containing the length of a block operation.
42[..]: Data sent by I2C device, as opposed to data sent by the host adapter. 42[..]: Data sent by I2C device, as opposed to data sent by the host adapter.
43 43
44 44
45SMBus Quick Command: i2c_smbus_write_quick() 45SMBus Quick Command
46============================================= 46===================
47 47
48This sends a single bit to the device, at the place of the Rd/Wr bit. 48This sends a single bit to the device, at the place of the Rd/Wr bit.
49 49
diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients
index ee75cbace28d..6b61b3a2e90b 100644
--- a/Documentation/i2c/writing-clients
+++ b/Documentation/i2c/writing-clients
@@ -25,14 +25,29 @@ routines, and should be zero-initialized except for fields with data you
25provide. A client structure holds device-specific information like the 25provide. A client structure holds device-specific information like the
26driver model device node, and its I2C address. 26driver model device node, and its I2C address.
27 27
28/* iff driver uses driver model ("new style") binding model: */
29
30static struct i2c_device_id foo_idtable[] = {
31 { "foo", my_id_for_foo },
32 { "bar", my_id_for_bar },
33 { }
34};
35
36MODULE_DEVICE_TABLE(i2c, foo_idtable);
37
28static struct i2c_driver foo_driver = { 38static struct i2c_driver foo_driver = {
29 .driver = { 39 .driver = {
30 .name = "foo", 40 .name = "foo",
31 }, 41 },
32 42
33 /* iff driver uses driver model ("new style") binding model: */ 43 /* iff driver uses driver model ("new style") binding model: */
44 .id_table = foo_idtable,
34 .probe = foo_probe, 45 .probe = foo_probe,
35 .remove = foo_remove, 46 .remove = foo_remove,
47 /* if device autodetection is needed: */
48 .class = I2C_CLASS_SOMETHING,
49 .detect = foo_detect,
50 .address_data = &addr_data,
36 51
37 /* else, driver uses "legacy" binding model: */ 52 /* else, driver uses "legacy" binding model: */
38 .attach_adapter = foo_attach_adapter, 53 .attach_adapter = foo_attach_adapter,
@@ -173,10 +188,9 @@ handle may be used during foo_probe(). If foo_probe() reports success
173(zero not a negative status code) it may save the handle and use it until 188(zero not a negative status code) it may save the handle and use it until
174foo_remove() returns. That binding model is used by most Linux drivers. 189foo_remove() returns. That binding model is used by most Linux drivers.
175 190
176Drivers match devices when i2c_client.driver_name and the driver name are 191The probe function is called when an entry in the id_table name field
177the same; this approach is used in several other busses that don't have 192matches the device's name. It is passed the entry that was matched so
178device typing support in the hardware. The driver and module name should 193the driver knows which one in the table matched.
179match, so hotplug/coldplug mechanisms will modprobe the driver.
180 194
181 195
182Device Creation (Standard driver model) 196Device Creation (Standard driver model)
@@ -207,6 +221,31 @@ in the I2C bus driver. You may want to save the returned i2c_client
207reference for later use. 221reference for later use.
208 222
209 223
224Device Detection (Standard driver model)
225----------------------------------------
226
227Sometimes you do not know in advance which I2C devices are connected to
228a given I2C bus. This is for example the case of hardware monitoring
229devices on a PC's SMBus. In that case, you may want to let your driver
230detect supported devices automatically. This is how the legacy model
231was working, and is now available as an extension to the standard
232driver model (so that we can finally get rid of the legacy model.)
233
234You simply have to define a detect callback which will attempt to
235identify supported devices (returning 0 for supported ones and -ENODEV
236for unsupported ones), a list of addresses to probe, and a device type
237(or class) so that only I2C buses which may have that type of device
238connected (and not otherwise enumerated) will be probed. The i2c
239core will then call you back as needed and will instantiate a device
240for you for every successful detection.
241
242Note that this mechanism is purely optional and not suitable for all
243devices. You need some reliable way to identify the supported devices
244(typically using device-specific, dedicated identification registers),
245otherwise misdetections are likely to occur and things can go wrong
246quickly.
247
248
210Device Deletion (Standard driver model) 249Device Deletion (Standard driver model)
211--------------------------------------- 250---------------------------------------
212 251
@@ -559,7 +598,6 @@ SMBus communication
559 in terms of it. Never use this function directly! 598 in terms of it. Never use this function directly!
560 599
561 600
562 extern s32 i2c_smbus_write_quick(struct i2c_client * client, u8 value);
563 extern s32 i2c_smbus_read_byte(struct i2c_client * client); 601 extern s32 i2c_smbus_read_byte(struct i2c_client * client);
564 extern s32 i2c_smbus_write_byte(struct i2c_client * client, u8 value); 602 extern s32 i2c_smbus_write_byte(struct i2c_client * client, u8 value);
565 extern s32 i2c_smbus_read_byte_data(struct i2c_client * client, u8 command); 603 extern s32 i2c_smbus_read_byte_data(struct i2c_client * client, u8 command);
@@ -568,30 +606,31 @@ SMBus communication
568 extern s32 i2c_smbus_read_word_data(struct i2c_client * client, u8 command); 606 extern s32 i2c_smbus_read_word_data(struct i2c_client * client, u8 command);
569 extern s32 i2c_smbus_write_word_data(struct i2c_client * client, 607 extern s32 i2c_smbus_write_word_data(struct i2c_client * client,
570 u8 command, u16 value); 608 u8 command, u16 value);
609 extern s32 i2c_smbus_read_block_data(struct i2c_client * client,
610 u8 command, u8 *values);
571 extern s32 i2c_smbus_write_block_data(struct i2c_client * client, 611 extern s32 i2c_smbus_write_block_data(struct i2c_client * client,
572 u8 command, u8 length, 612 u8 command, u8 length,
573 u8 *values); 613 u8 *values);
574 extern s32 i2c_smbus_read_i2c_block_data(struct i2c_client * client, 614 extern s32 i2c_smbus_read_i2c_block_data(struct i2c_client * client,
575 u8 command, u8 length, u8 *values); 615 u8 command, u8 length, u8 *values);
576
577These ones were removed in Linux 2.6.10 because they had no users, but could
578be added back later if needed:
579
580 extern s32 i2c_smbus_read_block_data(struct i2c_client * client,
581 u8 command, u8 *values);
582 extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client * client, 616 extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client * client,
583 u8 command, u8 length, 617 u8 command, u8 length,
584 u8 *values); 618 u8 *values);
619
620These ones were removed from i2c-core because they had no users, but could
621be added back later if needed:
622
623 extern s32 i2c_smbus_write_quick(struct i2c_client * client, u8 value);
585 extern s32 i2c_smbus_process_call(struct i2c_client * client, 624 extern s32 i2c_smbus_process_call(struct i2c_client * client,
586 u8 command, u16 value); 625 u8 command, u16 value);
587 extern s32 i2c_smbus_block_process_call(struct i2c_client *client, 626 extern s32 i2c_smbus_block_process_call(struct i2c_client *client,
588 u8 command, u8 length, 627 u8 command, u8 length,
589 u8 *values) 628 u8 *values)
590 629
591All these transactions return -1 on failure. The 'write' transactions 630All these transactions return a negative errno value on failure. The 'write'
592return 0 on success; the 'read' transactions return the read value, except 631transactions return 0 on success; the 'read' transactions return the read
593for read_block, which returns the number of values read. The block buffers 632value, except for block transactions, which return the number of values
594need not be longer than 32 bytes. 633read. The block buffers need not be longer than 32 bytes.
595 634
596You can read the file `smbus-protocol' for more information about the 635You can read the file `smbus-protocol' for more information about the
597actual SMBus protocol. 636actual SMBus protocol.
diff --git a/Documentation/ioctl-number.txt b/Documentation/ioctl-number.txt
index 240ce7a56c40..3bb5f466a90d 100644
--- a/Documentation/ioctl-number.txt
+++ b/Documentation/ioctl-number.txt
@@ -117,6 +117,7 @@ Code Seq# Include File Comments
117 <mailto:natalia@nikhefk.nikhef.nl> 117 <mailto:natalia@nikhefk.nikhef.nl>
118'c' 00-7F linux/comstats.h conflict! 118'c' 00-7F linux/comstats.h conflict!
119'c' 00-7F linux/coda.h conflict! 119'c' 00-7F linux/coda.h conflict!
120'c' 80-9F asm-s390/chsc.h
120'd' 00-FF linux/char/drm/drm/h conflict! 121'd' 00-FF linux/char/drm/drm/h conflict!
121'd' 00-DF linux/video_decoder.h conflict! 122'd' 00-DF linux/video_decoder.h conflict!
122'd' F0-FF linux/digi1.h 123'd' F0-FF linux/digi1.h
diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt
index b8e52c0355d3..9691c7f5166c 100644
--- a/Documentation/kdump/kdump.txt
+++ b/Documentation/kdump/kdump.txt
@@ -109,7 +109,7 @@ There are two possible methods of using Kdump.
1092) Or use the system kernel binary itself as dump-capture kernel and there is 1092) Or use the system kernel binary itself as dump-capture kernel and there is
110 no need to build a separate dump-capture kernel. This is possible 110 no need to build a separate dump-capture kernel. This is possible
111 only with the architectures which support a relocatable kernel. As 111 only with the architectures which support a relocatable kernel. As
112 of today i386 and ia64 architectures support relocatable kernel. 112 of today, i386, x86_64 and ia64 architectures support relocatable kernel.
113 113
114Building a relocatable kernel is advantageous from the point of view that 114Building a relocatable kernel is advantageous from the point of view that
115one does not have to build a second kernel for capturing the dump. But 115one does not have to build a second kernel for capturing the dump. But
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 042588fa12e5..b8451f771460 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -271,6 +271,17 @@ and is between 256 and 4096 characters. It is defined in the file
271 aic79xx= [HW,SCSI] 271 aic79xx= [HW,SCSI]
272 See Documentation/scsi/aic79xx.txt. 272 See Documentation/scsi/aic79xx.txt.
273 273
274 amd_iommu= [HW,X86-64]
275 Pass parameters to the AMD IOMMU driver in the system.
276 Possible values are:
277 isolate - enable device isolation (each device, as far
278 as possible, will get its own protection
279 domain)
280 amd_iommu_size= [HW,X86-64]
281 Define the size of the aperture for the AMD IOMMU
282 driver. Possible values are:
283 '32M', '64M' (default), '128M', '256M', '512M', '1G'
284
274 amijoy.map= [HW,JOY] Amiga joystick support 285 amijoy.map= [HW,JOY] Amiga joystick support
275 Map of devices attached to JOY0DAT and JOY1DAT 286 Map of devices attached to JOY0DAT and JOY1DAT
276 Format: <a>,<b> 287 Format: <a>,<b>
@@ -295,7 +306,7 @@ and is between 256 and 4096 characters. It is defined in the file
295 when initialising the APIC and IO-APIC components. 306 when initialising the APIC and IO-APIC components.
296 307
297 apm= [APM] Advanced Power Management 308 apm= [APM] Advanced Power Management
298 See header of arch/i386/kernel/apm.c. 309 See header of arch/x86/kernel/apm_32.c.
299 310
300 arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards 311 arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards
301 Format: <io>,<irq>,<nodeID> 312 Format: <io>,<irq>,<nodeID>
@@ -560,6 +571,8 @@ and is between 256 and 4096 characters. It is defined in the file
560 571
561 debug_objects [KNL] Enable object debugging 572 debug_objects [KNL] Enable object debugging
562 573
574 debugpat [X86] Enable PAT debugging
575
563 decnet.addr= [HW,NET] 576 decnet.addr= [HW,NET]
564 Format: <area>[,<node>] 577 Format: <area>[,<node>]
565 See also Documentation/networking/decnet.txt. 578 See also Documentation/networking/decnet.txt.
@@ -599,6 +612,29 @@ and is between 256 and 4096 characters. It is defined in the file
599 See drivers/char/README.epca and 612 See drivers/char/README.epca and
600 Documentation/digiepca.txt. 613 Documentation/digiepca.txt.
601 614
615 disable_mtrr_cleanup [X86]
616 enable_mtrr_cleanup [X86]
617 The kernel tries to adjust MTRR layout from continuous
618 to discrete, to make X server driver able to add WB
619 entry later. This parameter enables/disables that.
620
621 mtrr_chunk_size=nn[KMG] [X86]
622 used for mtrr cleanup. It is largest continuous chunk
623 that could hold holes aka. UC entries.
624
625 mtrr_gran_size=nn[KMG] [X86]
626 Used for mtrr cleanup. It is granularity of mtrr block.
627 Default is 1.
628 Large value could prevent small alignment from
629 using up MTRRs.
630
631 mtrr_spare_reg_nr=n [X86]
632 Format: <integer>
633 Range: 0,7 : spare reg number
634 Default : 1
635 Used for mtrr cleanup. It is spare mtrr entries number.
636 Set to 2 or more if your graphical card needs more.
637
602 disable_mtrr_trim [X86, Intel and AMD only] 638 disable_mtrr_trim [X86, Intel and AMD only]
603 By default the kernel will trim any uncacheable 639 By default the kernel will trim any uncacheable
604 memory out of your available memory pool based on 640 memory out of your available memory pool based on
@@ -638,7 +674,7 @@ and is between 256 and 4096 characters. It is defined in the file
638 674
639 elanfreq= [X86-32] 675 elanfreq= [X86-32]
640 See comment before function elanfreq_setup() in 676 See comment before function elanfreq_setup() in
641 arch/i386/kernel/cpu/cpufreq/elanfreq.c. 677 arch/x86/kernel/cpu/cpufreq/elanfreq.c.
642 678
643 elevator= [IOSCHED] 679 elevator= [IOSCHED]
644 Format: {"anticipatory" | "cfq" | "deadline" | "noop"} 680 Format: {"anticipatory" | "cfq" | "deadline" | "noop"}
@@ -722,9 +758,6 @@ and is between 256 and 4096 characters. It is defined in the file
722 hd= [EIDE] (E)IDE hard drive subsystem geometry 758 hd= [EIDE] (E)IDE hard drive subsystem geometry
723 Format: <cyl>,<head>,<sect> 759 Format: <cyl>,<head>,<sect>
724 760
725 hd?= [HW] (E)IDE subsystem
726 hd?lun= See Documentation/ide/ide.txt.
727
728 highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact 761 highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact
729 size of <nn>. This works even on boxes that have no 762 size of <nn>. This works even on boxes that have no
730 highmem otherwise. This also works to reduce highmem 763 highmem otherwise. This also works to reduce highmem
@@ -1208,6 +1241,11 @@ and is between 256 and 4096 characters. It is defined in the file
1208 mtdparts= [MTD] 1241 mtdparts= [MTD]
1209 See drivers/mtd/cmdlinepart.c. 1242 See drivers/mtd/cmdlinepart.c.
1210 1243
1244 mtdset= [ARM]
1245 ARM/S3C2412 JIVE boot control
1246
1247 See arch/arm/mach-s3c2412/mach-jive.c
1248
1211 mtouchusb.raw_coordinates= 1249 mtouchusb.raw_coordinates=
1212 [HW] Make the MicroTouch USB driver use raw coordinates 1250 [HW] Make the MicroTouch USB driver use raw coordinates
1213 ('y', default) or cooked coordinates ('n') 1251 ('y', default) or cooked coordinates ('n')
@@ -1571,6 +1609,10 @@ and is between 256 and 4096 characters. It is defined in the file
1571 Format: { parport<nr> | timid | 0 } 1609 Format: { parport<nr> | timid | 0 }
1572 See also Documentation/parport.txt. 1610 See also Documentation/parport.txt.
1573 1611
1612 pmtmr= [X86] Manual setup of pmtmr I/O Port.
1613 Override pmtimer IOPort with a hex value.
1614 e.g. pmtmr=0x508
1615
1574 pnpacpi= [ACPI] 1616 pnpacpi= [ACPI]
1575 { off } 1617 { off }
1576 1618
@@ -1679,6 +1721,10 @@ and is between 256 and 4096 characters. It is defined in the file
1679 Format: <reboot_mode>[,<reboot_mode2>[,...]] 1721 Format: <reboot_mode>[,<reboot_mode2>[,...]]
1680 See arch/*/kernel/reboot.c or arch/*/kernel/process.c 1722 See arch/*/kernel/reboot.c or arch/*/kernel/process.c
1681 1723
1724 relax_domain_level=
1725 [KNL, SMP] Set scheduler's default relax_domain_level.
1726 See Documentation/cpusets.txt.
1727
1682 reserve= [KNL,BUGS] Force the kernel to ignore some iomem area 1728 reserve= [KNL,BUGS] Force the kernel to ignore some iomem area
1683 1729
1684 reservetop= [X86-32] 1730 reservetop= [X86-32]
@@ -2115,6 +2161,9 @@ and is between 256 and 4096 characters. It is defined in the file
2115 usbhid.mousepoll= 2161 usbhid.mousepoll=
2116 [USBHID] The interval which mice are to be polled at. 2162 [USBHID] The interval which mice are to be polled at.
2117 2163
2164 add_efi_memmap [EFI; x86-32,X86-64] Include EFI memory map in
2165 kernel's map of available physical RAM.
2166
2118 vdso= [X86-32,SH,x86-64] 2167 vdso= [X86-32,SH,x86-64]
2119 vdso=2: enable compat VDSO (default with COMPAT_VDSO) 2168 vdso=2: enable compat VDSO (default with COMPAT_VDSO)
2120 vdso=1: enable VDSO (default) 2169 vdso=1: enable VDSO (default)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 17a6e46fbd43..946b66e1b652 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -81,23 +81,23 @@ inet_peer_minttl - INTEGER
81 Minimum time-to-live of entries. Should be enough to cover fragment 81 Minimum time-to-live of entries. Should be enough to cover fragment
82 time-to-live on the reassembling side. This minimum time-to-live is 82 time-to-live on the reassembling side. This minimum time-to-live is
83 guaranteed if the pool size is less than inet_peer_threshold. 83 guaranteed if the pool size is less than inet_peer_threshold.
84 Measured in jiffies(1). 84 Measured in seconds.
85 85
86inet_peer_maxttl - INTEGER 86inet_peer_maxttl - INTEGER
87 Maximum time-to-live of entries. Unused entries will expire after 87 Maximum time-to-live of entries. Unused entries will expire after
88 this period of time if there is no memory pressure on the pool (i.e. 88 this period of time if there is no memory pressure on the pool (i.e.
89 when the number of entries in the pool is very small). 89 when the number of entries in the pool is very small).
90 Measured in jiffies(1). 90 Measured in seconds.
91 91
92inet_peer_gc_mintime - INTEGER 92inet_peer_gc_mintime - INTEGER
93 Minimum interval between garbage collection passes. This interval is 93 Minimum interval between garbage collection passes. This interval is
94 in effect under high memory pressure on the pool. 94 in effect under high memory pressure on the pool.
95 Measured in jiffies(1). 95 Measured in seconds.
96 96
97inet_peer_gc_maxtime - INTEGER 97inet_peer_gc_maxtime - INTEGER
98 Maximum interval between garbage collection passes. This interval is 98 Maximum interval between garbage collection passes. This interval is
99 in effect under low (or absent) memory pressure on the pool. 99 in effect under low (or absent) memory pressure on the pool.
100 Measured in jiffies(1). 100 Measured in seconds.
101 101
102TCP variables: 102TCP variables:
103 103
@@ -148,9 +148,9 @@ tcp_available_congestion_control - STRING
148 but not loaded. 148 but not loaded.
149 149
150tcp_base_mss - INTEGER 150tcp_base_mss - INTEGER
151 The initial value of search_low to be used by Packetization Layer 151 The initial value of search_low to be used by the packetization layer
152 Path MTU Discovery (MTU probing). If MTU probing is enabled, 152 Path MTU discovery (MTU probing). If MTU probing is enabled,
153 this is the inital MSS used by the connection. 153 this is the initial MSS used by the connection.
154 154
155tcp_congestion_control - STRING 155tcp_congestion_control - STRING
156 Set the congestion control algorithm to be used for new 156 Set the congestion control algorithm to be used for new
@@ -185,10 +185,9 @@ tcp_frto - INTEGER
185 timeouts. It is particularly beneficial in wireless environments 185 timeouts. It is particularly beneficial in wireless environments
186 where packet loss is typically due to random radio interference 186 where packet loss is typically due to random radio interference
187 rather than intermediate router congestion. F-RTO is sender-side 187 rather than intermediate router congestion. F-RTO is sender-side
188 only modification. Therefore it does not require any support from 188 only modification. Therefore it does not require any support from
189 the peer, but in a typical case, however, where wireless link is 189 the peer.
190 the local access link and most of the data flows downlink, the 190
191 faraway servers should have F-RTO enabled to take advantage of it.
192 If set to 1, basic version is enabled. 2 enables SACK enhanced 191 If set to 1, basic version is enabled. 2 enables SACK enhanced
193 F-RTO if flow uses SACK. The basic version can be used also when 192 F-RTO if flow uses SACK. The basic version can be used also when
194 SACK is in use though scenario(s) with it exists where F-RTO 193 SACK is in use though scenario(s) with it exists where F-RTO
@@ -276,7 +275,7 @@ tcp_mem - vector of 3 INTEGERs: min, pressure, max
276 memory. 275 memory.
277 276
278tcp_moderate_rcvbuf - BOOLEAN 277tcp_moderate_rcvbuf - BOOLEAN
279 If set, TCP performs receive buffer autotuning, attempting to 278 If set, TCP performs receive buffer auto-tuning, attempting to
280 automatically size the buffer (no greater than tcp_rmem[2]) to 279 automatically size the buffer (no greater than tcp_rmem[2]) to
281 match the size required by the path for full throughput. Enabled by 280 match the size required by the path for full throughput. Enabled by
282 default. 281 default.
@@ -336,7 +335,7 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
336 pressure. 335 pressure.
337 Default: 8K 336 Default: 8K
338 337
339 default: default size of receive buffer used by TCP sockets. 338 default: initial size of receive buffer used by TCP sockets.
340 This value overrides net.core.rmem_default used by other protocols. 339 This value overrides net.core.rmem_default used by other protocols.
341 Default: 87380 bytes. This value results in window of 65535 with 340 Default: 87380 bytes. This value results in window of 65535 with
342 default setting of tcp_adv_win_scale and tcp_app_win:0 and a bit 341 default setting of tcp_adv_win_scale and tcp_app_win:0 and a bit
@@ -344,8 +343,10 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
344 343
345 max: maximal size of receive buffer allowed for automatically 344 max: maximal size of receive buffer allowed for automatically
346 selected receiver buffers for TCP socket. This value does not override 345 selected receiver buffers for TCP socket. This value does not override
347 net.core.rmem_max, "static" selection via SO_RCVBUF does not use this. 346 net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables
348 Default: 87380*2 bytes. 347 automatic tuning of that socket's receive buffer size, in which
348 case this value is ignored.
349 Default: between 87380B and 4MB, depending on RAM size.
349 350
350tcp_sack - BOOLEAN 351tcp_sack - BOOLEAN
351 Enable select acknowledgments (SACKS). 352 Enable select acknowledgments (SACKS).
@@ -358,7 +359,7 @@ tcp_slow_start_after_idle - BOOLEAN
358 Default: 1 359 Default: 1
359 360
360tcp_stdurg - BOOLEAN 361tcp_stdurg - BOOLEAN
361 Use the Host requirements interpretation of the TCP urg pointer field. 362 Use the Host requirements interpretation of the TCP urgent pointer field.
362 Most hosts use the older BSD interpretation, so if you turn this on 363 Most hosts use the older BSD interpretation, so if you turn this on
363 Linux might not communicate correctly with them. 364 Linux might not communicate correctly with them.
364 Default: FALSE 365 Default: FALSE
@@ -371,12 +372,12 @@ tcp_synack_retries - INTEGER
371tcp_syncookies - BOOLEAN 372tcp_syncookies - BOOLEAN
372 Only valid when the kernel was compiled with CONFIG_SYNCOOKIES 373 Only valid when the kernel was compiled with CONFIG_SYNCOOKIES
373 Send out syncookies when the syn backlog queue of a socket 374 Send out syncookies when the syn backlog queue of a socket
374 overflows. This is to prevent against the common 'syn flood attack' 375 overflows. This is to prevent against the common 'SYN flood attack'
375 Default: FALSE 376 Default: FALSE
376 377
377 Note, that syncookies is fallback facility. 378 Note, that syncookies is fallback facility.
378 It MUST NOT be used to help highly loaded servers to stand 379 It MUST NOT be used to help highly loaded servers to stand
379 against legal connection rate. If you see synflood warnings 380 against legal connection rate. If you see SYN flood warnings
380 in your logs, but investigation shows that they occur 381 in your logs, but investigation shows that they occur
381 because of overload with legal connections, you should tune 382 because of overload with legal connections, you should tune
382 other parameters until this warning disappears. 383 other parameters until this warning disappears.
@@ -386,7 +387,7 @@ tcp_syncookies - BOOLEAN
386 to use TCP extensions, can result in serious degradation 387 to use TCP extensions, can result in serious degradation
387 of some services (f.e. SMTP relaying), visible not by you, 388 of some services (f.e. SMTP relaying), visible not by you,
388 but your clients and relays, contacting you. While you see 389 but your clients and relays, contacting you. While you see
389 synflood warnings in logs not being really flooded, your server 390 SYN flood warnings in logs not being really flooded, your server
390 is seriously misconfigured. 391 is seriously misconfigured.
391 392
392tcp_syn_retries - INTEGER 393tcp_syn_retries - INTEGER
@@ -419,19 +420,21 @@ tcp_window_scaling - BOOLEAN
419 Enable window scaling as defined in RFC1323. 420 Enable window scaling as defined in RFC1323.
420 421
421tcp_wmem - vector of 3 INTEGERs: min, default, max 422tcp_wmem - vector of 3 INTEGERs: min, default, max
422 min: Amount of memory reserved for send buffers for TCP socket. 423 min: Amount of memory reserved for send buffers for TCP sockets.
423 Each TCP socket has rights to use it due to fact of its birth. 424 Each TCP socket has rights to use it due to fact of its birth.
424 Default: 4K 425 Default: 4K
425 426
426 default: Amount of memory allowed for send buffers for TCP socket 427 default: initial size of send buffer used by TCP sockets. This
427 by default. This value overrides net.core.wmem_default used 428 value overrides net.core.wmem_default used by other protocols.
428 by other protocols, it is usually lower than net.core.wmem_default. 429 It is usually lower than net.core.wmem_default.
429 Default: 16K 430 Default: 16K
430 431
431 max: Maximal amount of memory allowed for automatically selected 432 max: Maximal amount of memory allowed for automatically tuned
432 send buffers for TCP socket. This value does not override 433 send buffers for TCP sockets. This value does not override
433 net.core.wmem_max, "static" selection via SO_SNDBUF does not use this. 434 net.core.wmem_max. Calling setsockopt() with SO_SNDBUF disables
434 Default: 128K 435 automatic tuning of that socket's send buffer size, in which case
436 this value is ignored.
437 Default: between 64K and 4MB, depending on RAM size.
435 438
436tcp_workaround_signed_windows - BOOLEAN 439tcp_workaround_signed_windows - BOOLEAN
437 If set, assume no receipt of a window scaling option means the 440 If set, assume no receipt of a window scaling option means the
@@ -794,10 +797,6 @@ tag - INTEGER
794 Allows you to write a number, which can be used as required. 797 Allows you to write a number, which can be used as required.
795 Default value is 0. 798 Default value is 0.
796 799
797(1) Jiffie: internal timeunit for the kernel. On the i386 1/100s, on the
798Alpha 1/1024s. See the HZ define in /usr/include/asm/param.h for the exact
799value on your system.
800
801Alexey Kuznetsov. 800Alexey Kuznetsov.
802kuznet@ms2.inr.ac.ru 801kuznet@ms2.inr.ac.ru
803 802
@@ -1064,24 +1063,193 @@ bridge-nf-filter-pppoe-tagged - BOOLEAN
1064 Default: 1 1063 Default: 1
1065 1064
1066 1065
1067UNDOCUMENTED: 1066proc/sys/net/sctp/* Variables:
1067
1068addip_enable - BOOLEAN
1069 Enable or disable extension of Dynamic Address Reconfiguration
1070 (ADD-IP) functionality specified in RFC5061. This extension provides
1071 the ability to dynamically add and remove new addresses for the SCTP
1072 associations.
1073
1074 1: Enable extension.
1075
1076 0: Disable extension.
1077
1078 Default: 0
1079
1080addip_noauth_enable - BOOLEAN
1081 Dynamic Address Reconfiguration (ADD-IP) requires the use of
1082 authentication to protect the operations of adding or removing new
1083 addresses. This requirement is mandated so that unauthorized hosts
1084 would not be able to hijack associations. However, older
1085 implementations may not have implemented this requirement while
1086 allowing the ADD-IP extension. For reasons of interoperability,
1087 we provide this variable to control the enforcement of the
1088 authentication requirement.
1089
1090 1: Allow ADD-IP extension to be used without authentication. This
1091 should only be set in a closed environment for interoperability
1092 with older implementations.
1093
1094 0: Enforce the authentication requirement
1095
1096 Default: 0
1097
1098auth_enable - BOOLEAN
1099 Enable or disable Authenticated Chunks extension. This extension
1100 provides the ability to send and receive authenticated chunks and is
1101 required for secure operation of Dynamic Address Reconfiguration
1102 (ADD-IP) extension.
1103
1104 1: Enable this extension.
1105 0: Disable this extension.
1106
1107 Default: 0
1108
1109prsctp_enable - BOOLEAN
1110 Enable or disable the Partial Reliability extension (RFC3758) which
1111 is used to notify peers that a given DATA should no longer be expected.
1112
1113 1: Enable extension
1114 0: Disable
1115
1116 Default: 1
1117
1118max_burst - INTEGER
1119 The limit of the number of new packets that can be initially sent. It
1120 controls how bursty the generated traffic can be.
1121
1122 Default: 4
1123
1124association_max_retrans - INTEGER
1125 Set the maximum number for retransmissions that an association can
1126 attempt before deciding that the remote end is unreachable. If this value
1127 is exceeded, the association is terminated.
1128
1129 Default: 10
1130
1131max_init_retransmits - INTEGER
1132 The maximum number of retransmissions of INIT and COOKIE-ECHO chunks
1133 that an association will attempt before declaring the destination
1134 unreachable and terminating.
1135
1136 Default: 8
1137
1138path_max_retrans - INTEGER
1139 The maximum number of retransmissions that will be attempted on a given
1140 path. Once this threshold is exceeded, the path is considered
1141 unreachable, and new traffic will use a different path when the
1142 association is multihomed.
1143
1144 Default: 5
1145
1146rto_initial - INTEGER
1147 The initial round trip timeout value in milliseconds that will be used
1148 in calculating round trip times. This is the initial time interval
1149 for retransmissions.
1068 1150
1069dev_weight FIXME 1151 Default: 3000
1070discovery_slots FIXME 1152
1071discovery_timeout FIXME 1153rto_max - INTEGER
1072fast_poll_increase FIXME 1154 The maximum value (in milliseconds) of the round trip timeout. This
1073ip6_queue_maxlen FIXME 1155 is the largest time interval that can elapse between retransmissions.
1074lap_keepalive_time FIXME 1156
1075lo_cong FIXME 1157 Default: 60000
1076max_baud_rate FIXME 1158
1077max_dgram_qlen FIXME 1159rto_min - INTEGER
1078max_noreply_time FIXME 1160 The minimum value (in milliseconds) of the round trip timeout. This
1079max_tx_data_size FIXME 1161 is the smallest time interval that can elapse between retransmissions.
1080max_tx_window FIXME 1162
1081min_tx_turn_time FIXME 1163 Default: 1000
1082mod_cong FIXME 1164
1083no_cong FIXME 1165hb_interval - INTEGER
1084no_cong_thresh FIXME 1166 The interval (in milliseconds) between HEARTBEAT chunks. These chunks
1085slot_timeout FIXME 1167 are sent at the specified interval on idle paths to probe the state of
1086warn_noreply_time FIXME 1168 a given path between 2 associations.
1169
1170 Default: 30000
1171
1172sack_timeout - INTEGER
1173 The amount of time (in milliseconds) that the implementation will wait
1174 to send a SACK.
1175
1176 Default: 200
1177
1178valid_cookie_life - INTEGER
1179 The default lifetime of the SCTP cookie (in milliseconds). The cookie
1180 is used during association establishment.
1181
1182 Default: 60000
1183
1184cookie_preserve_enable - BOOLEAN
1185 Enable or disable the ability to extend the lifetime of the SCTP cookie
1186 that is used during the establishment phase of SCTP association
1187
1188 1: Enable cookie lifetime extension.
1189 0: Disable
1190
1191 Default: 1
1192
1193rcvbuf_policy - INTEGER
1194 Determines if the receive buffer is attributed to the socket or to
1195 association. SCTP supports the capability to create multiple
1196 associations on a single socket. When using this capability, it is
1197 possible that a single stalled association that's buffering a lot
1198 of data may block other associations from delivering their data by
1199 consuming all of the receive buffer space. To work around this,
1200 the rcvbuf_policy could be set to attribute the receiver buffer space
1201 to each association instead of the socket. This prevents the described
1202 blocking.
1203
1204 1: rcvbuf space is per association
1205 0: rcvbuf space is per socket
1206
1207 Default: 0
1208
1209sndbuf_policy - INTEGER
1210 Similar to rcvbuf_policy above, this applies to send buffer space.
1211
1212 1: Send buffer is tracked per association
1213 0: Send buffer is tracked per socket.
1214
1215 Default: 0
1216
1217sctp_mem - vector of 3 INTEGERs: min, pressure, max
1218 Number of pages allowed for queueing by all SCTP sockets.
1219
1220 min: Below this number of pages SCTP is not bothered about its
1221 memory appetite. When amount of memory allocated by SCTP exceeds
1222 this number, SCTP starts to moderate memory usage.
1223
1224 pressure: This value was introduced to follow format of tcp_mem.
1225
1226 max: Number of pages allowed for queueing by all SCTP sockets.
1227
1228 Default is calculated at boot time from amount of available memory.
1229
1230sctp_rmem - vector of 3 INTEGERs: min, default, max
1231 See tcp_rmem for a description.
1232
1233sctp_wmem - vector of 3 INTEGERs: min, default, max
1234 See tcp_wmem for a description.
1235
1236UNDOCUMENTED:
1087 1237
1238/proc/sys/net/core/*
1239 dev_weight FIXME
1240
1241/proc/sys/net/unix/*
1242 max_dgram_qlen FIXME
1243
1244/proc/sys/net/irda/*
1245 fast_poll_increase FIXME
1246 warn_noreply_time FIXME
1247 discovery_slots FIXME
1248 slot_timeout FIXME
1249 max_baud_rate FIXME
1250 discovery_timeout FIXME
1251 lap_keepalive_time FIXME
1252 max_noreply_time FIXME
1253 max_tx_data_size FIXME
1254 max_tx_window FIXME
1255 min_tx_turn_time FIXME
diff --git a/Documentation/networking/s2io.txt b/Documentation/networking/s2io.txt
index 4bde53e85f3f..1e28e2ddb90a 100644
--- a/Documentation/networking/s2io.txt
+++ b/Documentation/networking/s2io.txt
@@ -83,9 +83,9 @@ Valid range: Limited by memory on system
83Default: 30 83Default: 30
84 84
85e. intr_type 85e. intr_type
86Specifies interrupt type. Possible values 1(INTA), 2(MSI), 3(MSI-X) 86Specifies interrupt type. Possible values 0(INTA), 2(MSI-X)
87Valid range: 1-3 87Valid values: 0, 2
88Default: 1 88Default: 2
89 89
905. Performance suggestions 905. Performance suggestions
91General: 91General:
diff --git a/Documentation/nmi_watchdog.txt b/Documentation/nmi_watchdog.txt
index 757c729ee42e..90aa4531cb67 100644
--- a/Documentation/nmi_watchdog.txt
+++ b/Documentation/nmi_watchdog.txt
@@ -10,7 +10,7 @@ us to generate 'watchdog NMI interrupts'. (NMI: Non Maskable Interrupt
10which get executed even if the system is otherwise locked up hard). 10which get executed even if the system is otherwise locked up hard).
11This can be used to debug hard kernel lockups. By executing periodic 11This can be used to debug hard kernel lockups. By executing periodic
12NMI interrupts, the kernel can monitor whether any CPU has locked up, 12NMI interrupts, the kernel can monitor whether any CPU has locked up,
13and print out debugging messages if so. 13and print out debugging messages if so.
14 14
15In order to use the NMI watchdog, you need to have APIC support in your 15In order to use the NMI watchdog, you need to have APIC support in your
16kernel. For SMP kernels, APIC support gets compiled in automatically. For 16kernel. For SMP kernels, APIC support gets compiled in automatically. For
@@ -22,8 +22,7 @@ CONFIG_X86_UP_IOAPIC is for uniprocessor with an IO-APIC. [Note: certain
22kernel debugging options, such as Kernel Stack Meter or Kernel Tracer, 22kernel debugging options, such as Kernel Stack Meter or Kernel Tracer,
23may implicitly disable the NMI watchdog.] 23may implicitly disable the NMI watchdog.]
24 24
25For x86-64, the needed APIC is always compiled in, and the NMI watchdog is 25For x86-64, the needed APIC is always compiled in.
26always enabled with I/O-APIC mode (nmi_watchdog=1).
27 26
28Using local APIC (nmi_watchdog=2) needs the first performance register, so 27Using local APIC (nmi_watchdog=2) needs the first performance register, so
29you can't use it for other purposes (such as high precision performance 28you can't use it for other purposes (such as high precision performance
@@ -63,16 +62,15 @@ when the system is idle), but if your system locks up on anything but the
63"hlt", then you are out of luck -- the event will not happen at all and the 62"hlt", then you are out of luck -- the event will not happen at all and the
64watchdog won't trigger. This is a shortcoming of the local APIC watchdog 63watchdog won't trigger. This is a shortcoming of the local APIC watchdog
65-- unfortunately there is no "clock ticks" event that would work all the 64-- unfortunately there is no "clock ticks" event that would work all the
66time. The I/O APIC watchdog is driven externally and has no such shortcoming. 65time. The I/O APIC watchdog is driven externally and has no such shortcoming.
67But its NMI frequency is much higher, resulting in a more significant hit 66But its NMI frequency is much higher, resulting in a more significant hit
68to the overall system performance. 67to the overall system performance.
69 68
70NOTE: starting with 2.4.2-ac18 the NMI-oopser is disabled by default, 69On x86 nmi_watchdog is disabled by default so you have to enable it with
71you have to enable it with a boot time parameter. Prior to 2.4.2-ac18 70a boot time parameter.
72the NMI-oopser is enabled unconditionally on x86 SMP boxes.
73 71
74On x86-64 the NMI oopser is on by default. On 64bit Intel CPUs 72NOTE: In kernels prior to 2.4.2-ac18 the NMI-oopser is enabled unconditionally
75it uses IO-APIC by default and on AMD it uses local APIC. 73on x86 SMP boxes.
76 74
77[ feel free to send bug reports, suggestions and patches to 75[ feel free to send bug reports, suggestions and patches to
78 Ingo Molnar <mingo@redhat.com> or the Linux SMP mailing 76 Ingo Molnar <mingo@redhat.com> or the Linux SMP mailing
diff --git a/Documentation/scheduler/sched-domains.txt b/Documentation/scheduler/sched-domains.txt
index a9e990ab980f..373ceacc367e 100644
--- a/Documentation/scheduler/sched-domains.txt
+++ b/Documentation/scheduler/sched-domains.txt
@@ -61,10 +61,7 @@ builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your
61arch_init_sched_domains function. This function will attach domains to all 61arch_init_sched_domains function. This function will attach domains to all
62CPUs using cpu_attach_domain. 62CPUs using cpu_attach_domain.
63 63
64Implementors should change the line 64The sched-domains debugging infrastructure can be enabled by enabling
65#undef SCHED_DOMAIN_DEBUG 65CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains
66to
67#define SCHED_DOMAIN_DEBUG
68in kernel/sched.c as this enables an error checking parse of the sched domains
69which should catch most possible errors (described above). It also prints out 66which should catch most possible errors (described above). It also prints out
70the domain structure in a visual format. 67the domain structure in a visual format.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 14f901f639ee..3ef339f491e0 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -51,9 +51,9 @@ needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s =
510.00015s. So this group can be scheduled with a period of 0.005s and a run time 510.00015s. So this group can be scheduled with a period of 0.005s and a run time
52of 0.00015s. 52of 0.00015s.
53 53
54The remaining CPU time will be used for user input and other tass. Because 54The remaining CPU time will be used for user input and other tasks. Because
55realtime tasks have explicitly allocated the CPU time they need to perform 55realtime tasks have explicitly allocated the CPU time they need to perform
56their tasks, buffer underruns in the graphocs or audio can be eliminated. 56their tasks, buffer underruns in the graphics or audio can be eliminated.
57 57
58NOTE: the above example is not fully implemented as of yet (2.6.25). We still 58NOTE: the above example is not fully implemented as of yet (2.6.25). We still
59lack an EDF scheduler to make non-uniform periods usable. 59lack an EDF scheduler to make non-uniform periods usable.
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
index 0bbee38acd26..72aff61e7315 100644
--- a/Documentation/sound/alsa/ALSA-Configuration.txt
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -753,8 +753,11 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
753 753
754 [Multiple options for each card instance] 754 [Multiple options for each card instance]
755 model - force the model name 755 model - force the model name
756 position_fix - Fix DMA pointer (0 = auto, 1 = none, 2 = POSBUF, 3 = FIFO size) 756 position_fix - Fix DMA pointer (0 = auto, 1 = use LPIB, 2 = POSBUF)
757 probe_mask - Bitmask to probe codecs (default = -1, meaning all slots) 757 probe_mask - Bitmask to probe codecs (default = -1, meaning all slots)
758 bdl_pos_adj - Specifies the DMA IRQ timing delay in samples.
759 Passing -1 will make the driver choose the appropriate
760 value based on the controller chip.
758 761
759 [Single (global) options] 762 [Single (global) options]
760 single_cmd - Use single immediate commands to communicate with 763 single_cmd - Use single immediate commands to communicate with
@@ -845,7 +848,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
845 ALC269 848 ALC269
846 basic Basic preset 849 basic Basic preset
847 850
848 ALC662 851 ALC662/663
849 3stack-dig 3-stack (2-channel) with SPDIF 852 3stack-dig 3-stack (2-channel) with SPDIF
850 3stack-6ch 3-stack (6-channel) 853 3stack-6ch 3-stack (6-channel)
851 3stack-6ch-dig 3-stack (6-channel) with SPDIF 854 3stack-6ch-dig 3-stack (6-channel) with SPDIF
@@ -853,6 +856,10 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
853 lenovo-101e Lenovo laptop 856 lenovo-101e Lenovo laptop
854 eeepc-p701 ASUS Eeepc P701 857 eeepc-p701 ASUS Eeepc P701
855 eeepc-ep20 ASUS Eeepc EP20 858 eeepc-ep20 ASUS Eeepc EP20
859 m51va ASUS M51VA
860 g71v ASUS G71V
861 h13 ASUS H13
862 g50v ASUS G50V
856 auto auto-config reading BIOS (default) 863 auto auto-config reading BIOS (default)
857 864
858 ALC882/885 865 ALC882/885
@@ -1091,7 +1098,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
1091 This occurs when the access to non-existing or non-working codec slot 1098 This occurs when the access to non-existing or non-working codec slot
1092 (likely a modem one) causes a stall of the communication via HD-audio 1099 (likely a modem one) causes a stall of the communication via HD-audio
1093 bus. You can see which codec slots are probed by enabling 1100 bus. You can see which codec slots are probed by enabling
1094 CONFIG_SND_DEBUG_DETECT, or simply from the file name of the codec 1101 CONFIG_SND_DEBUG_VERBOSE, or simply from the file name of the codec
1095 proc files. Then limit the slots to probe by probe_mask option. 1102 proc files. Then limit the slots to probe by probe_mask option.
1096 For example, probe_mask=1 means to probe only the first slot, and 1103 For example, probe_mask=1 means to probe only the first slot, and
1097 probe_mask=4 means only the third slot. 1104 probe_mask=4 means only the third slot.
@@ -2267,6 +2274,10 @@ case above again, the first two slots are already reserved. If any
2267other driver (e.g. snd-usb-audio) is loaded before snd-interwave or 2274other driver (e.g. snd-usb-audio) is loaded before snd-interwave or
2268snd-ens1371, it will be assigned to the third or later slot. 2275snd-ens1371, it will be assigned to the third or later slot.
2269 2276
2277When a module name is given with '!', the slot will be given to any
2278module except the one with that name. For example, "slots=!snd-pcsp" will
2279reserve the first slot for any module but snd-pcsp.
2280
2270 2281
2271ALSA PCM devices to OSS devices mapping 2282ALSA PCM devices to OSS devices mapping
2272======================================= 2283=======================================
diff --git a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
index b03df4d4795c..e13c4e67029f 100644
--- a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
+++ b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
@@ -6127,8 +6127,8 @@ struct _snd_pcm_runtime {
6127 6127
6128 <para> 6128 <para>
6129 <function>snd_printdd()</function> is compiled in only when 6129 <function>snd_printdd()</function> is compiled in only when
6130 <constant>CONFIG_SND_DEBUG_DETECT</constant> is set. Please note 6130 <constant>CONFIG_SND_DEBUG_VERBOSE</constant> is set. Please note
6131 that <constant>DEBUG_DETECT</constant> is not set as default 6131 that <constant>CONFIG_SND_DEBUG_VERBOSE</constant> is not set as default
6132 even if you configure the alsa-driver with 6132 even if you configure the alsa-driver with
6133 <option>--with-debug=full</option> option. You need to give 6133 <option>--with-debug=full</option> option. You need to give
6134 explicitly <option>--with-debug=detect</option> option instead. 6134 explicitly <option>--with-debug=detect</option> option instead.
diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/tracers/mmiotrace.txt
new file mode 100644
index 000000000000..a4afb560a45b
--- /dev/null
+++ b/Documentation/tracers/mmiotrace.txt
@@ -0,0 +1,164 @@
1 In-kernel memory-mapped I/O tracing
2
3
4Home page and links to optional user space tools:
5
6 http://nouveau.freedesktop.org/wiki/MmioTrace
7
8MMIO tracing was originally developed by Intel around 2003 for their Fault
9Injection Test Harness. In Dec 2006 - Jan 2007, using the code from Intel,
10Jeff Muizelaar created a tool for tracing MMIO accesses with the Nouveau
11project in mind. Since then many people have contributed.
12
13Mmiotrace was built for reverse engineering any memory-mapped IO device with
14the Nouveau project as the first real user. Only x86 and x86_64 architectures
15are supported.
16
17Out-of-tree mmiotrace was originally modified for mainline inclusion and
18ftrace framework by Pekka Paalanen <pq@iki.fi>.
19
20
21Preparation
22-----------
23
24Mmiotrace feature is compiled in by the CONFIG_MMIOTRACE option. Tracing is
25disabled by default, so it is safe to have this set to yes. SMP systems are
26supported, but tracing is unreliable and may miss events if more than one CPU
27is on-line, therefore mmiotrace takes all but one CPU off-line during run-time
28activation. You can re-enable CPUs by hand, but you have been warned, there
29is no way to automatically detect if you are losing events due to CPUs racing.
30
31
32Usage Quick Reference
33---------------------
34
35$ mount -t debugfs debugfs /debug
36$ echo mmiotrace > /debug/tracing/current_tracer
37$ cat /debug/tracing/trace_pipe > mydump.txt &
38Start X or whatever.
39$ echo "X is up" > /debug/tracing/marker
40$ echo none > /debug/tracing/current_tracer
41Check for lost events.
42
43
44Usage
45-----
46
47Make sure debugfs is mounted to /debug. If not, mount it (requires root privileges):
48$ mount -t debugfs debugfs /debug
49
50Check that the driver you are about to trace is not loaded.
51
52Activate mmiotrace (requires root privileges):
53$ echo mmiotrace > /debug/tracing/current_tracer
54
55Start storing the trace:
56$ cat /debug/tracing/trace_pipe > mydump.txt &
57The 'cat' process should stay running (sleeping) in the background.
58
59Load the driver you want to trace and use it. Mmiotrace will only catch MMIO
60accesses to areas that are ioremapped while mmiotrace is active.
61
62[Unimplemented feature:]
63During tracing you can place comments (markers) into the trace by
64$ echo "X is up" > /debug/tracing/marker
65This makes it easier to see which part of the (huge) trace corresponds to
66which action. It is recommended to place descriptive markers about what you
67do.
68
69Shut down mmiotrace (requires root privileges):
70$ echo none > /debug/tracing/current_tracer
71The 'cat' process exits. If it does not, kill it by issuing 'fg' command and
72pressing ctrl+c.
73
74Check that mmiotrace did not lose events due to a buffer filling up. Either
75$ grep -i lost mydump.txt
76which tells you exactly how many events were lost, or use
77$ dmesg
78to view your kernel log and look for "mmiotrace has lost events" warning. If
79events were lost, the trace is incomplete. You should enlarge the buffers and
80try again. Buffers are enlarged by first seeing how large the current buffers
81are:
82$ cat /debug/tracing/trace_entries
83gives you a number. Approximately double this number and write it back, for
84instance:
85$ echo 128000 > /debug/tracing/trace_entries
86Then start again from the top.
87
88If you are doing a trace for a driver project, e.g. Nouveau, you should also
89do the following before sending your results:
90$ lspci -vvv > lspci.txt
91$ dmesg > dmesg.txt
92$ tar zcf pciid-nick-mmiotrace.tar.gz mydump.txt lspci.txt dmesg.txt
93and then send the .tar.gz file. The trace compresses considerably. Replace
94"pciid" and "nick" with the PCI ID or model name of your piece of hardware
95under investigation and your nick name.
96
97
98How Mmiotrace Works
99-------------------
100
101Access to hardware IO-memory is gained by mapping addresses from PCI bus by
102calling one of the ioremap_*() functions. Mmiotrace is hooked into the
103__ioremap() function and gets called whenever a mapping is created. Mapping is
104an event that is recorded into the trace log. Note that ISA range mappings
105are not caught, since the mapping always exists and is returned directly.
106
107MMIO accesses are recorded via page faults. Just before __ioremap() returns,
108the mapped pages are marked as not present. Any access to the pages causes a
109fault. The page fault handler calls mmiotrace to handle the fault. Mmiotrace
110marks the page present, sets TF flag to achieve single stepping and exits the
111fault handler. The instruction that faulted is executed and debug trap is
112entered. Here mmiotrace again marks the page as not present. The instruction
113is decoded to get the type of operation (read/write), data width and the value
114read or written. These are stored to the trace log.
115
116Setting the page present in the page fault handler has a race condition on SMP
117machines. During the single stepping other CPUs may run freely on that page
118and events can be missed without a notice. Re-enabling other CPUs during
119tracing is discouraged.
120
121
122Trace Log Format
123----------------
124
125The raw log is text and easily filtered with e.g. grep and awk. One record is
126one line in the log. A record starts with a keyword, followed by keyword
127dependant arguments. Arguments are separated by a space, or continue until the
128end of line. The format for version 20070824 is as follows:
129
130Explanation Keyword Space separated arguments
131---------------------------------------------------------------------------
132
133read event R width, timestamp, map id, physical, value, PC, PID
134write event W width, timestamp, map id, physical, value, PC, PID
135ioremap event MAP timestamp, map id, physical, virtual, length, PC, PID
136iounmap event UNMAP timestamp, map id, PC, PID
137marker MARK timestamp, text
138version VERSION the string "20070824"
139info for reader LSPCI one line from lspci -v
140PCI address map PCIDEV space separated /proc/bus/pci/devices data
141unk. opcode UNKNOWN timestamp, map id, physical, data, PC, PID
142
143Timestamp is in seconds with decimals. Physical is a PCI bus address, virtual
144is a kernel virtual address. Width is the data width in bytes and value is the
145data value. Map id is an arbitrary id number identifying the mapping that was
146used in an operation. PC is the program counter and PID is process id. PC is
147zero if it is not recorded. PID is always zero as tracing MMIO accesses
148originating in user space memory is not yet supported.
149
150For instance, the following awk filter will pass all 32-bit writes that target
151physical addresses in the range [0xfb73ce40, 0xfb800000[
152
153$ awk '/W 4 / { adr=strtonum($5); if (adr >= 0xfb73ce40 &&
154adr < 0xfb800000) print; }'
155
156
157Tools for Developers
158--------------------
159
160The user space tools include utilities for:
161- replacing numeric addresses and values with hardware register names
162- replaying MMIO logs, i.e., re-executing the recorded writes
163
164
diff --git a/Documentation/video4linux/CARDLIST.au0828 b/Documentation/video4linux/CARDLIST.au0828
index aaae360312e4..86d1c8e7b18f 100644
--- a/Documentation/video4linux/CARDLIST.au0828
+++ b/Documentation/video4linux/CARDLIST.au0828
@@ -1,4 +1,4 @@
1 0 -> Unknown board (au0828) 1 0 -> Unknown board (au0828)
2 1 -> Hauppauge HVR950Q (au0828) [2040:7200] 2 1 -> Hauppauge HVR950Q (au0828) [2040:7200,2040:7210,2040:7217,2040:721b,2040:721f,2040:7280,0fd9:0008]
3 2 -> Hauppauge HVR850 (au0828) [2040:7240] 3 2 -> Hauppauge HVR850 (au0828) [2040:7240]
4 3 -> DViCO FusionHDTV USB (au0828) [0fe9:d620] 4 3 -> DViCO FusionHDTV USB (au0828) [0fe9:d620]
diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c
index e4230ed16ee7..df3227605d59 100644
--- a/Documentation/vm/slabinfo.c
+++ b/Documentation/vm/slabinfo.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Slabinfo: Tool to get reports about slabs 2 * Slabinfo: Tool to get reports about slabs
3 * 3 *
4 * (C) 2007 sgi, Christoph Lameter <clameter@sgi.com> 4 * (C) 2007 sgi, Christoph Lameter
5 * 5 *
6 * Compile by: 6 * Compile by:
7 * 7 *
@@ -99,7 +99,7 @@ void fatal(const char *x, ...)
99 99
100void usage(void) 100void usage(void)
101{ 101{
102 printf("slabinfo 5/7/2007. (c) 2007 sgi. clameter@sgi.com\n\n" 102 printf("slabinfo 5/7/2007. (c) 2007 sgi.\n\n"
103 "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n" 103 "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n"
104 "-a|--aliases Show aliases\n" 104 "-a|--aliases Show aliases\n"
105 "-A|--activity Most active slabs first\n" 105 "-A|--activity Most active slabs first\n"
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
index 7c13f22a0c9e..bb1f5c6e28b3 100644
--- a/Documentation/vm/slub.txt
+++ b/Documentation/vm/slub.txt
@@ -266,4 +266,4 @@ of other objects.
266 266
267 slub_debug=FZ,dentry 267 slub_debug=FZ,dentry
268 268
269Christoph Lameter, <clameter@sgi.com>, May 30, 2007 269Christoph Lameter, May 30, 2007
diff --git a/Documentation/i386/IO-APIC.txt b/Documentation/x86/i386/IO-APIC.txt
index 30b4c714fbe1..30b4c714fbe1 100644
--- a/Documentation/i386/IO-APIC.txt
+++ b/Documentation/x86/i386/IO-APIC.txt
diff --git a/Documentation/i386/boot.txt b/Documentation/x86/i386/boot.txt
index 95ad15c3b01f..147bfe511cdd 100644
--- a/Documentation/i386/boot.txt
+++ b/Documentation/x86/i386/boot.txt
@@ -1,17 +1,14 @@
1 THE LINUX/I386 BOOT PROTOCOL 1 THE LINUX/x86 BOOT PROTOCOL
2 ---------------------------- 2 ---------------------------
3 3
4 H. Peter Anvin <hpa@zytor.com> 4On the x86 platform, the Linux kernel uses a rather complicated boot
5 Last update 2007-05-23
6
7On the i386 platform, the Linux kernel uses a rather complicated boot
8convention. This has evolved partially due to historical aspects, as 5convention. This has evolved partially due to historical aspects, as
9well as the desire in the early days to have the kernel itself be a 6well as the desire in the early days to have the kernel itself be a
10bootable image, the complicated PC memory model and due to changed 7bootable image, the complicated PC memory model and due to changed
11expectations in the PC industry caused by the effective demise of 8expectations in the PC industry caused by the effective demise of
12real-mode DOS as a mainstream operating system. 9real-mode DOS as a mainstream operating system.
13 10
14Currently, the following versions of the Linux/i386 boot protocol exist. 11Currently, the following versions of the Linux/x86 boot protocol exist.
15 12
16Old kernels: zImage/Image support only. Some very early kernels 13Old kernels: zImage/Image support only. Some very early kernels
17 may not even support a command line. 14 may not even support a command line.
@@ -372,10 +369,17 @@ Protocol: 2.00+
372 - If 0, the protected-mode code is loaded at 0x10000. 369 - If 0, the protected-mode code is loaded at 0x10000.
373 - If 1, the protected-mode code is loaded at 0x100000. 370 - If 1, the protected-mode code is loaded at 0x100000.
374 371
372 Bit 5 (write): QUIET_FLAG
373 - If 0, print early messages.
374 - If 1, suppress early messages.
375 This requests to the kernel (decompressor and early
376 kernel) to not write early messages that require
377 accessing the display hardware directly.
378
375 Bit 6 (write): KEEP_SEGMENTS 379 Bit 6 (write): KEEP_SEGMENTS
376 Protocol: 2.07+ 380 Protocol: 2.07+
377 - if 0, reload the segment registers in the 32bit entry point. 381 - If 0, reload the segment registers in the 32bit entry point.
378 - if 1, do not reload the segment registers in the 32bit entry point. 382 - If 1, do not reload the segment registers in the 32bit entry point.
379 Assume that %cs %ds %ss %es are all set to flat segments with 383 Assume that %cs %ds %ss %es are all set to flat segments with
380 a base of 0 (or the equivalent for their environment). 384 a base of 0 (or the equivalent for their environment).
381 385
@@ -504,7 +508,7 @@ Protocol: 2.06+
504 maximum size was 255. 508 maximum size was 255.
505 509
506Field name: hardware_subarch 510Field name: hardware_subarch
507Type: write 511Type: write (optional, defaults to x86/PC)
508Offset/size: 0x23c/4 512Offset/size: 0x23c/4
509Protocol: 2.07+ 513Protocol: 2.07+
510 514
@@ -520,11 +524,13 @@ Protocol: 2.07+
520 0x00000002 Xen 524 0x00000002 Xen
521 525
522Field name: hardware_subarch_data 526Field name: hardware_subarch_data
523Type: write 527Type: write (subarch-dependent)
524Offset/size: 0x240/8 528Offset/size: 0x240/8
525Protocol: 2.07+ 529Protocol: 2.07+
526 530
527 A pointer to data that is specific to hardware subarch 531 A pointer to data that is specific to hardware subarch
532 This field is currently unused for the default x86/PC environment,
533 do not modify.
528 534
529Field name: payload_offset 535Field name: payload_offset
530Type: read 536Type: read
@@ -545,6 +551,34 @@ Protocol: 2.08+
545 551
546 The length of the payload. 552 The length of the payload.
547 553
554Field name: setup_data
555Type: write (special)
556Offset/size: 0x250/8
557Protocol: 2.09+
558
559 The 64-bit physical pointer to a NULL-terminated singly linked list of
560 struct setup_data. This is used to define a more extensible boot
561 parameters passing mechanism. The definition of struct setup_data is
562 as follows:
563
564 struct setup_data {
565 u64 next;
566 u32 type;
567 u32 len;
568 u8 data[0];
569 };
570
571 Where, the next is a 64-bit physical pointer to the next node of
572 linked list, the next field of the last node is 0; the type is used
573 to identify the contents of data; the len is the length of data
574 field; the data holds the real payload.
575
576 This list may be modified at a number of points during the bootup
577 process. Therefore, when modifying this list one should always make
578 sure to consider the case where the linked list already contains
579 entries.
580
581
548**** THE IMAGE CHECKSUM 582**** THE IMAGE CHECKSUM
549 583
550From boot protocol version 2.08 onwards the CRC-32 is calculated over 584From boot protocol version 2.08 onwards the CRC-32 is calculated over
@@ -553,6 +587,7 @@ initial remainder of 0xffffffff. The checksum is appended to the
553file; therefore the CRC of the file up to the limit specified in the 587file; therefore the CRC of the file up to the limit specified in the
554syssize field of the header is always 0. 588syssize field of the header is always 0.
555 589
590
556**** THE KERNEL COMMAND LINE 591**** THE KERNEL COMMAND LINE
557 592
558The kernel command line has become an important way for the boot 593The kernel command line has become an important way for the boot
@@ -584,28 +619,6 @@ command line is entered using the following protocol:
584 covered by setup_move_size, so you may need to adjust this 619 covered by setup_move_size, so you may need to adjust this
585 field. 620 field.
586 621
587Field name: setup_data
588Type: write (obligatory)
589Offset/size: 0x250/8
590Protocol: 2.09+
591
592 The 64-bit physical pointer to NULL terminated single linked list of
593 struct setup_data. This is used to define a more extensible boot
594 parameters passing mechanism. The definition of struct setup_data is
595 as follow:
596
597 struct setup_data {
598 u64 next;
599 u32 type;
600 u32 len;
601 u8 data[0];
602 };
603
604 Where, the next is a 64-bit physical pointer to the next node of
605 linked list, the next field of the last node is 0; the type is used
606 to identify the contents of data; the len is the length of data
607 field; the data holds the real payload.
608
609 622
610**** MEMORY LAYOUT OF THE REAL-MODE CODE 623**** MEMORY LAYOUT OF THE REAL-MODE CODE
611 624
diff --git a/Documentation/i386/usb-legacy-support.txt b/Documentation/x86/i386/usb-legacy-support.txt
index 1894cdfc69d9..1894cdfc69d9 100644
--- a/Documentation/i386/usb-legacy-support.txt
+++ b/Documentation/x86/i386/usb-legacy-support.txt
diff --git a/Documentation/i386/zero-page.txt b/Documentation/x86/i386/zero-page.txt
index 169ad423a3d1..169ad423a3d1 100644
--- a/Documentation/i386/zero-page.txt
+++ b/Documentation/x86/i386/zero-page.txt
diff --git a/Documentation/x86_64/00-INDEX b/Documentation/x86/x86_64/00-INDEX
index 92fc20ab5f0e..92fc20ab5f0e 100644
--- a/Documentation/x86_64/00-INDEX
+++ b/Documentation/x86/x86_64/00-INDEX
diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index b0c7b6c4abda..b0c7b6c4abda 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
diff --git a/Documentation/x86_64/cpu-hotplug-spec b/Documentation/x86/x86_64/cpu-hotplug-spec
index 3c23e0587db3..3c23e0587db3 100644
--- a/Documentation/x86_64/cpu-hotplug-spec
+++ b/Documentation/x86/x86_64/cpu-hotplug-spec
diff --git a/Documentation/x86_64/fake-numa-for-cpusets b/Documentation/x86/x86_64/fake-numa-for-cpusets
index d1a985c5b00a..d1a985c5b00a 100644
--- a/Documentation/x86_64/fake-numa-for-cpusets
+++ b/Documentation/x86/x86_64/fake-numa-for-cpusets
diff --git a/Documentation/x86_64/kernel-stacks b/Documentation/x86/x86_64/kernel-stacks
index 5ad65d51fb95..5ad65d51fb95 100644
--- a/Documentation/x86_64/kernel-stacks
+++ b/Documentation/x86/x86_64/kernel-stacks
diff --git a/Documentation/x86_64/machinecheck b/Documentation/x86/x86_64/machinecheck
index a05e58e7b159..a05e58e7b159 100644
--- a/Documentation/x86_64/machinecheck
+++ b/Documentation/x86/x86_64/machinecheck
diff --git a/Documentation/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index b89b6d2bebfa..efce75097369 100644
--- a/Documentation/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -11,9 +11,8 @@ ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole
11ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space 11ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space
12ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB) 12ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB)
13... unused hole ... 13... unused hole ...
14ffffffff80000000 - ffffffff82800000 (=40 MB) kernel text mapping, from phys 0 14ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0
15... unused hole ... 15ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space
16ffffffff88000000 - fffffffffff00000 (=1919 MB) module mapping space
17 16
18The direct mapping covers all memory in the system up to the highest 17The direct mapping covers all memory in the system up to the highest
19memory address (this means in some cases it can also include PCI memory 18memory address (this means in some cases it can also include PCI memory
diff --git a/Documentation/x86_64/uefi.txt b/Documentation/x86/x86_64/uefi.txt
index 7d77120a5184..a5e2b4fdb170 100644
--- a/Documentation/x86_64/uefi.txt
+++ b/Documentation/x86/x86_64/uefi.txt
@@ -36,3 +36,7 @@ Mechanics:
36 services. 36 services.
37 noefi turn off all EFI runtime services 37 noefi turn off all EFI runtime services
38 reboot_type=k turn off EFI reboot runtime service 38 reboot_type=k turn off EFI reboot runtime service
39- If the EFI memory map has additional entries not in the E820 map,
40 you can include those entries in the kernel's memory map of available
41 physical RAM by using the following kernel command line parameter.
42 add_efi_memmap include EFI memory map of available physical RAM