aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
author Jaroslav Kysela <perex@perex.cz> 2010-02-16 05:19:18 -0500
committer Jaroslav Kysela <perex@perex.cz> 2010-02-16 05:19:18 -0500
commit ba9341dfef6b0201cd30e3904dcd0a47d3dc35e0 (patch)
tree d83637979db83bb9d5a23e190148b90b60c976d2 /Documentation
parent d39e82db73eb876c60d00f00219d767b3be30307 (diff)
parent f167e1d073278fe231bbdd5d6c24fb9d091aa544 (diff)
Merge branch 'fixes' into devel
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/DocBook/mtdnand.tmpl12
-rw-r--r--Documentation/IO-mapping.txt2
-rw-r--r--Documentation/PCI/PCI-DMA-mapping.txt (renamed from Documentation/DMA-mapping.txt)0
-rw-r--r--Documentation/block/00-INDEX2
-rw-r--r--Documentation/block/as-iosched.txt172
-rw-r--r--Documentation/block/biodoc.txt2
-rw-r--r--Documentation/filesystems/ext4.txt2
-rw-r--r--Documentation/filesystems/nilfs2.txt2
-rw-r--r--Documentation/kernel-parameters.txt5
-rw-r--r--Documentation/kvm/api.txt10
-rw-r--r--Documentation/laptops/thinkpad-acpi.txt58
-rw-r--r--Documentation/sound/alsa/Procfile.txt2
-rw-r--r--Documentation/trace/ftrace-design.txt14
-rw-r--r--Documentation/trace/mmiotrace.txt15
-rw-r--r--Documentation/trace/tracepoint-analysis.txt60
-rw-r--r--Documentation/vgaarbiter.txt2
16 files changed, 120 insertions, 240 deletions
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
index f508a8a27fea..5e7d84b48505 100644
--- a/Documentation/DocBook/mtdnand.tmpl
+++ b/Documentation/DocBook/mtdnand.tmpl
@@ -174,7 +174,7 @@
174 </para> 174 </para>
175 <programlisting> 175 <programlisting>
176static struct mtd_info *board_mtd; 176static struct mtd_info *board_mtd;
177static unsigned long baseaddr; 177static void __iomem *baseaddr;
178 </programlisting> 178 </programlisting>
179 <para> 179 <para>
180 Static example 180 Static example
@@ -182,7 +182,7 @@ static unsigned long baseaddr;
182 <programlisting> 182 <programlisting>
183static struct mtd_info board_mtd; 183static struct mtd_info board_mtd;
184static struct nand_chip board_chip; 184static struct nand_chip board_chip;
185static unsigned long baseaddr; 185static void __iomem *baseaddr;
186 </programlisting> 186 </programlisting>
187 </sect1> 187 </sect1>
188 <sect1 id="Partition_defines"> 188 <sect1 id="Partition_defines">
@@ -283,8 +283,8 @@ int __init board_init (void)
283 } 283 }
284 284
285 /* map physical address */ 285 /* map physical address */
286 baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024); 286 baseaddr = ioremap(CHIP_PHYSICAL_ADDRESS, 1024);
287 if(!baseaddr){ 287 if (!baseaddr) {
288 printk("Ioremap to access NAND chip failed\n"); 288 printk("Ioremap to access NAND chip failed\n");
289 err = -EIO; 289 err = -EIO;
290 goto out_mtd; 290 goto out_mtd;
@@ -316,7 +316,7 @@ int __init board_init (void)
316 goto out; 316 goto out;
317 317
318out_ior: 318out_ior:
319 iounmap((void *)baseaddr); 319 iounmap(baseaddr);
320out_mtd: 320out_mtd:
321 kfree (board_mtd); 321 kfree (board_mtd);
322out: 322out:
@@ -341,7 +341,7 @@ static void __exit board_cleanup (void)
341 nand_release (board_mtd); 341 nand_release (board_mtd);
342 342
343 /* unmap physical address */ 343 /* unmap physical address */
344 iounmap((void *)baseaddr); 344 iounmap(baseaddr);
345 345
346 /* Free the MTD device structure */ 346 /* Free the MTD device structure */
347 kfree (board_mtd); 347 kfree (board_mtd);
diff --git a/Documentation/IO-mapping.txt b/Documentation/IO-mapping.txt
index 78a440695e11..1b5aa10df845 100644
--- a/Documentation/IO-mapping.txt
+++ b/Documentation/IO-mapping.txt
@@ -157,7 +157,7 @@ For such memory, you can do things like
157 * access only the 640k-1MB area, so anything else 157 * access only the 640k-1MB area, so anything else
158 * has to be remapped. 158 * has to be remapped.
159 */ 159 */
160 char * baseptr = ioremap(0xFC000000, 1024*1024); 160 void __iomem *baseptr = ioremap(0xFC000000, 1024*1024);
161 161
162 /* write a 'A' to the offset 10 of the area */ 162 /* write a 'A' to the offset 10 of the area */
163 writeb('A',baseptr+10); 163 writeb('A',baseptr+10);
diff --git a/Documentation/DMA-mapping.txt b/Documentation/PCI/PCI-DMA-mapping.txt
index ecad88d9fe59..ecad88d9fe59 100644
--- a/Documentation/DMA-mapping.txt
+++ b/Documentation/PCI/PCI-DMA-mapping.txt
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index 961a0513f8c3..a406286f6f3e 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -1,7 +1,5 @@
100-INDEX 100-INDEX
2 - This file 2 - This file
3as-iosched.txt
4 - Anticipatory IO scheduler
5barrier.txt 3barrier.txt
6 - I/O Barriers 4 - I/O Barriers
7biodoc.txt 5biodoc.txt
diff --git a/Documentation/block/as-iosched.txt b/Documentation/block/as-iosched.txt
deleted file mode 100644
index 738b72be128e..000000000000
--- a/Documentation/block/as-iosched.txt
+++ /dev/null
@@ -1,172 +0,0 @@
1Anticipatory IO scheduler
2-------------------------
3Nick Piggin <piggin@cyberone.com.au> 13 Sep 2003
4
5Attention! Database servers, especially those using "TCQ" disks should
6investigate performance with the 'deadline' IO scheduler. Any system with high
7disk performance requirements should do so, in fact.
8
9If you see unusual performance characteristics of your disk systems, or you
10see big performance regressions versus the deadline scheduler, please email
11me. Database users don't bother unless you're willing to test a lot of patches
12from me ;) it's a known issue.
13
14Also, users with hardware RAID controllers, doing striping, may find
15highly variable performance results with using the as-iosched. The
16as-iosched anticipatory implementation is based on the notion that a disk
17device has only one physical seeking head. A striped RAID controller
18actually has a head for each physical device in the logical RAID device.
19
20However, setting the antic_expire (see tunable parameters below) produces
21very similar behavior to the deadline IO scheduler.
22
23Selecting IO schedulers
24-----------------------
25Refer to Documentation/block/switching-sched.txt for information on
26selecting an io scheduler on a per-device basis.
27
28Anticipatory IO scheduler Policies
29----------------------------------
30The as-iosched implementation implements several layers of policies
31to determine when an IO request is dispatched to the disk controller.
32Here are the policies outlined, in order of application.
33
341. one-way Elevator algorithm.
35
36The elevator algorithm is similar to that used in deadline scheduler, with
37the addition that it allows limited backward movement of the elevator
38(i.e. seeks backwards). A seek backwards can occur when choosing between
39two IO requests where one is behind the elevator's current position, and
40the other is in front of the elevator's position. If the seek distance to
41the request in back of the elevator is less than half the seek distance to
42the request in front of the elevator, then the request in back can be chosen.
43Backward seeks are also limited to a maximum of MAXBACK (1024*1024) sectors.
44This favors forward movement of the elevator, while allowing opportunistic
45"short" backward seeks.
46
472. FIFO expiration times for reads and for writes.
48
49This is again very similar to the deadline IO scheduler. The expiration
50times for requests on these lists is tunable using the parameters read_expire
51and write_expire discussed below. When a read or a write expires in this way,
52the IO scheduler will interrupt its current elevator sweep or read anticipation
53to service the expired request.
54
553. Read and write request batching
56
57A batch is a collection of read requests or a collection of write
58requests. The as scheduler alternates dispatching read and write batches
59to the driver. In the case of a read batch, the scheduler submits read
60requests to the driver as long as there are read requests to submit, and
61the read batch time limit has not been exceeded (read_batch_expire).
62The read batch time limit begins counting down only when there are
63competing write requests pending.
64
65In the case of a write batch, the scheduler submits write requests to
66the driver as long as there are write requests available, and the
67write batch time limit has not been exceeded (write_batch_expire).
68However, the length of write batches will be gradually shortened
69when read batches frequently exceed their time limit.
70
71When changing between batch types, the scheduler waits for all requests
72from the previous batch to complete before scheduling requests for the
73next batch.
74
75The read and write fifo expiration times described in policy 2 above
76are checked only when scheduling IO of a batch for the corresponding
77(read/write) type. So for example, the read FIFO timeout values are
78tested only during read batches. Likewise, the write FIFO timeout
79values are tested only during write batches. For this reason,
80it is generally not recommended for the read batch time
81to be longer than the write expiration time, nor for the write batch
82time to exceed the read expiration time (see tunable parameters below).
83
84When the IO scheduler changes from a read to a write batch,
85it begins the elevator from the request that is on the head of the
86write expiration FIFO. Likewise, when changing from a write batch to
87a read batch, the scheduler begins the elevator from the first entry
88on the read expiration FIFO.
89
904. Read anticipation.
91
92Read anticipation occurs only when scheduling a read batch.
93This implementation of read anticipation allows only one read request
94to be dispatched to the disk controller at a time. In
95contrast, many write requests may be dispatched to the disk controller
96at a time during a write batch. It is this characteristic that can make
97the anticipatory scheduler perform anomalously with controllers supporting
98TCQ, or with hardware striped RAID devices. Setting the antic_expire
99queue parameter (see below) to zero disables this behavior, and the
100anticipatory scheduler behaves essentially like the deadline scheduler.
101
102When read anticipation is enabled (antic_expire is not zero), reads
103are dispatched to the disk controller one at a time.
104At the end of each read request, the IO scheduler examines its next
105candidate read request from its sorted read list. If that next request
106is from the same process as the request that just completed,
107or if the next request in the queue is "very close" to the
108just completed request, it is dispatched immediately. Otherwise,
109statistics (average think time, average seek distance) on the process
110that submitted the just completed request are examined. If it seems
111likely that that process will submit another request soon, and that
112request is likely to be near the just completed request, then the IO
113scheduler will stop dispatching more read requests for up to (antic_expire)
114milliseconds, hoping that process will submit a new request near the one
115that just completed. If such a request is made, then it is dispatched
116immediately. If the antic_expire wait time expires, then the IO scheduler
117will dispatch the next read request from the sorted read queue.
118
119To decide whether an anticipatory wait is worthwhile, the scheduler
120maintains statistics for each process that can be used to compute
121mean "think time" (the time between read requests), and mean seek
122distance for that process. One observation is that these statistics
123are associated with each process, but those statistics are not associated
124with a specific IO device. So for example, if a process is doing IO
125on several file systems on separate devices, the statistics will be
126a combination of IO behavior from all those devices.
127
128
129Tuning the anticipatory IO scheduler
130------------------------------------
131When using 'as', the anticipatory IO scheduler, there are 5 parameters under
132/sys/block/*/queue/iosched/. All are units of milliseconds.
133
134The parameters are:
135* read_expire
136 Controls how long until a read request becomes "expired". It also controls the
137 interval between which expired requests are served, so set to 50, a request
138 might take anywhere < 100ms to be serviced _if_ it is the next on the
139 expired list. Obviously request expiration strategies won't make the disk
140 go faster. The result basically equates to the timeslice a single reader
141 gets in the presence of other IO. 100/((seek time / read_expire) + 1) is
142 very roughly the % streaming read efficiency your disk should get with
143 multiple readers.
144
145* read_batch_expire
146 Controls how much time a batch of reads is given before pending writes are
147 served. A higher value is more efficient. This might be set below read_expire
148 if writes are to be given higher priority than reads, but reads are to be
149 as efficient as possible when there are no writes. Generally though, it
150 should be some multiple of read_expire.
151
152* write_expire, and
153* write_batch_expire are equivalent to the above, for writes.
154
155* antic_expire
156 Controls the maximum amount of time we can anticipate a good read (one
157 with a short seek distance from the most recently completed request) before
158 giving up. Many other factors may cause anticipation to be stopped early,
159 or some processes will not be "anticipated" at all. Should be a bit higher
160 for big seek time devices though not a linear correspondence - most
161 processes have only a few ms thinktime.
162
163In addition to the tunables above there is a read-only file named est_time
164which, when read, will show:
165
166 - The probability of a task exiting without a cooperating task
167 submitting an anticipated IO.
168
169 - The current mean think time.
170
171 - The seek distance used to determine if an incoming IO is better.
172
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 8d2158a1c6aa..6fab97ea7e6b 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -186,7 +186,7 @@ a virtual address mapping (unlike the earlier scheme of virtual address
186do not have a corresponding kernel virtual address space mapping) and 186do not have a corresponding kernel virtual address space mapping) and
187low-memory pages. 187low-memory pages.
188 188
189Note: Please refer to Documentation/DMA-mapping.txt for a discussion 189Note: Please refer to Documentation/PCI/PCI-DMA-mapping.txt for a discussion
190on PCI high mem DMA aspects and mapping of scatter gather lists, and support 190on PCI high mem DMA aspects and mapping of scatter gather lists, and support
191for 64 bit PCI. 191for 64 bit PCI.
192 192
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index af6885c3c821..e1def1786e50 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -196,7 +196,7 @@ nobarrier This also requires an IO stack which can support
196 also be used to enable or disable barriers, for 196 also be used to enable or disable barriers, for
197 consistency with other ext4 mount options. 197 consistency with other ext4 mount options.
198 198
199inode_readahead=n This tuning parameter controls the maximum 199inode_readahead_blks=n This tuning parameter controls the maximum
200 number of inode table blocks that ext4's inode 200 number of inode table blocks that ext4's inode
201 table readahead algorithm will pre-read into 201 table readahead algorithm will pre-read into
202 the buffer cache. The default value is 32 blocks. 202 the buffer cache. The default value is 32 blocks.
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index 4949fcaa6b6a..839efd8a8a8c 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -28,7 +28,7 @@ described in the man pages included in the package.
28Project web page: http://www.nilfs.org/en/ 28Project web page: http://www.nilfs.org/en/
29Download page: http://www.nilfs.org/en/download.html 29Download page: http://www.nilfs.org/en/download.html
30Git tree web page: http://www.nilfs.org/git/ 30Git tree web page: http://www.nilfs.org/git/
31NILFS mailing lists: http://www.nilfs.org/mailman/listinfo/users 31List info: http://vger.kernel.org/vger-lists.html#linux-nilfs
32 32
33Caveats 33Caveats
34======= 34=======
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 5ba4d9dff113..736d45602886 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -240,7 +240,7 @@ and is between 256 and 4096 characters. It is defined in the file
240 240
241 acpi_sleep= [HW,ACPI] Sleep options 241 acpi_sleep= [HW,ACPI] Sleep options
242 Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, 242 Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig,
243 old_ordering, s4_nonvs } 243 old_ordering, s4_nonvs, sci_force_enable }
244 See Documentation/power/video.txt for information on 244 See Documentation/power/video.txt for information on
245 s3_bios and s3_mode. 245 s3_bios and s3_mode.
246 s3_beep is for debugging; it makes the PC's speaker beep 246 s3_beep is for debugging; it makes the PC's speaker beep
@@ -253,6 +253,9 @@ and is between 256 and 4096 characters. It is defined in the file
253 of _PTS is used by default). 253 of _PTS is used by default).
254 s4_nonvs prevents the kernel from saving/restoring the 254 s4_nonvs prevents the kernel from saving/restoring the
255 ACPI NVS memory during hibernation. 255 ACPI NVS memory during hibernation.
256 sci_force_enable causes the kernel to set SCI_EN directly
257 on resume from S1/S3 (which is against the ACPI spec,
258 but some broken systems don't work without it).
256 259
257 acpi_use_timer_override [HW,ACPI] 260 acpi_use_timer_override [HW,ACPI]
258 Use timer override. For some broken Nvidia NF5 boards 261 Use timer override. For some broken Nvidia NF5 boards
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index e1a114161027..2811e452f756 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -685,7 +685,7 @@ struct kvm_vcpu_events {
685 __u8 pad; 685 __u8 pad;
686 } nmi; 686 } nmi;
687 __u32 sipi_vector; 687 __u32 sipi_vector;
688 __u32 flags; /* must be zero */ 688 __u32 flags;
689}; 689};
690 690
6914.30 KVM_SET_VCPU_EVENTS 6914.30 KVM_SET_VCPU_EVENTS
@@ -701,6 +701,14 @@ vcpu.
701 701
702See KVM_GET_VCPU_EVENTS for the data structure. 702See KVM_GET_VCPU_EVENTS for the data structure.
703 703
704Fields that may be modified asynchronously by running VCPUs can be excluded
705from the update. These fields are nmi.pending and sipi_vector. Keep the
706corresponding bits in the flags field cleared to suppress overwriting the
707current in-kernel state. The bits are:
708
709KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel
710KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector
711
704 712
7055. The kvm_run structure 7135. The kvm_run structure
706 714
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt
index 169091f75e6d..75afa1229fd7 100644
--- a/Documentation/laptops/thinkpad-acpi.txt
+++ b/Documentation/laptops/thinkpad-acpi.txt
@@ -1092,8 +1092,8 @@ WARNING:
1092 its level up and down at every change. 1092 its level up and down at every change.
1093 1093
1094 1094
1095Volume control 1095Volume control (Console Audio control)
1096-------------- 1096--------------------------------------
1097 1097
1098procfs: /proc/acpi/ibm/volume 1098procfs: /proc/acpi/ibm/volume
1099ALSA: "ThinkPad Console Audio Control", default ID: "ThinkPadEC" 1099ALSA: "ThinkPad Console Audio Control", default ID: "ThinkPadEC"
@@ -1110,9 +1110,53 @@ the desktop environment to just provide on-screen-display feedback.
1110Software volume control should be done only in the main AC97/HDA 1110Software volume control should be done only in the main AC97/HDA
1111mixer. 1111mixer.
1112 1112
1113This feature allows volume control on ThinkPad models with a digital 1113
1114volume knob (when available, not all models have it), as well as 1114About the ThinkPad Console Audio control:
1115mute/unmute control. The available commands are: 1115
1116ThinkPads have a built-in amplifier and muting circuit that drives the
1117console headphone and speakers. This circuit is after the main AC97
1118or HDA mixer in the audio path, and under exclusive control of the
1119firmware.
1120
1121ThinkPads have three special hotkeys to interact with the console
1122audio control: volume up, volume down and mute.
1123
1124It is worth noting that the normal way the mute function works (on
1125ThinkPads that do not have a "mute LED") is:
1126
11271. Press mute to mute. It will *always* mute, you can press it as
1128 many times as you want, and the sound will remain mute.
1129
11302. Press either volume key to unmute the ThinkPad (it will _not_
1131 change the volume, it will just unmute).
1132
1133This is a very superior design when compared to the cheap software-only
1134mute-toggle solution found on normal consumer laptops: you can be
1135absolutely sure the ThinkPad will not make noise if you press the mute
1136button, no matter the previous state.
1137
1138The IBM ThinkPads, and the earlier Lenovo ThinkPads have variable-gain
1139amplifiers driving the speakers and headphone output, and the firmware
1140also handles volume control for the headphone and speakers on these
1141ThinkPads without any help from the operating system (this volume
1142control stage exists after the main AC97 or HDA mixer in the audio
1143path).
1144
1145The newer Lenovo models only have firmware mute control, and depend on
1146the main HDA mixer to do volume control (which is done by the operating
1147system). In this case, the volume keys are filtered out for unmute
1148key press (there are some firmware bugs in this area) and delivered as
1149normal key presses to the operating system (thinkpad-acpi is not
1150involved).
1151
1152
1153The ThinkPad-ACPI volume control:
1154
1155The preferred way to interact with the Console Audio control is the
1156ALSA interface.
1157
1158The legacy procfs interface allows one to read the current state,
1159and if volume control is enabled, accepts the following commands:
1116 1160
1117 echo up >/proc/acpi/ibm/volume 1161 echo up >/proc/acpi/ibm/volume
1118 echo down >/proc/acpi/ibm/volume 1162 echo down >/proc/acpi/ibm/volume
@@ -1121,12 +1165,10 @@ mute/unmute control. The available commands are:
1121 echo 'level <level>' >/proc/acpi/ibm/volume 1165 echo 'level <level>' >/proc/acpi/ibm/volume
1122 1166
1123The <level> number range is 0 to 14 although not all of them may be 1167The <level> number range is 0 to 14 although not all of them may be
1124distinct. The unmute the volume after the mute command, use either the 1168distinct. To unmute the volume after the mute command, use either the
1125up or down command (the level command will not unmute the volume), or 1169up or down command (the level command will not unmute the volume), or
1126the unmute command. 1170the unmute command.
1127 1171
1128The current volume level and mute state is shown in the file.
1129
1130You can use the volume_capabilities parameter to tell the driver 1172You can use the volume_capabilities parameter to tell the driver
1131whether your thinkpad has volume control or mute-only control: 1173whether your thinkpad has volume control or mute-only control:
1132volume_capabilities=1 for mixers with mute and volume control, 1174volume_capabilities=1 for mixers with mute and volume control,
diff --git a/Documentation/sound/alsa/Procfile.txt b/Documentation/sound/alsa/Procfile.txt
index 719a819f8cc2..07301de12cc4 100644
--- a/Documentation/sound/alsa/Procfile.txt
+++ b/Documentation/sound/alsa/Procfile.txt
@@ -95,7 +95,7 @@ card*/pcm*/xrun_debug
95 It takes an integer value, can be changed by writing to this 95 It takes an integer value, can be changed by writing to this
96 file, such as 96 file, such as
97 97
98 # cat 5 > /proc/asound/card0/pcm0p/xrun_debug 98 # echo 5 > /proc/asound/card0/pcm0p/xrun_debug
99 99
100 The value consists of the following bit flags: 100 The value consists of the following bit flags:
101 bit 0 = Enable XRUN/jiffies debug messages 101 bit 0 = Enable XRUN/jiffies debug messages
diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt
index 641a1ef2a7ff..239f14b2b55a 100644
--- a/Documentation/trace/ftrace-design.txt
+++ b/Documentation/trace/ftrace-design.txt
@@ -53,14 +53,14 @@ size of the mcount call that is embedded in the function).
53For example, if the function foo() calls bar(), when the bar() function calls 53For example, if the function foo() calls bar(), when the bar() function calls
54mcount(), the arguments mcount() will pass to the tracer are: 54mcount(), the arguments mcount() will pass to the tracer are:
55 "frompc" - the address bar() will use to return to foo() 55 "frompc" - the address bar() will use to return to foo()
56 "selfpc" - the address bar() (with _mcount() size adjustment) 56 "selfpc" - the address bar() (with mcount() size adjustment)
57 57
58Also keep in mind that this mcount function will be called *a lot*, so 58Also keep in mind that this mcount function will be called *a lot*, so
59optimizing for the default case of no tracer will help the smooth running of 59optimizing for the default case of no tracer will help the smooth running of
60your system when tracing is disabled. So the start of the mcount function is 60your system when tracing is disabled. So the start of the mcount function is
61typically the bare min with checking things before returning. That also means 61typically the bare minimum with checking things before returning. That also
62the code flow should usually kept linear (i.e. no branching in the nop case). 62means the code flow should usually be kept linear (i.e. no branching in the nop
63This is of course an optimization and not a hard requirement. 63case). This is of course an optimization and not a hard requirement.
64 64
65Here is some pseudo code that should help (these functions should actually be 65Here is some pseudo code that should help (these functions should actually be
66implemented in assembly): 66implemented in assembly):
@@ -131,10 +131,10 @@ some functions to save (hijack) and restore the return address.
131 131
132The mcount function should check the function pointers ftrace_graph_return 132The mcount function should check the function pointers ftrace_graph_return
133(compare to ftrace_stub) and ftrace_graph_entry (compare to 133(compare to ftrace_stub) and ftrace_graph_entry (compare to
134ftrace_graph_entry_stub). If either of those are not set to the relevant stub 134ftrace_graph_entry_stub). If either of those is not set to the relevant stub
135function, call the arch-specific function ftrace_graph_caller which in turn 135function, call the arch-specific function ftrace_graph_caller which in turn
136calls the arch-specific function prepare_ftrace_return. Neither of these 136calls the arch-specific function prepare_ftrace_return. Neither of these
137function names are strictly required, but you should use them anyways to stay 137function names is strictly required, but you should use them anyway to stay
138consistent across the architecture ports -- easier to compare & contrast 138consistent across the architecture ports -- easier to compare & contrast
139things. 139things.
140 140
@@ -144,7 +144,7 @@ but the first argument should be a pointer to the "frompc". Typically this is
144located on the stack. This allows the function to hijack the return address 144located on the stack. This allows the function to hijack the return address
145temporarily to have it point to the arch-specific function return_to_handler. 145temporarily to have it point to the arch-specific function return_to_handler.
146That function will simply call the common ftrace_return_to_handler function and 146That function will simply call the common ftrace_return_to_handler function and
147that will return the original return address with which, you can return to the 147that will return the original return address with which you can return to the
148original call site. 148original call site.
149 149
150Here is the updated mcount pseudo code: 150Here is the updated mcount pseudo code:
diff --git a/Documentation/trace/mmiotrace.txt b/Documentation/trace/mmiotrace.txt
index 162effbfbdec..664e7386d89e 100644
--- a/Documentation/trace/mmiotrace.txt
+++ b/Documentation/trace/mmiotrace.txt
@@ -44,7 +44,8 @@ Check for lost events.
44Usage 44Usage
45----- 45-----
46 46
47Make sure debugfs is mounted to /sys/kernel/debug. If not, (requires root privileges) 47Make sure debugfs is mounted to /sys/kernel/debug.
48If not (requires root privileges):
48$ mount -t debugfs debugfs /sys/kernel/debug 49$ mount -t debugfs debugfs /sys/kernel/debug
49 50
50Check that the driver you are about to trace is not loaded. 51Check that the driver you are about to trace is not loaded.
@@ -91,7 +92,7 @@ $ dmesg > dmesg.txt
91$ tar zcf pciid-nick-mmiotrace.tar.gz mydump.txt lspci.txt dmesg.txt 92$ tar zcf pciid-nick-mmiotrace.tar.gz mydump.txt lspci.txt dmesg.txt
92and then send the .tar.gz file. The trace compresses considerably. Replace 93and then send the .tar.gz file. The trace compresses considerably. Replace
93"pciid" and "nick" with the PCI ID or model name of your piece of hardware 94"pciid" and "nick" with the PCI ID or model name of your piece of hardware
94under investigation and your nick name. 95under investigation and your nickname.
95 96
96 97
97How Mmiotrace Works 98How Mmiotrace Works
@@ -100,7 +101,7 @@ How Mmiotrace Works
100Access to hardware IO-memory is gained by mapping addresses from PCI bus by 101Access to hardware IO-memory is gained by mapping addresses from PCI bus by
101calling one of the ioremap_*() functions. Mmiotrace is hooked into the 102calling one of the ioremap_*() functions. Mmiotrace is hooked into the
102__ioremap() function and gets called whenever a mapping is created. Mapping is 103__ioremap() function and gets called whenever a mapping is created. Mapping is
103an event that is recorded into the trace log. Note, that ISA range mappings 104an event that is recorded into the trace log. Note that ISA range mappings
104are not caught, since the mapping always exists and is returned directly. 105are not caught, since the mapping always exists and is returned directly.
105 106
106MMIO accesses are recorded via page faults. Just before __ioremap() returns, 107MMIO accesses are recorded via page faults. Just before __ioremap() returns,
@@ -122,11 +123,11 @@ Trace Log Format
122---------------- 123----------------
123 124
124The raw log is text and easily filtered with e.g. grep and awk. One record is 125The raw log is text and easily filtered with e.g. grep and awk. One record is
125one line in the log. A record starts with a keyword, followed by keyword 126one line in the log. A record starts with a keyword, followed by keyword-
126dependant arguments. Arguments are separated by a space, or continue until the 127dependent arguments. Arguments are separated by a space, or continue until the
127end of line. The format for version 20070824 is as follows: 128end of line. The format for version 20070824 is as follows:
128 129
129Explanation Keyword Space separated arguments 130Explanation Keyword Space-separated arguments
130--------------------------------------------------------------------------- 131---------------------------------------------------------------------------
131 132
132read event R width, timestamp, map id, physical, value, PC, PID 133read event R width, timestamp, map id, physical, value, PC, PID
@@ -136,7 +137,7 @@ iounmap event UNMAP timestamp, map id, PC, PID
136marker MARK timestamp, text 137marker MARK timestamp, text
137version VERSION the string "20070824" 138version VERSION the string "20070824"
138info for reader LSPCI one line from lspci -v 139info for reader LSPCI one line from lspci -v
139PCI address map PCIDEV space separated /proc/bus/pci/devices data 140PCI address map PCIDEV space-separated /proc/bus/pci/devices data
140unk. opcode UNKNOWN timestamp, map id, physical, data, PC, PID 141unk. opcode UNKNOWN timestamp, map id, physical, data, PC, PID
141 142
142Timestamp is in seconds with decimals. Physical is a PCI bus address, virtual 143Timestamp is in seconds with decimals. Physical is a PCI bus address, virtual
diff --git a/Documentation/trace/tracepoint-analysis.txt b/Documentation/trace/tracepoint-analysis.txt
index 5eb4e487e667..87bee3c129ba 100644
--- a/Documentation/trace/tracepoint-analysis.txt
+++ b/Documentation/trace/tracepoint-analysis.txt
@@ -10,8 +10,8 @@ Tracepoints (see Documentation/trace/tracepoints.txt) can be used without
10creating custom kernel modules to register probe functions using the event 10creating custom kernel modules to register probe functions using the event
11tracing infrastructure. 11tracing infrastructure.
12 12
13Simplistically, tracepoints will represent an important event that when can 13Simplistically, tracepoints represent important events that can be
14be taken in conjunction with other tracepoints to build a "Big Picture" of 14taken in conjunction with other tracepoints to build a "Big Picture" of
15what is going on within the system. There are a large number of methods for 15what is going on within the system. There are a large number of methods for
16gathering and interpreting these events. Lacking any current Best Practises, 16gathering and interpreting these events. Lacking any current Best Practises,
17this document describes some of the methods that can be used. 17this document describes some of the methods that can be used.
@@ -33,12 +33,12 @@ calling
33 33
34will give a fair indication of the number of events available. 34will give a fair indication of the number of events available.
35 35
362.2 PCL 362.2 PCL (Performance Counters for Linux)
37------- 37-------
38 38
39Discovery and enumeration of all counters and events, including tracepoints 39Discovery and enumeration of all counters and events, including tracepoints,
40are available with the perf tool. Getting a list of available events is a 40are available with the perf tool. Getting a list of available events is a
41simple case of 41simple case of:
42 42
43 $ perf list 2>&1 | grep Tracepoint 43 $ perf list 2>&1 | grep Tracepoint
44 ext4:ext4_free_inode [Tracepoint event] 44 ext4:ext4_free_inode [Tracepoint event]
@@ -49,19 +49,19 @@ simple case of
49 [ .... remaining output snipped .... ] 49 [ .... remaining output snipped .... ]
50 50
51 51
522. Enabling Events 523. Enabling Events
53================== 53==================
54 54
552.1 System-Wide Event Enabling 553.1 System-Wide Event Enabling
56------------------------------ 56------------------------------
57 57
58See Documentation/trace/events.txt for a proper description on how events 58See Documentation/trace/events.txt for a proper description on how events
59can be enabled system-wide. A short example of enabling all events related 59can be enabled system-wide. A short example of enabling all events related
60to page allocation would look something like 60to page allocation would look something like:
61 61
62 $ for i in `find /sys/kernel/debug/tracing/events -name "enable" | grep mm_`; do echo 1 > $i; done 62 $ for i in `find /sys/kernel/debug/tracing/events -name "enable" | grep mm_`; do echo 1 > $i; done
63 63
642.2 System-Wide Event Enabling with SystemTap 643.2 System-Wide Event Enabling with SystemTap
65--------------------------------------------- 65---------------------------------------------
66 66
67In SystemTap, tracepoints are accessible using the kernel.trace() function 67In SystemTap, tracepoints are accessible using the kernel.trace() function
@@ -86,7 +86,7 @@ were allocating the pages.
86 print_count() 86 print_count()
87 } 87 }
88 88
892.3 System-Wide Event Enabling with PCL 893.3 System-Wide Event Enabling with PCL
90--------------------------------------- 90---------------------------------------
91 91
92By specifying the -a switch and analysing sleep, the system-wide events 92By specifying the -a switch and analysing sleep, the system-wide events
@@ -107,16 +107,16 @@ for a duration of time can be examined.
107Similarly, one could execute a shell and exit it as desired to get a report 107Similarly, one could execute a shell and exit it as desired to get a report
108at that point. 108at that point.
109 109
1102.4 Local Event Enabling 1103.4 Local Event Enabling
111------------------------ 111------------------------
112 112
113Documentation/trace/ftrace.txt describes how to enable events on a per-thread 113Documentation/trace/ftrace.txt describes how to enable events on a per-thread
114basis using set_ftrace_pid. 114basis using set_ftrace_pid.
115 115
1162.5 Local Event Enablement with PCL 1163.5 Local Event Enablement with PCL
117----------------------------------- 117-----------------------------------
118 118
119Events can be activate and tracked for the duration of a process on a local 119Events can be activated and tracked for the duration of a process on a local
120basis using PCL such as follows. 120basis using PCL such as follows.
121 121
122 $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ 122 $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
@@ -131,18 +131,18 @@ basis using PCL such as follows.
131 131
132 0.973913387 seconds time elapsed 132 0.973913387 seconds time elapsed
133 133
1343. Event Filtering 1344. Event Filtering
135================== 135==================
136 136
137Documentation/trace/ftrace.txt covers in depth how to filter events in 137Documentation/trace/ftrace.txt covers in depth how to filter events in
138ftrace. Obviously, using grep and awk on trace_pipe is an option, as is 138ftrace. Obviously, using grep and awk on trace_pipe is an option, as is
139any script reading trace_pipe. 139any script reading trace_pipe.
140 140
1414. Analysing Event Variances with PCL 1415. Analysing Event Variances with PCL
142===================================== 142=====================================
143 143
144Any workload can exhibit variances between runs and it can be important 144Any workload can exhibit variances between runs and it can be important
145to know what the standard deviation in. By and large, this is left to the 145to know what the standard deviation is. By and large, this is left to the
146performance analyst to do by hand. In the event that the discrete event 146performance analyst to do by hand. In the event that the discrete event
147occurrences are useful to the performance analyst, then perf can be used. 147occurrences are useful to the performance analyst, then perf can be used.
148 148
@@ -166,7 +166,7 @@ In the event that some higher-level event is required that depends on some
166aggregation of discrete events, then a script would need to be developed. 166aggregation of discrete events, then a script would need to be developed.
167 167
168Using --repeat, it is also possible to view how events are fluctuating over 168Using --repeat, it is also possible to view how events are fluctuating over
169time on a system wide basis using -a and sleep. 169time on a system-wide basis using -a and sleep.
170 170
171 $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ 171 $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
172 -e kmem:mm_pagevec_free \ 172 -e kmem:mm_pagevec_free \
@@ -180,7 +180,7 @@ time on a system wide basis using -a and sleep.
180 180
181 1.002251757 seconds time elapsed ( +- 0.005% ) 181 1.002251757 seconds time elapsed ( +- 0.005% )
182 182
1835. Higher-Level Analysis with Helper Scripts 1836. Higher-Level Analysis with Helper Scripts
184============================================ 184============================================
185 185
186When events are enabled, the events that are triggering can be read from 186When events are enabled, the events that are triggering can be read from
@@ -190,11 +190,11 @@ be gathered on-line as appropriate. Examples of post-processing might include
190 190
191 o Reading information from /proc for the PID that triggered the event 191 o Reading information from /proc for the PID that triggered the event
192 o Deriving a higher-level event from a series of lower-level events. 192 o Deriving a higher-level event from a series of lower-level events.
193 o Calculate latencies between two events 193 o Calculating latencies between two events
194 194
195Documentation/trace/postprocess/trace-pagealloc-postprocess.pl is an example 195Documentation/trace/postprocess/trace-pagealloc-postprocess.pl is an example
196script that can read trace_pipe from STDIN or a copy of a trace. When used 196script that can read trace_pipe from STDIN or a copy of a trace. When used
197on-line, it can be interrupted once to generate a report without existing 197on-line, it can be interrupted once to generate a report without exiting
198and twice to exit. 198and twice to exit.
199 199
200Simplistically, the script just reads STDIN and counts up events but it 200Simplistically, the script just reads STDIN and counts up events but it
@@ -212,12 +212,12 @@ also can do more such as
212 processes, the parent process responsible for creating all the helpers 212 processes, the parent process responsible for creating all the helpers
213 can be identified 213 can be identified
214 214
2156. Lower-Level Analysis with PCL 2157. Lower-Level Analysis with PCL
216================================ 216================================
217 217
218There may also be a requirement to identify what functions with a program 218There may also be a requirement to identify what functions within a program
219were generating events within the kernel. To begin this sort of analysis, the 219were generating events within the kernel. To begin this sort of analysis, the
220data must be recorded. At the time of writing, this required root 220data must be recorded. At the time of writing, this required root:
221 221
222 $ perf record -c 1 \ 222 $ perf record -c 1 \
223 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ 223 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
@@ -253,11 +253,11 @@ perf report.
253 # (For more details, try: perf report --sort comm,dso,symbol) 253 # (For more details, try: perf report --sort comm,dso,symbol)
254 # 254 #
255 255
256According to this, the vast majority of events occured triggered on events 256According to this, the vast majority of events triggered on events
257within the VDSO. With simple binaries, this will often be the case so lets 257within the VDSO. With simple binaries, this will often be the case so let's
258take a slightly different example. In the course of writing this, it was 258take a slightly different example. In the course of writing this, it was
259noticed that X was generating an insane amount of page allocations so lets look 259noticed that X was generating an insane amount of page allocations so let's look
260at it 260at it:
261 261
262 $ perf record -c 1 -f \ 262 $ perf record -c 1 -f \
263 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ 263 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
@@ -280,8 +280,8 @@ This was interrupted after a few seconds and
280 # (For more details, try: perf report --sort comm,dso,symbol) 280 # (For more details, try: perf report --sort comm,dso,symbol)
281 # 281 #
282 282
283So, almost half of the events are occuring in a library. To get an idea which 283So, almost half of the events are occurring in a library. To get an idea which
284symbol. 284symbol:
285 285
286 $ perf report --sort comm,dso,symbol 286 $ perf report --sort comm,dso,symbol
287 # Samples: 27666 287 # Samples: 27666
@@ -297,7 +297,7 @@ symbol.
297 0.01% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] get_fast_path 297 0.01% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] get_fast_path
298 0.00% Xorg [kernel] [k] ftrace_trace_userstack 298 0.00% Xorg [kernel] [k] ftrace_trace_userstack
299 299
300To see where within the function pixmanFillsse2 things are going wrong 300To see where within the function pixmanFillsse2 things are going wrong:
301 301
302 $ perf annotate pixmanFillsse2 302 $ perf annotate pixmanFillsse2
303 [ ... ] 303 [ ... ]
diff --git a/Documentation/vgaarbiter.txt b/Documentation/vgaarbiter.txt
index 987f9b0a5ece..43a9b0694fdd 100644
--- a/Documentation/vgaarbiter.txt
+++ b/Documentation/vgaarbiter.txt
@@ -103,7 +103,7 @@ I.2 libpciaccess
103---------------- 103----------------
104 104
105To use the vga arbiter char device, an API was implemented inside the 105To use the vga arbiter char device, an API was implemented inside the
106libpciaccess library. One fieldd was added to struct pci_device (each device 106libpciaccess library. One field was added to struct pci_device (each device
107on the system): 107on the system):
108 108
109 /* the type of resource decoded by the device */ 109 /* the type of resource decoded by the device */