aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/ABI/testing/sysfs-block59
-rw-r--r--Documentation/ABI/testing/sysfs-bus-pci-devices-cciss33
-rw-r--r--Documentation/ABI/testing/sysfs-devices-cache_disable18
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-slab479
-rw-r--r--Documentation/Changes19
-rw-r--r--Documentation/CodingStyle4
-rw-r--r--Documentation/DMA-API.txt16
-rw-r--r--Documentation/DocBook/Makefile3
-rw-r--r--Documentation/DocBook/debugobjects.tmpl2
-rw-r--r--Documentation/DocBook/mac80211.tmpl1
-rw-r--r--Documentation/DocBook/tracepoint.tmpl89
-rw-r--r--Documentation/RCU/rculist_nulls.txt2
-rw-r--r--Documentation/RCU/trace.txt102
-rw-r--r--Documentation/SM501.txt2
-rw-r--r--Documentation/Smack.txt20
-rw-r--r--Documentation/SubmittingPatches82
-rw-r--r--Documentation/accounting/getdelays.c3
-rw-r--r--Documentation/arm/Samsung-S3C24XX/GPIO.txt10
-rw-r--r--Documentation/atomic_ops.txt4
-rw-r--r--Documentation/block/biodoc.txt2
-rw-r--r--Documentation/block/deadline-iosched.txt2
-rw-r--r--Documentation/braille-console.txt2
-rw-r--r--Documentation/cdrom/packet-writing.txt2
-rw-r--r--Documentation/dell_rbu.txt4
-rw-r--r--Documentation/development-process/5.Posting31
-rw-r--r--Documentation/driver-model/device.txt32
-rw-r--r--Documentation/driver-model/devres.txt2
-rw-r--r--Documentation/dvb/get_dvb_firmware8
-rw-r--r--Documentation/edac.txt8
-rw-r--r--Documentation/fault-injection/fault-injection.txt70
-rw-r--r--Documentation/fb/sh7760fb.txt2
-rw-r--r--Documentation/fb/vesafb.txt2
-rw-r--r--Documentation/feature-removal-schedule.txt17
-rw-r--r--Documentation/filesystems/Locking2
-rw-r--r--Documentation/filesystems/autofs4-mount-control.txt2
-rw-r--r--Documentation/filesystems/caching/netfs-api.txt2
-rw-r--r--Documentation/filesystems/debugfs.txt158
-rw-r--r--Documentation/filesystems/ext4.txt6
-rw-r--r--Documentation/filesystems/fiemap.txt2
-rw-r--r--Documentation/filesystems/gfs2-glocks.txt2
-rw-r--r--Documentation/filesystems/gfs2.txt19
-rw-r--r--Documentation/filesystems/nfs-rdma.txt2
-rw-r--r--Documentation/filesystems/nilfs2.txt5
-rw-r--r--Documentation/filesystems/proc.txt19
-rw-r--r--Documentation/filesystems/sysfs-pci.txt2
-rw-r--r--Documentation/filesystems/tmpfs.txt2
-rw-r--r--Documentation/filesystems/vfat.txt13
-rw-r--r--Documentation/firmware_class/README3
-rw-r--r--Documentation/futex-requeue-pi.txt131
-rw-r--r--Documentation/gpio.txt2
-rw-r--r--Documentation/hwmon/f71882fg12
-rw-r--r--Documentation/hwmon/ibmaem2
-rw-r--r--Documentation/hwmon/sysfs-interface25
-rw-r--r--Documentation/hwmon/tmp40142
-rw-r--r--Documentation/hwmon/w83627ehf11
-rw-r--r--Documentation/i2c/busses/i2c-ocores17
-rw-r--r--Documentation/i2c/busses/i2c-viapro4
-rw-r--r--Documentation/ide/ide.txt2
-rw-r--r--Documentation/input/multi-touch-protocol.txt103
-rw-r--r--Documentation/isdn/00-INDEX29
-rw-r--r--Documentation/isdn/INTERFACE.CAPI94
-rw-r--r--Documentation/isdn/README.gigaset42
-rw-r--r--Documentation/kbuild/kconfig.txt116
-rw-r--r--Documentation/kbuild/modules.txt2
-rw-r--r--Documentation/kdump/kdump.txt4
-rw-r--r--Documentation/kernel-parameters.txt86
-rw-r--r--Documentation/kmemcheck.txt773
-rw-r--r--Documentation/kmemleak.txt142
-rw-r--r--Documentation/kobject.txt2
-rw-r--r--Documentation/kprobes.txt6
-rw-r--r--Documentation/laptops/acer-wmi.txt2
-rw-r--r--Documentation/laptops/sony-laptop.txt2
-rw-r--r--Documentation/laptops/thinkpad-acpi.txt2
-rw-r--r--Documentation/lguest/Makefile3
-rw-r--r--Documentation/lguest/lguest.c1008
-rw-r--r--Documentation/lguest/lguest.txt1
-rw-r--r--Documentation/local_ops.txt2
-rw-r--r--Documentation/memory-barriers.txt129
-rw-r--r--Documentation/memory-hotplug.txt8
-rw-r--r--Documentation/mn10300/ABI.txt2
-rw-r--r--Documentation/mtd/nand_ecc.txt12
-rw-r--r--Documentation/networking/bonding.txt6
-rw-r--r--Documentation/networking/can.txt237
-rw-r--r--Documentation/networking/dm9000.txt2
-rw-r--r--Documentation/networking/ieee802154.txt76
-rw-r--r--Documentation/networking/ip-sysctl.txt33
-rw-r--r--Documentation/networking/ipv6.txt37
-rw-r--r--Documentation/networking/l2tp.txt2
-rw-r--r--Documentation/networking/mac80211-injection.txt28
-rw-r--r--Documentation/networking/netdevices.txt2
-rw-r--r--Documentation/networking/operstates.txt3
-rw-r--r--Documentation/networking/packet_mmap.txt140
-rw-r--r--Documentation/networking/phonet.txt2
-rw-r--r--Documentation/networking/regulatory.txt2
-rw-r--r--Documentation/power/devices.txt34
-rw-r--r--Documentation/power/regulator/consumer.txt2
-rw-r--r--Documentation/power/regulator/overview.txt2
-rw-r--r--Documentation/power/s2ram.txt2
-rw-r--r--Documentation/power/userland-swsusp.txt2
-rw-r--r--Documentation/powerpc/booting-without-of.txt4
-rw-r--r--Documentation/powerpc/dts-bindings/can/sja1000.txt53
-rw-r--r--Documentation/powerpc/dts-bindings/ecm.txt64
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/board.txt2
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm.txt2
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/gpio.txt2
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt3
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/esdhc.txt5
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/mcm.txt64
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/msi-pic.txt2
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/pmc.txt4
-rw-r--r--Documentation/powerpc/qe_firmware.txt2
-rw-r--r--Documentation/rbtree.txt10
-rw-r--r--Documentation/rfkill.txt607
-rw-r--r--Documentation/s390/Debugging390.txt4
-rw-r--r--Documentation/scheduler/sched-nice-design.txt2
-rw-r--r--Documentation/scheduler/sched-rt-group.txt20
-rw-r--r--Documentation/scsi/aic79xx.txt2
-rw-r--r--Documentation/scsi/ncr53c8xx.txt4
-rw-r--r--Documentation/scsi/sym53c8xx_2.txt2
-rw-r--r--Documentation/sound/alsa/ALSA-Configuration.txt38
-rw-r--r--Documentation/sound/alsa/HD-Audio-Models.txt19
-rw-r--r--Documentation/sound/alsa/HD-Audio.txt2
-rw-r--r--Documentation/sound/alsa/Procfile.txt39
-rw-r--r--Documentation/sound/alsa/README.maya44163
-rw-r--r--Documentation/sound/alsa/hda_codec.txt2
-rw-r--r--Documentation/sound/alsa/soc/dapm.txt1
-rw-r--r--Documentation/sysctl/kernel.txt11
-rw-r--r--Documentation/sysctl/vm.txt27
-rw-r--r--Documentation/timers/hpet.txt2
-rw-r--r--Documentation/timers/timer_stats.txt2
-rw-r--r--Documentation/trace/events.txt90
-rw-r--r--Documentation/trace/ftrace.txt252
-rw-r--r--Documentation/trace/kmemtrace.txt2
-rw-r--r--Documentation/trace/mmiotrace.txt26
-rw-r--r--Documentation/trace/power.txt17
-rw-r--r--Documentation/usb/WUSB-Design-overview.txt8
-rw-r--r--Documentation/usb/anchors.txt4
-rw-r--r--Documentation/usb/callbacks.txt2
-rw-r--r--Documentation/video4linux/CARDLIST.cx238855
-rw-r--r--Documentation/video4linux/CARDLIST.cx882
-rw-r--r--Documentation/video4linux/CARDLIST.em28xx6
-rw-r--r--Documentation/video4linux/CARDLIST.saa713422
-rw-r--r--Documentation/video4linux/CARDLIST.tuner2
-rw-r--r--Documentation/video4linux/cx18.txt2
-rw-r--r--Documentation/video4linux/gspca.txt12
-rw-r--r--Documentation/video4linux/pxa_camera.txt49
-rw-r--r--Documentation/video4linux/v4l2-framework.txt5
-rw-r--r--Documentation/vm/Makefile2
-rw-r--r--Documentation/vm/balance18
-rw-r--r--Documentation/vm/page-types.c698
-rw-r--r--Documentation/vm/pagemap.txt68
-rw-r--r--Documentation/x86/boot.txt122
-rw-r--r--Documentation/x86/x86_64/boot-options.txt49
-rw-r--r--Documentation/x86/x86_64/machinecheck8
-rw-r--r--Documentation/x86/x86_64/mm.txt9
155 files changed, 5613 insertions, 1832 deletions
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 44f52a4f5903..cbbd3e069945 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -60,3 +60,62 @@ Description:
60 Indicates whether the block layer should automatically 60 Indicates whether the block layer should automatically
61 generate checksums for write requests bound for 61 generate checksums for write requests bound for
62 devices that support receiving integrity metadata. 62 devices that support receiving integrity metadata.
63
64What: /sys/block/<disk>/alignment_offset
65Date: April 2009
66Contact: Martin K. Petersen <martin.petersen@oracle.com>
67Description:
68 Storage devices may report a physical block size that is
69 bigger than the logical block size (for instance a drive
70 with 4KB physical sectors exposing 512-byte logical
71 blocks to the operating system). This parameter
72 indicates how many bytes the beginning of the device is
73 offset from the disk's natural alignment.
74
75What: /sys/block/<disk>/<partition>/alignment_offset
76Date: April 2009
77Contact: Martin K. Petersen <martin.petersen@oracle.com>
78Description:
79 Storage devices may report a physical block size that is
80 bigger than the logical block size (for instance a drive
81 with 4KB physical sectors exposing 512-byte logical
82 blocks to the operating system). This parameter
83 indicates how many bytes the beginning of the partition
84 is offset from the disk's natural alignment.
85
86What: /sys/block/<disk>/queue/logical_block_size
87Date: May 2009
88Contact: Martin K. Petersen <martin.petersen@oracle.com>
89Description:
90 This is the smallest unit the storage device can
91 address. It is typically 512 bytes.
92
93What: /sys/block/<disk>/queue/physical_block_size
94Date: May 2009
95Contact: Martin K. Petersen <martin.petersen@oracle.com>
96Description:
97 This is the smallest unit the storage device can write
98 without resorting to read-modify-write operation. It is
99 usually the same as the logical block size but may be
100 bigger. One example is SATA drives with 4KB sectors
101 that expose a 512-byte logical block size to the
102 operating system.
103
104What: /sys/block/<disk>/queue/minimum_io_size
105Date: April 2009
106Contact: Martin K. Petersen <martin.petersen@oracle.com>
107Description:
108 Storage devices may report a preferred minimum I/O size,
109 which is the smallest request the device can perform
110 without incurring a read-modify-write penalty. For disk
111 drives this is often the physical block size. For RAID
112 arrays it is often the stripe chunk size.
113
114What: /sys/block/<disk>/queue/optimal_io_size
115Date: April 2009
116Contact: Martin K. Petersen <martin.petersen@oracle.com>
117Description:
118 Storage devices may report an optimal I/O size, which is
119 the device's preferred unit of receiving I/O. This is
120 rarely reported for disk drives. For RAID devices it is
121 usually the stripe width or the internal block size.
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
new file mode 100644
index 000000000000..0a92a7c93a62
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
@@ -0,0 +1,33 @@
1Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/model
2Date: March 2009
3Kernel Version: 2.6.30
4Contact: iss_storagedev@hp.com
5Description: Displays the SCSI INQUIRY page 0 model for logical drive
6 Y of controller X.
7
8Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/rev
9Date: March 2009
10Kernel Version: 2.6.30
11Contact: iss_storagedev@hp.com
12Description: Displays the SCSI INQUIRY page 0 revision for logical
13 drive Y of controller X.
14
15Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/unique_id
16Date: March 2009
17Kernel Version: 2.6.30
18Contact: iss_storagedev@hp.com
19Description: Displays the SCSI INQUIRY page 83 serial number for logical
20 drive Y of controller X.
21
22Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/vendor
23Date: March 2009
24Kernel Version: 2.6.30
25Contact: iss_storagedev@hp.com
26Description: Displays the SCSI INQUIRY page 0 vendor for logical drive
27 Y of controller X.
28
29Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/block:cciss!cXdY
30Date: March 2009
31Kernel Version: 2.6.30
32Contact: iss_storagedev@hp.com
33Description: A symbolic link to /sys/block/cciss!cXdY
diff --git a/Documentation/ABI/testing/sysfs-devices-cache_disable b/Documentation/ABI/testing/sysfs-devices-cache_disable
new file mode 100644
index 000000000000..175bb4f70512
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-cache_disable
@@ -0,0 +1,18 @@
1What: /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X
2Date: August 2008
3KernelVersion: 2.6.27
4Contact: mark.langsdorf@amd.com
5Description: These files exist in every cpu's cache index directories.
6 There are currently 2 cache_disable_# files in each
7 directory. Reading from these files on a supported
8 processor will return that cache disable index value
9 for that processor and node. Writing to one of these
10 files will cause the specified cache index to be disabled.
11
12 Currently, only AMD Family 10h Processors support cache index
13 disable, and only for their L3 caches. See the BIOS and
14 Kernel Developer's Guide at
15 http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/31116-Public-GH-BKDG_3.20_2-4-09.pdf
16 for formatting information and other details on the
17 cache index disable.
18Users: joachim.deguara@amd.com
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
new file mode 100644
index 000000000000..6dcf75e594fb
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -0,0 +1,479 @@
1What: /sys/kernel/slab
2Date: May 2007
3KernelVersion: 2.6.22
4Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
5 Christoph Lameter <cl@linux-foundation.org>
6Description:
7 The /sys/kernel/slab directory contains a snapshot of the
8 internal state of the SLUB allocator for each cache. Certain
9 files may be modified to change the behavior of the cache (and
10 any cache it aliases, if any).
11Users: kernel memory tuning tools
12
13What: /sys/kernel/slab/cache/aliases
14Date: May 2007
15KernelVersion: 2.6.22
16Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
17 Christoph Lameter <cl@linux-foundation.org>
18Description:
19 The aliases file is read-only and specifies how many caches
20 have merged into this cache.
21
22What: /sys/kernel/slab/cache/align
23Date: May 2007
24KernelVersion: 2.6.22
25Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
26 Christoph Lameter <cl@linux-foundation.org>
27Description:
28 The align file is read-only and specifies the cache's object
29 alignment in bytes.
30
31What: /sys/kernel/slab/cache/alloc_calls
32Date: May 2007
33KernelVersion: 2.6.22
34Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
35 Christoph Lameter <cl@linux-foundation.org>
36Description:
37 The alloc_calls file is read-only and lists the kernel code
38 locations from which allocations for this cache were performed.
39 The alloc_calls file only contains information if debugging is
40 enabled for that cache (see Documentation/vm/slub.txt).
41
42What: /sys/kernel/slab/cache/alloc_fastpath
43Date: February 2008
44KernelVersion: 2.6.25
45Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
46 Christoph Lameter <cl@linux-foundation.org>
47Description:
48 The alloc_fastpath file is read-only and specifies how many
49 objects have been allocated using the fast path.
50 Available when CONFIG_SLUB_STATS is enabled.
51
52What: /sys/kernel/slab/cache/alloc_from_partial
53Date: February 2008
54KernelVersion: 2.6.25
55Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
56 Christoph Lameter <cl@linux-foundation.org>
57Description:
58 The alloc_from_partial file is read-only and specifies how
59 many times a cpu slab has been full and it has been refilled
60 by using a slab from the list of partially used slabs.
61 Available when CONFIG_SLUB_STATS is enabled.
62
63What: /sys/kernel/slab/cache/alloc_refill
64Date: February 2008
65KernelVersion: 2.6.25
66Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
67 Christoph Lameter <cl@linux-foundation.org>
68Description:
69 The alloc_refill file is read-only and specifies how many
70 times the per-cpu freelist was empty but there were objects
71 available as the result of remote cpu frees.
72 Available when CONFIG_SLUB_STATS is enabled.
73
74What: /sys/kernel/slab/cache/alloc_slab
75Date: February 2008
76KernelVersion: 2.6.25
77Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
78 Christoph Lameter <cl@linux-foundation.org>
79Description:
80 The alloc_slab file is read-only and specifies how many times
81 a new slab had to be allocated from the page allocator.
82 Available when CONFIG_SLUB_STATS is enabled.
83
84What: /sys/kernel/slab/cache/alloc_slowpath
85Date: February 2008
86KernelVersion: 2.6.25
87Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
88 Christoph Lameter <cl@linux-foundation.org>
89Description:
90 The alloc_slowpath file is read-only and specifies how many
91 objects have been allocated using the slow path because of a
92 refill or allocation from a partial or new slab.
93 Available when CONFIG_SLUB_STATS is enabled.
94
95What: /sys/kernel/slab/cache/cache_dma
96Date: May 2007
97KernelVersion: 2.6.22
98Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
99 Christoph Lameter <cl@linux-foundation.org>
100Description:
101 The cache_dma file is read-only and specifies whether objects
102 are from ZONE_DMA.
103 Available when CONFIG_ZONE_DMA is enabled.
104
105What: /sys/kernel/slab/cache/cpu_slabs
106Date: May 2007
107KernelVersion: 2.6.22
108Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
109 Christoph Lameter <cl@linux-foundation.org>
110Description:
111 The cpu_slabs file is read-only and displays how many cpu slabs
112 are active and their NUMA locality.
113
114What: /sys/kernel/slab/cache/cpuslab_flush
115Date: April 2009
116KernelVersion: 2.6.31
117Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
118 Christoph Lameter <cl@linux-foundation.org>
119Description:
120 The file cpuslab_flush is read-only and specifies how many
121 times a cache's cpu slabs have been flushed as the result of
122 destroying or shrinking a cache, a cpu going offline, or as
123 the result of forcing an allocation from a certain node.
124 Available when CONFIG_SLUB_STATS is enabled.
125
126What: /sys/kernel/slab/cache/ctor
127Date: May 2007
128KernelVersion: 2.6.22
129Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
130 Christoph Lameter <cl@linux-foundation.org>
131Description:
132 The ctor file is read-only and specifies the cache's object
133 constructor function, which is invoked for each object when a
134 new slab is allocated.
135
136What: /sys/kernel/slab/cache/deactivate_empty
137Date: February 2008
138KernelVersion: 2.6.25
139Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
140 Christoph Lameter <cl@linux-foundation.org>
141Description:
142 The file deactivate_empty is read-only and specifies how many
143 times an empty cpu slab was deactivated.
144 Available when CONFIG_SLUB_STATS is enabled.
145
146What: /sys/kernel/slab/cache/deactivate_full
147Date: February 2008
148KernelVersion: 2.6.25
149Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
150 Christoph Lameter <cl@linux-foundation.org>
151Description:
152 The file deactivate_full is read-only and specifies how many
153 times a full cpu slab was deactivated.
154 Available when CONFIG_SLUB_STATS is enabled.
155
156What: /sys/kernel/slab/cache/deactivate_remote_frees
157Date: February 2008
158KernelVersion: 2.6.25
159Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
160 Christoph Lameter <cl@linux-foundation.org>
161Description:
162 The file deactivate_remote_frees is read-only and specifies how
163 many times a cpu slab has been deactivated and contained free
164 objects that were freed remotely.
165 Available when CONFIG_SLUB_STATS is enabled.
166
167What: /sys/kernel/slab/cache/deactivate_to_head
168Date: February 2008
169KernelVersion: 2.6.25
170Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
171 Christoph Lameter <cl@linux-foundation.org>
172Description:
173 The file deactivate_to_head is read-only and specifies how
174 many times a partial cpu slab was deactivated and added to the
175 head of its node's partial list.
176 Available when CONFIG_SLUB_STATS is enabled.
177
178What: /sys/kernel/slab/cache/deactivate_to_tail
179Date: February 2008
180KernelVersion: 2.6.25
181Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
182 Christoph Lameter <cl@linux-foundation.org>
183Description:
184 The file deactivate_to_tail is read-only and specifies how
185 many times a partial cpu slab was deactivated and added to the
186 tail of its node's partial list.
187 Available when CONFIG_SLUB_STATS is enabled.
188
189What: /sys/kernel/slab/cache/destroy_by_rcu
190Date: May 2007
191KernelVersion: 2.6.22
192Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
193 Christoph Lameter <cl@linux-foundation.org>
194Description:
195 The destroy_by_rcu file is read-only and specifies whether
196 slabs (not objects) are freed by rcu.
197
198What: /sys/kernel/slab/cache/free_add_partial
199Date: February 2008
200KernelVersion: 2.6.25
201Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
202 Christoph Lameter <cl@linux-foundation.org>
203Description:
204 The file free_add_partial is read-only and specifies how many
205 times an object has been freed in a full slab so that it had to
206 be added to its node's partial list.
207 Available when CONFIG_SLUB_STATS is enabled.
208
209What: /sys/kernel/slab/cache/free_calls
210Date: May 2007
211KernelVersion: 2.6.22
212Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
213 Christoph Lameter <cl@linux-foundation.org>
214Description:
215 The free_calls file is read-only and lists the locations of
216 object frees if slab debugging is enabled (see
217 Documentation/vm/slub.txt).
218
219What: /sys/kernel/slab/cache/free_fastpath
220Date: February 2008
221KernelVersion: 2.6.25
222Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
223 Christoph Lameter <cl@linux-foundation.org>
224Description:
225 The free_fastpath file is read-only and specifies how many
226 objects have been freed using the fast path because it was an
227 object from the cpu slab.
228 Available when CONFIG_SLUB_STATS is enabled.
229
230What: /sys/kernel/slab/cache/free_frozen
231Date: February 2008
232KernelVersion: 2.6.25
233Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
234 Christoph Lameter <cl@linux-foundation.org>
235Description:
236 The free_frozen file is read-only and specifies how many
237 objects have been freed to a frozen slab (i.e. a remote cpu
238 slab).
239 Available when CONFIG_SLUB_STATS is enabled.
240
241What: /sys/kernel/slab/cache/free_remove_partial
242Date: February 2008
243KernelVersion: 2.6.25
244Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
245 Christoph Lameter <cl@linux-foundation.org>
246Description:
247 The file free_remove_partial is read-only and specifies how
248 many times an object has been freed to a now-empty slab so
249 that it had to be removed from its node's partial list.
250 Available when CONFIG_SLUB_STATS is enabled.
251
252What: /sys/kernel/slab/cache/free_slab
253Date: February 2008
254KernelVersion: 2.6.25
255Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
256 Christoph Lameter <cl@linux-foundation.org>
257Description:
258 The free_slab file is read-only and specifies how many times an
259 empty slab has been freed back to the page allocator.
260 Available when CONFIG_SLUB_STATS is enabled.
261
262What: /sys/kernel/slab/cache/free_slowpath
263Date: February 2008
264KernelVersion: 2.6.25
265Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
266 Christoph Lameter <cl@linux-foundation.org>
267Description:
268 The free_slowpath file is read-only and specifies how many
269 objects have been freed using the slow path (i.e. to a full or
270 partial slab).
271 Available when CONFIG_SLUB_STATS is enabled.
272
273What: /sys/kernel/slab/cache/hwcache_align
274Date: May 2007
275KernelVersion: 2.6.22
276Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
277 Christoph Lameter <cl@linux-foundation.org>
278Description:
279 The hwcache_align file is read-only and specifies whether
280 objects are aligned on cachelines.
281
282What: /sys/kernel/slab/cache/min_partial
283Date: February 2009
284KernelVersion: 2.6.30
285Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
286 David Rientjes <rientjes@google.com>
287Description:
288 The min_partial file specifies how many empty slabs shall
289 remain on a node's partial list to avoid the overhead of
290 allocating new slabs. Such slabs may be reclaimed by utilizing
291 the shrink file.
292
293What: /sys/kernel/slab/cache/object_size
294Date: May 2007
295KernelVersion: 2.6.22
296Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
297 Christoph Lameter <cl@linux-foundation.org>
298Description:
299 The object_size file is read-only and specifies the cache's
300 object size.
301
302What: /sys/kernel/slab/cache/objects
303Date: May 2007
304KernelVersion: 2.6.22
305Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
306 Christoph Lameter <cl@linux-foundation.org>
307Description:
308 The objects file is read-only and displays how many objects are
309 active and which nodes they are from.
310
311What: /sys/kernel/slab/cache/objects_partial
312Date: April 2008
313KernelVersion: 2.6.26
314Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
315 Christoph Lameter <cl@linux-foundation.org>
316Description:
317 The objects_partial file is read-only and displays how many
318 objects are on partial slabs and which nodes they are
319 from.
320
321What: /sys/kernel/slab/cache/objs_per_slab
322Date: May 2007
323KernelVersion: 2.6.22
324Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
325 Christoph Lameter <cl@linux-foundation.org>
326Description:
327 The file objs_per_slab is read-only and specifies how many
328 objects may be allocated from a single slab of the order
329 specified in /sys/kernel/slab/cache/order.
330
331What: /sys/kernel/slab/cache/order
332Date: May 2007
333KernelVersion: 2.6.22
334Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
335 Christoph Lameter <cl@linux-foundation.org>
336Description:
337 The order file specifies the page order at which new slabs are
338 allocated. It is writable and can be changed to increase the
339 number of objects per slab. If a slab cannot be allocated
340 because of fragmentation, SLUB will retry with the minimum order
341 possible depending on its characteristics.
342
343What: /sys/kernel/slab/cache/order_fallback
344Date: April 2008
345KernelVersion: 2.6.26
346Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
347 Christoph Lameter <cl@linux-foundation.org>
348Description:
349 The file order_fallback is read-only and specifies how many
350 times an allocation of a new slab has not been possible at the
351 cache's order and instead fallen back to its minimum possible
352 order.
353 Available when CONFIG_SLUB_STATS is enabled.
354
355What: /sys/kernel/slab/cache/partial
356Date: May 2007
357KernelVersion: 2.6.22
358Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
359 Christoph Lameter <cl@linux-foundation.org>
360Description:
361 The partial file is read-only and displays how many
362 partial slabs there are and how long each node's list is.
363
364What: /sys/kernel/slab/cache/poison
365Date: May 2007
366KernelVersion: 2.6.22
367Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
368 Christoph Lameter <cl@linux-foundation.org>
369Description:
370 The poison file specifies whether objects should be poisoned
371 when a new slab is allocated.
372
373What: /sys/kernel/slab/cache/reclaim_account
374Date: May 2007
375KernelVersion: 2.6.22
376Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
377 Christoph Lameter <cl@linux-foundation.org>
378Description:
379 The reclaim_account file specifies whether the cache's objects
380 are reclaimable (and grouped by their mobility).
381
382What: /sys/kernel/slab/cache/red_zone
383Date: May 2007
384KernelVersion: 2.6.22
385Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
386 Christoph Lameter <cl@linux-foundation.org>
387Description:
388 The red_zone file specifies whether the cache's objects are red
389 zoned.
390
391What: /sys/kernel/slab/cache/remote_node_defrag_ratio
392Date: January 2008
393KernelVersion: 2.6.25
394Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
395 Christoph Lameter <cl@linux-foundation.org>
396Description:
397 The file remote_node_defrag_ratio specifies the percentage of
398 times SLUB will attempt to refill the cpu slab with a partial
399 slab from a remote node as opposed to allocating a new slab on
400 the local node. This reduces the amount of wasted memory over
401 the entire system but can be expensive.
402 Available when CONFIG_NUMA is enabled.
403
404What: /sys/kernel/slab/cache/sanity_checks
405Date: May 2007
406KernelVersion: 2.6.22
407Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
408 Christoph Lameter <cl@linux-foundation.org>
409Description:
410 The sanity_checks file specifies whether expensive checks
411 should be performed on free and, at minimum, enables double free
412 checks. Caches that enable sanity_checks cannot be merged with
413 caches that do not.
414
415What: /sys/kernel/slab/cache/shrink
416Date: May 2007
417KernelVersion: 2.6.22
418Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
419 Christoph Lameter <cl@linux-foundation.org>
420Description:
421 The shrink file is written when memory should be reclaimed from
422 a cache. Empty partial slabs are freed and the partial list is
423 sorted so the slabs with the fewest available objects are used
424 first.
425
426What: /sys/kernel/slab/cache/slab_size
427Date: May 2007
428KernelVersion: 2.6.22
429Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
430 Christoph Lameter <cl@linux-foundation.org>
431Description:
432 The slab_size file is read-only and specifies the object size
433 with metadata (debugging information and alignment) in bytes.
434
435What: /sys/kernel/slab/cache/slabs
436Date: May 2007
437KernelVersion: 2.6.22
438Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
439 Christoph Lameter <cl@linux-foundation.org>
440Description:
441 The slabs file is read-only and displays how many slabs
442 there are (both cpu and partial) and which nodes they are
443 from.
444
445What: /sys/kernel/slab/cache/store_user
446Date: May 2007
447KernelVersion: 2.6.22
448Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
449 Christoph Lameter <cl@linux-foundation.org>
450Description:
451 The store_user file specifies whether the location of
452 allocation or free should be tracked for a cache.
453
454What: /sys/kernel/slab/cache/total_objects
455Date: April 2008
456KernelVersion: 2.6.26
457Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
458 Christoph Lameter <cl@linux-foundation.org>
459Description:
460 The total_objects file is read-only and displays how many total
461 objects a cache has and which nodes they are from.
462
463What: /sys/kernel/slab/cache/trace
464Date: May 2007
465KernelVersion: 2.6.22
466Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
467 Christoph Lameter <cl@linux-foundation.org>
468Description:
469 The trace file specifies whether object allocations and frees
470 should be traced.
471
472What: /sys/kernel/slab/cache/validate
473Date: May 2007
474KernelVersion: 2.6.22
475Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
476 Christoph Lameter <cl@linux-foundation.org>
477Description:
478 Writing to the validate file causes SLUB to traverse all of its
479 cache's objects and check the validity of metadata.
diff --git a/Documentation/Changes b/Documentation/Changes
index b95082be4d5e..664392481c84 100644
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -29,7 +29,7 @@ hardware, for example, you probably needn't concern yourself with
29isdn4k-utils. 29isdn4k-utils.
30 30
31o Gnu C 3.2 # gcc --version 31o Gnu C 3.2 # gcc --version
32o Gnu make 3.79.1 # make --version 32o Gnu make 3.80 # make --version
33o binutils 2.12 # ld -v 33o binutils 2.12 # ld -v
34o util-linux 2.10o # fdformat --version 34o util-linux 2.10o # fdformat --version
35o module-init-tools 0.9.10 # depmod -V 35o module-init-tools 0.9.10 # depmod -V
@@ -48,6 +48,7 @@ o procps 3.2.0 # ps --version
48o oprofile 0.9 # oprofiled --version 48o oprofile 0.9 # oprofiled --version
49o udev 081 # udevinfo -V 49o udev 081 # udevinfo -V
50o grub 0.93 # grub --version 50o grub 0.93 # grub --version
51o mcelog 0.6
51 52
52Kernel compilation 53Kernel compilation
53================== 54==================
@@ -61,7 +62,7 @@ computer.
61Make 62Make
62---- 63----
63 64
64You will need Gnu make 3.79.1 or later to build the kernel. 65You will need Gnu make 3.80 or later to build the kernel.
65 66
66Binutils 67Binutils
67-------- 68--------
@@ -276,6 +277,16 @@ before running exportfs or mountd. It is recommended that all NFS
276services be protected from the internet-at-large by a firewall where 277services be protected from the internet-at-large by a firewall where
277that is possible. 278that is possible.
278 279
280mcelog
281------
282
283In Linux 2.6.31+ the i386 kernel needs to run the mcelog utility
284as a regular cronjob similar to the x86-64 kernel to process and log
285machine check events when CONFIG_X86_NEW_MCE is enabled. Machine check
286events are errors reported by the CPU. Processing them is strongly encouraged.
287All x86-64 kernels since 2.6.4 require the mcelog utility to
288process machine checks.
289
279Getting updated software 290Getting updated software
280======================== 291========================
281 292
@@ -365,6 +376,10 @@ FUSE
365---- 376----
366o <http://sourceforge.net/projects/fuse> 377o <http://sourceforge.net/projects/fuse>
367 378
379mcelog
380------
381o <ftp://ftp.kernel.org/pub/linux/utils/cpu/mce/mcelog/>
382
368Networking 383Networking
369********** 384**********
370 385
diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle
index 72968cd5eaf3..8bb37237ebd2 100644
--- a/Documentation/CodingStyle
+++ b/Documentation/CodingStyle
@@ -698,8 +698,8 @@ very often is not. Abundant use of the inline keyword leads to a much bigger
698kernel, which in turn slows the system as a whole down, due to a bigger 698kernel, which in turn slows the system as a whole down, due to a bigger
699icache footprint for the CPU and simply because there is less memory 699icache footprint for the CPU and simply because there is less memory
700available for the pagecache. Just think about it; a pagecache miss causes a 700available for the pagecache. Just think about it; a pagecache miss causes a
701disk seek, which easily takes 5 miliseconds. There are a LOT of cpu cycles 701disk seek, which easily takes 5 milliseconds. There are a LOT of cpu cycles
702that can go into these 5 miliseconds. 702that can go into these 5 milliseconds.
703 703
704A reasonable rule of thumb is to not put inline at functions that have more 704A reasonable rule of thumb is to not put inline at functions that have more
705than 3 lines of code in them. An exception to this rule are the cases where 705than 3 lines of code in them. An exception to this rule are the cases where
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index d9aa43d78bcc..5aceb88b3f8b 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -676,8 +676,8 @@ this directory the following files can currently be found:
676 dma-api/all_errors This file contains a numeric value. If this 676 dma-api/all_errors This file contains a numeric value. If this
677 value is not equal to zero the debugging code 677 value is not equal to zero the debugging code
678 will print a warning for every error it finds 678 will print a warning for every error it finds
679 into the kernel log. Be carefull with this 679 into the kernel log. Be careful with this
680 option. It can easily flood your logs. 680 option, as it can easily flood your logs.
681 681
682 dma-api/disabled This read-only file contains the character 'Y' 682 dma-api/disabled This read-only file contains the character 'Y'
683 if the debugging code is disabled. This can 683 if the debugging code is disabled. This can
@@ -704,12 +704,24 @@ this directory the following files can currently be found:
704 The current number of free dma_debug_entries 704 The current number of free dma_debug_entries
705 in the allocator. 705 in the allocator.
706 706
707 dma-api/driver-filter
708 You can write a name of a driver into this file
709 to limit the debug output to requests from that
710 particular driver. Write an empty string to
711 that file to disable the filter and see
712 all errors again.
713
707If you have this code compiled into your kernel it will be enabled by default. 714If you have this code compiled into your kernel it will be enabled by default.
708If you want to boot without the bookkeeping anyway you can provide 715If you want to boot without the bookkeeping anyway you can provide
709'dma_debug=off' as a boot parameter. This will disable DMA-API debugging. 716'dma_debug=off' as a boot parameter. This will disable DMA-API debugging.
710Notice that you can not enable it again at runtime. You have to reboot to do 717Notice that you can not enable it again at runtime. You have to reboot to do
711so. 718so.
712 719
720If you want to see debug messages only for a special device driver you can
721specify the dma_debug_driver=<drivername> parameter. This will enable the
722driver filter at boot time. The debug code will only print errors for that
723driver afterwards. This filter can be disabled or changed later using debugfs.
724
713When the code disables itself at runtime this is most likely because it ran 725When the code disables itself at runtime this is most likely because it ran
714out of dma_debug_entries. These entries are preallocated at boot. The number 726out of dma_debug_entries. These entries are preallocated at boot. The number
715of preallocated entries is defined per architecture. If it is too low for you 727of preallocated entries is defined per architecture. If it is too low for you
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index b1eb661e6302..9632444f6c62 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -13,7 +13,8 @@ DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \
13 gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ 13 gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
14 genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ 14 genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \
15 mac80211.xml debugobjects.xml sh.xml regulator.xml \ 15 mac80211.xml debugobjects.xml sh.xml regulator.xml \
16 alsa-driver-api.xml writing-an-alsa-driver.xml 16 alsa-driver-api.xml writing-an-alsa-driver.xml \
17 tracepoint.xml
17 18
18### 19###
19# The build process is as follows (targets): 20# The build process is as follows (targets):
diff --git a/Documentation/DocBook/debugobjects.tmpl b/Documentation/DocBook/debugobjects.tmpl
index 7f5f218015fe..08ff908aa7a2 100644
--- a/Documentation/DocBook/debugobjects.tmpl
+++ b/Documentation/DocBook/debugobjects.tmpl
@@ -106,7 +106,7 @@
106 number of errors are printk'ed including a full stack trace. 106 number of errors are printk'ed including a full stack trace.
107 </para> 107 </para>
108 <para> 108 <para>
109 The statistics are available via debugfs/debug_objects/stats. 109 The statistics are available via /sys/kernel/debug/debug_objects/stats.
110 They provide information about the number of warnings and the 110 They provide information about the number of warnings and the
111 number of successful fixups along with information about the 111 number of successful fixups along with information about the
112 usage of the internal tracking objects and the state of the 112 usage of the internal tracking objects and the state of the
diff --git a/Documentation/DocBook/mac80211.tmpl b/Documentation/DocBook/mac80211.tmpl
index fbeaffc1dcc3..e36986663570 100644
--- a/Documentation/DocBook/mac80211.tmpl
+++ b/Documentation/DocBook/mac80211.tmpl
@@ -145,7 +145,6 @@ usage should require reading the full document.
145 interface in STA mode at first! 145 interface in STA mode at first!
146 </para> 146 </para>
147!Finclude/net/mac80211.h ieee80211_if_init_conf 147!Finclude/net/mac80211.h ieee80211_if_init_conf
148!Finclude/net/mac80211.h ieee80211_if_conf
149 </chapter> 148 </chapter>
150 149
151 <chapter id="rx-tx"> 150 <chapter id="rx-tx">
diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl
new file mode 100644
index 000000000000..b0756d0fd579
--- /dev/null
+++ b/Documentation/DocBook/tracepoint.tmpl
@@ -0,0 +1,89 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="Tracepoints">
6 <bookinfo>
7 <title>The Linux Kernel Tracepoint API</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Jason</firstname>
12 <surname>Baron</surname>
13 <affiliation>
14 <address>
15 <email>jbaron@redhat.com</email>
16 </address>
17 </affiliation>
18 </author>
19 </authorgroup>
20
21 <legalnotice>
22 <para>
23 This documentation is free software; you can redistribute
24 it and/or modify it under the terms of the GNU General Public
25 License as published by the Free Software Foundation; either
26 version 2 of the License, or (at your option) any later
27 version.
28 </para>
29
30 <para>
31 This program is distributed in the hope that it will be
32 useful, but WITHOUT ANY WARRANTY; without even the implied
33 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
34 See the GNU General Public License for more details.
35 </para>
36
37 <para>
38 You should have received a copy of the GNU General Public
39 License along with this program; if not, write to the Free
40 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
41 MA 02111-1307 USA
42 </para>
43
44 <para>
45 For more details see the file COPYING in the source
46 distribution of Linux.
47 </para>
48 </legalnotice>
49 </bookinfo>
50
51 <toc></toc>
52 <chapter id="intro">
53 <title>Introduction</title>
54 <para>
55 Tracepoints are static probe points that are located in strategic points
56 throughout the kernel. 'Probes' register/unregister with tracepoints
57 via a callback mechanism. The 'probes' are strictly typed functions that
58 are passed a unique set of parameters defined by each tracepoint.
59 </para>
60
61 <para>
62 From this simple callback mechanism, 'probes' can be used to profile, debug,
63 and understand kernel behavior. There are a number of tools that provide a
64 framework for using 'probes'. These tools include Systemtap, ftrace, and
65 LTTng.
66 </para>
67
68 <para>
69 Tracepoints are defined in a number of header files via various macros. Thus,
70 the purpose of this document is to provide a clear accounting of the available
71 tracepoints. The intention is to understand not only what tracepoints are
72 available but also to understand where future tracepoints might be added.
73 </para>
74
75 <para>
76 The API presented has functions of the form:
77 <function>trace_tracepointname(function parameters)</function>. These are the
78 tracepoints callbacks that are found throughout the code. Registering and
79 unregistering probes with these callback sites is covered in the
80 <filename>Documentation/trace/*</filename> directory.
81 </para>
82 </chapter>
83
84 <chapter id="irq">
85 <title>IRQ</title>
86!Iinclude/trace/events/irq.h
87 </chapter>
88
89</book>
diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt
index 6389dec33459..93cb28d05dcd 100644
--- a/Documentation/RCU/rculist_nulls.txt
+++ b/Documentation/RCU/rculist_nulls.txt
@@ -118,7 +118,7 @@ to another chain) checking the final 'nulls' value if
118the lookup met the end of chain. If final 'nulls' value 118the lookup met the end of chain. If final 'nulls' value
119is not the slot number, then we must restart the lookup at 119is not the slot number, then we must restart the lookup at
120the beginning. If the object was moved to the same chain, 120the beginning. If the object was moved to the same chain,
121then the reader doesnt care : It might eventually 121then the reader doesn't care : It might eventually
122scan the list again without harm. 122scan the list again without harm.
123 123
124 124
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 068848240a8b..02cced183b2d 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -192,23 +192,24 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy).
192The output of "cat rcu/rcudata" looks as follows: 192The output of "cat rcu/rcudata" looks as follows:
193 193
194rcu: 194rcu:
195 0 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=1 rp=3c2a dt=23301/73 dn=2 df=1882 of=0 ri=2126 ql=2 b=10 195rcu:
196 1 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=3 rp=39a6 dt=78073/1 dn=2 df=1402 of=0 ri=1875 ql=46 b=10 196 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10
197 2 c=4010 g=4010 pq=1 pqc=4010 qp=0 rpfq=-5 rp=1d12 dt=16646/0 dn=2 df=3140 of=0 ri=2080 ql=0 b=10 197 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10
198 3 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=2b50 dt=21159/1 dn=2 df=2230 of=0 ri=1923 ql=72 b=10 198 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10
199 4 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1644 dt=5783/1 dn=2 df=3348 of=0 ri=2805 ql=7 b=10 199 3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1 dn=0 df=1545 of=0 ri=0 ql=0 b=10
200 5 c=4012 g=4013 pq=0 pqc=4011 qp=1 rpfq=3 rp=1aac dt=5879/1 dn=2 df=3140 of=0 ri=2066 ql=10 b=10 200 4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1 dn=0 df=1992 of=0 ri=0 ql=0 b=10
201 6 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=ed8 dt=5847/1 dn=2 df=3797 of=0 ri=1266 ql=10 b=10 201 5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1 dn=0 df=3331 of=0 ri=4 ql=2 b=10
202 7 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1fa2 dt=6199/1 dn=2 df=2795 of=0 ri=2162 ql=28 b=10 202 6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1 dn=0 df=3224 of=0 ri=0 ql=0 b=10
203 7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1 dn=0 df=1818 of=0 ri=0 ql=2 b=10
203rcu_bh: 204rcu_bh:
204 0 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-145 rp=21d6 dt=23301/73 dn=2 df=0 of=0 ri=0 ql=0 b=10 205 0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1 dn=0 df=0 of=0 ri=0 ql=0 b=10
205 1 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-170 rp=20ce dt=78073/1 dn=2 df=26 of=0 ri=5 ql=0 b=10 206 1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1 dn=0 df=13 of=0 ri=0 ql=0 b=10
206 2 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-83 rp=fbd dt=16646/0 dn=2 df=28 of=0 ri=4 ql=0 b=10 207 2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
207 3 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-105 rp=178c dt=21159/1 dn=2 df=28 of=0 ri=2 ql=0 b=10 208 3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1 dn=0 df=9 of=0 ri=0 ql=0 b=10
208 4 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-30 rp=b54 dt=5783/1 dn=2 df=32 of=0 ri=0 ql=0 b=10 209 4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
209 5 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-29 rp=df5 dt=5879/1 dn=2 df=30 of=0 ri=3 ql=0 b=10 210 5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
210 6 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-28 rp=788 dt=5847/1 dn=2 df=32 of=0 ri=0 ql=0 b=10 211 6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
211 7 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-53 rp=1098 dt=6199/1 dn=2 df=30 of=0 ri=3 ql=0 b=10 212 7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
212 213
213The first section lists the rcu_data structures for rcu, the second for 214The first section lists the rcu_data structures for rcu, the second for
214rcu_bh. Each section has one line per CPU, or eight for this 8-CPU system. 215rcu_bh. Each section has one line per CPU, or eight for this 8-CPU system.
@@ -253,12 +254,6 @@ o "pqc" indicates which grace period the last-observed quiescent
253o "qp" indicates that RCU still expects a quiescent state from 254o "qp" indicates that RCU still expects a quiescent state from
254 this CPU. 255 this CPU.
255 256
256o "rpfq" is the number of rcu_pending() calls on this CPU required
257 to induce this CPU to invoke force_quiescent_state().
258
259o "rp" is low-order four hex digits of the count of how many times
260 rcu_pending() has been invoked on this CPU.
261
262o "dt" is the current value of the dyntick counter that is incremented 257o "dt" is the current value of the dyntick counter that is incremented
263 when entering or leaving dynticks idle state, either by the 258 when entering or leaving dynticks idle state, either by the
264 scheduler or by irq. The number after the "/" is the interrupt 259 scheduler or by irq. The number after the "/" is the interrupt
@@ -305,6 +300,9 @@ o "b" is the batch limit for this CPU. If more than this number
305 of RCU callbacks is ready to invoke, then the remainder will 300 of RCU callbacks is ready to invoke, then the remainder will
306 be deferred. 301 be deferred.
307 302
303There is also an rcu/rcudata.csv file with the same information in
304comma-separated-variable spreadsheet format.
305
308 306
309The output of "cat rcu/rcugp" looks as follows: 307The output of "cat rcu/rcugp" looks as follows:
310 308
@@ -411,3 +409,63 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
411 For example, the first entry at the lowest level shows 409 For example, the first entry at the lowest level shows
412 "^0", indicating that it corresponds to bit zero in 410 "^0", indicating that it corresponds to bit zero in
413 the first entry at the middle level. 411 the first entry at the middle level.
412
413
414The output of "cat rcu/rcu_pending" looks as follows:
415
416rcu:
417 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
418 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
419 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
420 3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723
421 4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110
422 5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456
423 6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834
424 7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888
425rcu_bh:
426 0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314
427 1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180
428 2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936
429 3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863
430 4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671
431 5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235
432 6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921
433 7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542
434
435As always, this is once again split into "rcu" and "rcu_bh" portions.
436The fields are as follows:
437
438o "np" is the number of times that __rcu_pending() has been invoked
439 for the corresponding flavor of RCU.
440
441o "qsp" is the number of times that the RCU was waiting for a
442 quiescent state from this CPU.
443
444o "cbr" is the number of times that this CPU had RCU callbacks
445 that had passed through a grace period, and were thus ready
446 to be invoked.
447
448o "cng" is the number of times that this CPU needed another
449 grace period while RCU was idle.
450
451o "gpc" is the number of times that an old grace period had
452 completed, but this CPU was not yet aware of it.
453
454o "gps" is the number of times that a new grace period had started,
455 but this CPU was not yet aware of it.
456
457o "nf" is the number of times that this CPU suspected that the
458 current grace period had run for too long, and thus needed to
459 be forced.
460
461 Please note that "forcing" consists of sending resched IPIs
462 to holdout CPUs. If that CPU really still is in an old RCU
463 read-side critical section, then we really do have to wait for it.
464 The assumption behind "forcing" is that the CPU is not still in
465 an old RCU read-side critical section, but has not yet responded
466 for some other reason.
467
468o "nn" is the number of times that this CPU needed nothing. Alert
469 readers will note that the rcu "nn" number for a given CPU very
470 closely matches the rcu_bh "np" number for that same CPU. This
471 is due to short-circuit evaluation in rcu_pending().
diff --git a/Documentation/SM501.txt b/Documentation/SM501.txt
index 6fc656035925..561826f82093 100644
--- a/Documentation/SM501.txt
+++ b/Documentation/SM501.txt
@@ -5,7 +5,7 @@ Copyright 2006, 2007 Simtec Electronics
5 5
6The Silicon Motion SM501 multimedia companion chip is a multifunction device 6The Silicon Motion SM501 multimedia companion chip is a multifunction device
7which may provide numerous interfaces including USB host controller USB gadget, 7which may provide numerous interfaces including USB host controller USB gadget,
8Asyncronous Serial ports, Audio functions and a dual display video interface. 8asynchronous serial ports, audio functions, and a dual display video interface.
9The device may be connected by PCI or local bus with varying functions enabled. 9The device may be connected by PCI or local bus with varying functions enabled.
10 10
11Core 11Core
diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt
index 629c92e99783..34614b4c708e 100644
--- a/Documentation/Smack.txt
+++ b/Documentation/Smack.txt
@@ -184,8 +184,9 @@ length. Single character labels using special characters, that being anything
184other than a letter or digit, are reserved for use by the Smack development 184other than a letter or digit, are reserved for use by the Smack development
185team. Smack labels are unstructured, case sensitive, and the only operation 185team. Smack labels are unstructured, case sensitive, and the only operation
186ever performed on them is comparison for equality. Smack labels cannot 186ever performed on them is comparison for equality. Smack labels cannot
187contain unprintable characters or the "/" (slash) character. Smack labels 187contain unprintable characters, the "/" (slash), the "\" (backslash), the "'"
188cannot begin with a '-', which is reserved for special options. 188(quote) and '"' (double-quote) characters.
189Smack labels cannot begin with a '-', which is reserved for special options.
189 190
190There are some predefined labels: 191There are some predefined labels:
191 192
@@ -523,3 +524,18 @@ Smack supports some mount options:
523 524
524These mount options apply to all file system types. 525These mount options apply to all file system types.
525 526
527Smack auditing
528
529If you want Smack auditing of security events, you need to set CONFIG_AUDIT
530in your kernel configuration.
531By default, all denied events will be audited. You can change this behavior by
532writing a single character to the /smack/logging file :
5330 : no logging
5341 : log denied (default)
5352 : log accepted
5363 : log denied & accepted
537
538Events are logged as 'key=value' pairs, for each event you at least will get
539the subjet, the object, the rights requested, the action, the kernel function
540that triggered the event, plus other pairs depending on the type of event
541audited.
diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index f309d3c6221c..5c555a8b39e5 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -91,6 +91,10 @@ Be as specific as possible. The WORST descriptions possible include
91things like "update driver X", "bug fix for driver X", or "this patch 91things like "update driver X", "bug fix for driver X", or "this patch
92includes updates for subsystem X. Please apply." 92includes updates for subsystem X. Please apply."
93 93
94The maintainer will thank you if you write your patch description in a
95form which can be easily pulled into Linux's source code management
96system, git, as a "commit log". See #15, below.
97
94If your description starts to get long, that's a sign that you probably 98If your description starts to get long, that's a sign that you probably
95need to split up your patch. See #3, next. 99need to split up your patch. See #3, next.
96 100
@@ -183,8 +187,9 @@ Even if the maintainer did not respond in step #4, make sure to ALWAYS
183copy the maintainer when you change their code. 187copy the maintainer when you change their code.
184 188
185For small patches you may want to CC the Trivial Patch Monkey 189For small patches you may want to CC the Trivial Patch Monkey
186trivial@kernel.org managed by Jesper Juhl; which collects "trivial" 190trivial@kernel.org which collects "trivial" patches. Have a look
187patches. Trivial patches must qualify for one of the following rules: 191into the MAINTAINERS file for its current manager.
192Trivial patches must qualify for one of the following rules:
188 Spelling fixes in documentation 193 Spelling fixes in documentation
189 Spelling fixes which could break grep(1) 194 Spelling fixes which could break grep(1)
190 Warning fixes (cluttering with useless warnings is bad) 195 Warning fixes (cluttering with useless warnings is bad)
@@ -196,7 +201,6 @@ patches. Trivial patches must qualify for one of the following rules:
196 since people copy, as long as it's trivial) 201 since people copy, as long as it's trivial)
197 Any fix by the author/maintainer of the file (ie. patch monkey 202 Any fix by the author/maintainer of the file (ie. patch monkey
198 in re-transmission mode) 203 in re-transmission mode)
199URL: <http://www.kernel.org/pub/linux/kernel/people/juhl/trivial/>
200 204
201 205
202 206
@@ -405,7 +409,14 @@ person it names. This tag documents that potentially interested parties
405have been included in the discussion 409have been included in the discussion
406 410
407 411
40814) Using Tested-by: and Reviewed-by: 41214) Using Reported-by:, Tested-by: and Reviewed-by:
413
414If this patch fixes a problem reported by somebody else, consider adding a
415Reported-by: tag to credit the reporter for their contribution. Please
416note that this tag should not be added without the reporter's permission,
417especially if the problem was not reported in a public forum. That said,
418if we diligently credit our bug reporters, they will, hopefully, be
419inspired to help us again in the future.
409 420
410A Tested-by: tag indicates that the patch has been successfully tested (in 421A Tested-by: tag indicates that the patch has been successfully tested (in
411some environment) by the person named. This tag informs maintainers that 422some environment) by the person named. This tag informs maintainers that
@@ -444,7 +455,7 @@ offer a Reviewed-by tag for a patch. This tag serves to give credit to
444reviewers and to inform maintainers of the degree of review which has been 455reviewers and to inform maintainers of the degree of review which has been
445done on the patch. Reviewed-by: tags, when supplied by reviewers known to 456done on the patch. Reviewed-by: tags, when supplied by reviewers known to
446understand the subject area and to perform thorough reviews, will normally 457understand the subject area and to perform thorough reviews, will normally
447increase the liklihood of your patch getting into the kernel. 458increase the likelihood of your patch getting into the kernel.
448 459
449 460
45015) The canonical patch format 46115) The canonical patch format
@@ -485,12 +496,33 @@ phrase" should not be a filename. Do not use the same "summary
485phrase" for every patch in a whole patch series (where a "patch 496phrase" for every patch in a whole patch series (where a "patch
486series" is an ordered sequence of multiple, related patches). 497series" is an ordered sequence of multiple, related patches).
487 498
488Bear in mind that the "summary phrase" of your email becomes 499Bear in mind that the "summary phrase" of your email becomes a
489a globally-unique identifier for that patch. It propagates 500globally-unique identifier for that patch. It propagates all the way
490all the way into the git changelog. The "summary phrase" may 501into the git changelog. The "summary phrase" may later be used in
491later be used in developer discussions which refer to the patch. 502developer discussions which refer to the patch. People will want to
492People will want to google for the "summary phrase" to read 503google for the "summary phrase" to read discussion regarding that
493discussion regarding that patch. 504patch. It will also be the only thing that people may quickly see
505when, two or three months later, they are going through perhaps
506thousands of patches using tools such as "gitk" or "git log
507--oneline".
508
509For these reasons, the "summary" must be no more than 70-75
510characters, and it must describe both what the patch changes, as well
511as why the patch might be necessary. It is challenging to be both
512succinct and descriptive, but that is what a well-written summary
513should do.
514
515The "summary phrase" may be prefixed by tags enclosed in square
516brackets: "Subject: [PATCH tag] <summary phrase>". The tags are not
517considered part of the summary phrase, but describe how the patch
518should be treated. Common tags might include a version descriptor if
519the multiple versions of the patch have been sent out in response to
520comments (i.e., "v1, v2, v3"), or "RFC" to indicate a request for
521comments. If there are four patches in a patch series the individual
522patches may be numbered like this: 1/4, 2/4, 3/4, 4/4. This assures
523that developers understand the order in which the patches should be
524applied and that they have reviewed or applied all of the patches in
525the patch series.
494 526
495A couple of example Subjects: 527A couple of example Subjects:
496 528
@@ -510,19 +542,31 @@ the patch author in the changelog.
510The explanation body will be committed to the permanent source 542The explanation body will be committed to the permanent source
511changelog, so should make sense to a competent reader who has long 543changelog, so should make sense to a competent reader who has long
512since forgotten the immediate details of the discussion that might 544since forgotten the immediate details of the discussion that might
513have led to this patch. 545have led to this patch. Including symptoms of the failure which the
546patch addresses (kernel log messages, oops messages, etc.) is
547especially useful for people who might be searching the commit logs
548looking for the applicable patch. If a patch fixes a compile failure,
549it may not be necessary to include _all_ of the compile failures; just
550enough that it is likely that someone searching for the patch can find
551it. As in the "summary phrase", it is important to be both succinct as
552well as descriptive.
514 553
515The "---" marker line serves the essential purpose of marking for patch 554The "---" marker line serves the essential purpose of marking for patch
516handling tools where the changelog message ends. 555handling tools where the changelog message ends.
517 556
518One good use for the additional comments after the "---" marker is for 557One good use for the additional comments after the "---" marker is for
519a diffstat, to show what files have changed, and the number of inserted 558a diffstat, to show what files have changed, and the number of
520and deleted lines per file. A diffstat is especially useful on bigger 559inserted and deleted lines per file. A diffstat is especially useful
521patches. Other comments relevant only to the moment or the maintainer, 560on bigger patches. Other comments relevant only to the moment or the
522not suitable for the permanent changelog, should also go here. 561maintainer, not suitable for the permanent changelog, should also go
523Use diffstat options "-p 1 -w 70" so that filenames are listed from the 562here. A good example of such comments might be "patch changelogs"
524top of the kernel source tree and don't use too much horizontal space 563which describe what has changed between the v1 and v2 version of the
525(easily fit in 80 columns, maybe with some indentation). 564patch.
565
566If you are going to include a diffstat after the "---" marker, please
567use diffstat options "-p 1 -w 70" so that filenames are listed from
568the top of the kernel source tree and don't use too much horizontal
569space (easily fit in 80 columns, maybe with some indentation).
526 570
527See more details on the proper patch format in the following 571See more details on the proper patch format in the following
528references. 572references.
diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c
index 7ea231172c85..aa73e72fd793 100644
--- a/Documentation/accounting/getdelays.c
+++ b/Documentation/accounting/getdelays.c
@@ -246,7 +246,8 @@ void print_ioacct(struct taskstats *t)
246 246
247int main(int argc, char *argv[]) 247int main(int argc, char *argv[])
248{ 248{
249 int c, rc, rep_len, aggr_len, len2, cmd_type; 249 int c, rc, rep_len, aggr_len, len2;
250 int cmd_type = TASKSTATS_CMD_ATTR_UNSPEC;
250 __u16 id; 251 __u16 id;
251 __u32 mypid; 252 __u32 mypid;
252 253
diff --git a/Documentation/arm/Samsung-S3C24XX/GPIO.txt b/Documentation/arm/Samsung-S3C24XX/GPIO.txt
index ea7ccfc4b274..948c8718d967 100644
--- a/Documentation/arm/Samsung-S3C24XX/GPIO.txt
+++ b/Documentation/arm/Samsung-S3C24XX/GPIO.txt
@@ -51,7 +51,7 @@ PIN Numbers
51----------- 51-----------
52 52
53 Each pin has a unique number associated with it in regs-gpio.h, 53 Each pin has a unique number associated with it in regs-gpio.h,
54 eg S3C2410_GPA0 or S3C2410_GPF1. These defines are used to tell 54 eg S3C2410_GPA(0) or S3C2410_GPF(1). These defines are used to tell
55 the GPIO functions which pin is to be used. 55 the GPIO functions which pin is to be used.
56 56
57 57
@@ -65,11 +65,11 @@ Configuring a pin
65 65
66 Eg: 66 Eg:
67 67
68 s3c2410_gpio_cfgpin(S3C2410_GPA0, S3C2410_GPA0_ADDR0); 68 s3c2410_gpio_cfgpin(S3C2410_GPA(0), S3C2410_GPA0_ADDR0);
69 s3c2410_gpio_cfgpin(S3C2410_GPE8, S3C2410_GPE8_SDDAT1); 69 s3c2410_gpio_cfgpin(S3C2410_GPE(8), S3C2410_GPE8_SDDAT1);
70 70
71 which would turn GPA0 into the lowest Address line A0, and set 71 which would turn GPA(0) into the lowest Address line A0, and set
72 GPE8 to be connected to the SDIO/MMC controller's SDDAT1 line. 72 GPE(8) to be connected to the SDIO/MMC controller's SDDAT1 line.
73 73
74 74
75Reading the current configuration 75Reading the current configuration
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index 4ef245010457..396bec3b74ed 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -229,10 +229,10 @@ kernel. It is the use of atomic counters to implement reference
229counting, and it works such that once the counter falls to zero it can 229counting, and it works such that once the counter falls to zero it can
230be guaranteed that no other entity can be accessing the object: 230be guaranteed that no other entity can be accessing the object:
231 231
232static void obj_list_add(struct obj *obj) 232static void obj_list_add(struct obj *obj, struct list_head *head)
233{ 233{
234 obj->active = 1; 234 obj->active = 1;
235 list_add(&obj->list); 235 list_add(&obj->list, head);
236} 236}
237 237
238static void obj_list_del(struct obj *obj) 238static void obj_list_del(struct obj *obj)
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 6fab97ea7e6b..8d2158a1c6aa 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -186,7 +186,7 @@ a virtual address mapping (unlike the earlier scheme of virtual address
186do not have a corresponding kernel virtual address space mapping) and 186do not have a corresponding kernel virtual address space mapping) and
187low-memory pages. 187low-memory pages.
188 188
189Note: Please refer to Documentation/PCI/PCI-DMA-mapping.txt for a discussion 189Note: Please refer to Documentation/DMA-mapping.txt for a discussion
190on PCI high mem DMA aspects and mapping of scatter gather lists, and support 190on PCI high mem DMA aspects and mapping of scatter gather lists, and support
191for 64 bit PCI. 191for 64 bit PCI.
192 192
diff --git a/Documentation/block/deadline-iosched.txt b/Documentation/block/deadline-iosched.txt
index 72576769e0f4..2d82c80322cb 100644
--- a/Documentation/block/deadline-iosched.txt
+++ b/Documentation/block/deadline-iosched.txt
@@ -58,7 +58,7 @@ same criteria as reads.
58front_merges (bool) 58front_merges (bool)
59------------ 59------------
60 60
61Sometimes it happens that a request enters the io scheduler that is contigious 61Sometimes it happens that a request enters the io scheduler that is contiguous
62with a request that is already on the queue. Either it fits in the back of that 62with a request that is already on the queue. Either it fits in the back of that
63request, or it fits at the front. That is called either a back merge candidate 63request, or it fits at the front. That is called either a back merge candidate
64or a front merge candidate. Due to the way files are typically laid out, 64or a front merge candidate. Due to the way files are typically laid out,
diff --git a/Documentation/braille-console.txt b/Documentation/braille-console.txt
index 000b0fbdc105..d0d042c2fd5e 100644
--- a/Documentation/braille-console.txt
+++ b/Documentation/braille-console.txt
@@ -27,7 +27,7 @@ parameter.
27 27
28For simplicity, only one braille console can be enabled, other uses of 28For simplicity, only one braille console can be enabled, other uses of
29console=brl,... will be discarded. Also note that it does not interfere with 29console=brl,... will be discarded. Also note that it does not interfere with
30the console selection mecanism described in serial-console.txt 30the console selection mechanism described in serial-console.txt
31 31
32For now, only the VisioBraille device is supported. 32For now, only the VisioBraille device is supported.
33 33
diff --git a/Documentation/cdrom/packet-writing.txt b/Documentation/cdrom/packet-writing.txt
index cf1f8126991c..1c407778c8b2 100644
--- a/Documentation/cdrom/packet-writing.txt
+++ b/Documentation/cdrom/packet-writing.txt
@@ -117,7 +117,7 @@ Using the pktcdvd debugfs interface
117 117
118To read pktcdvd device infos in human readable form, do: 118To read pktcdvd device infos in human readable form, do:
119 119
120 # cat /debug/pktcdvd/pktcdvd[0-7]/info 120 # cat /sys/kernel/debug/pktcdvd/pktcdvd[0-7]/info
121 121
122For a description of the debugfs interface look into the file: 122For a description of the debugfs interface look into the file:
123 123
diff --git a/Documentation/dell_rbu.txt b/Documentation/dell_rbu.txt
index c11b931f8f98..15174985ad08 100644
--- a/Documentation/dell_rbu.txt
+++ b/Documentation/dell_rbu.txt
@@ -76,9 +76,9 @@ Do the steps below to download the BIOS image.
76 76
77The /sys/class/firmware/dell_rbu/ entries will remain till the following is 77The /sys/class/firmware/dell_rbu/ entries will remain till the following is
78done. 78done.
79echo -1 > /sys/class/firmware/dell_rbu/loading. 79echo -1 > /sys/class/firmware/dell_rbu/loading
80Until this step is completed the driver cannot be unloaded. 80Until this step is completed the driver cannot be unloaded.
81Also echoing either mono ,packet or init in to image_type will free up the 81Also echoing either mono, packet or init in to image_type will free up the
82memory allocated by the driver. 82memory allocated by the driver.
83 83
84If a user by accident executes steps 1 and 3 above without executing step 2; 84If a user by accident executes steps 1 and 3 above without executing step 2;
diff --git a/Documentation/development-process/5.Posting b/Documentation/development-process/5.Posting
index dd48132a74dd..f622c1e9f0f9 100644
--- a/Documentation/development-process/5.Posting
+++ b/Documentation/development-process/5.Posting
@@ -119,7 +119,7 @@ which takes quite a bit of time and thought after the "real work" has been
119done. When done properly, though, it is time well spent. 119done. When done properly, though, it is time well spent.
120 120
121 121
1225.4: PATCH FORMATTING 1225.4: PATCH FORMATTING AND CHANGELOGS
123 123
124So now you have a perfect series of patches for posting, but the work is 124So now you have a perfect series of patches for posting, but the work is
125not done quite yet. Each patch needs to be formatted into a message which 125not done quite yet. Each patch needs to be formatted into a message which
@@ -146,8 +146,33 @@ that end, each patch will be composed of the following:
146 - One or more tag lines, with, at a minimum, one Signed-off-by: line from 146 - One or more tag lines, with, at a minimum, one Signed-off-by: line from
147 the author of the patch. Tags will be described in more detail below. 147 the author of the patch. Tags will be described in more detail below.
148 148
149The above three items should, normally, be the text used when committing 149The items above, together, form the changelog for the patch. Writing good
150the change to a revision control system. They are followed by: 150changelogs is a crucial but often-neglected art; it's worth spending
151another moment discussing this issue. When writing a changelog, you should
152bear in mind that a number of different people will be reading your words.
153These include subsystem maintainers and reviewers who need to decide
154whether the patch should be included, distributors and other maintainers
155trying to decide whether a patch should be backported to other kernels, bug
156hunters wondering whether the patch is responsible for a problem they are
157chasing, users who want to know how the kernel has changed, and more. A
158good changelog conveys the needed information to all of these people in the
159most direct and concise way possible.
160
161To that end, the summary line should describe the effects of and motivation
162for the change as well as possible given the one-line constraint. The
163detailed description can then amplify on those topics and provide any
164needed additional information. If the patch fixes a bug, cite the commit
165which introduced the bug if possible. If a problem is associated with
166specific log or compiler output, include that output to help others
167searching for a solution to the same problem. If the change is meant to
168support other changes coming in a later patch, say so. If internal APIs are
169changed, detail those changes and how other developers should respond. In
170general, the more you can put yourself into the shoes of everybody who will
171be reading your changelog, the better that changelog (and the kernel as a
172whole) will be.
173
174Needless to say, the changelog should be the text used when committing the
175change to a revision control system. It will be followed by:
151 176
152 - The patch itself, in the unified ("-u") patch format. Using the "-p" 177 - The patch itself, in the unified ("-u") patch format. Using the "-p"
153 option to diff will associate function names with changes, making the 178 option to diff will associate function names with changes, making the
diff --git a/Documentation/driver-model/device.txt b/Documentation/driver-model/device.txt
index a7cbfff40d07..a124f3126b0d 100644
--- a/Documentation/driver-model/device.txt
+++ b/Documentation/driver-model/device.txt
@@ -162,3 +162,35 @@ device_remove_file(dev,&dev_attr_power);
162 162
163The file name will be 'power' with a mode of 0644 (-rw-r--r--). 163The file name will be 'power' with a mode of 0644 (-rw-r--r--).
164 164
165Word of warning: While the kernel allows device_create_file() and
166device_remove_file() to be called on a device at any time, userspace has
167strict expectations on when attributes get created. When a new device is
168registered in the kernel, a uevent is generated to notify userspace (like
169udev) that a new device is available. If attributes are added after the
170device is registered, then userspace won't get notified and userspace will
171not know about the new attributes.
172
173This is important for device drivers that need to publish additional
174attributes for a device at driver probe time. If the device driver simply
175calls device_create_file() on the device structure passed to it, then
176userspace will never be notified of the new attributes. Instead, it should
177probably use class_create() and class->dev_attrs to set up a list of
178desired attributes in the module init function, and then, in the .probe()
179hook, use device_create() to create a new device as a child
180of the probed device. The new device will generate a new uevent and
181properly advertise the new attributes to userspace.
182
183For example, if a driver wanted to add the following attributes:
184struct device_attribute mydriver_attribs[] = {
185 __ATTR(port_count, 0444, port_count_show),
186 __ATTR(serial_number, 0444, serial_number_show),
187 NULL
188};
189
190Then in the module init function it would do:
191 mydriver_class = class_create(THIS_MODULE, "my_attrs");
192 mydriver_class->dev_attrs = mydriver_attribs;
193
194And assuming 'dev' is the struct device passed into the probe hook, the driver
195probe function would do something like:
196 device_create(mydriver_class, dev, chrdev, &private_data, "my_name");
diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index 387b8a720f4a..d79aead9418b 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -188,7 +188,7 @@ For example, you can do something like the following.
188 188
189 void my_midlayer_destroy_something() 189 void my_midlayer_destroy_something()
190 { 190 {
191 devres_release_group(dev, my_midlayer_create_soemthing); 191 devres_release_group(dev, my_midlayer_create_something);
192 } 192 }
193 193
194 194
diff --git a/Documentation/dvb/get_dvb_firmware b/Documentation/dvb/get_dvb_firmware
index 2f21ecd4c205..a52adfc9a57f 100644
--- a/Documentation/dvb/get_dvb_firmware
+++ b/Documentation/dvb/get_dvb_firmware
@@ -112,7 +112,7 @@ sub tda10045 {
112 112
113sub tda10046 { 113sub tda10046 {
114 my $sourcefile = "TT_PCI_2.19h_28_11_2006.zip"; 114 my $sourcefile = "TT_PCI_2.19h_28_11_2006.zip";
115 my $url = "http://technotrend-online.com/download/software/219/$sourcefile"; 115 my $url = "http://www.tt-download.com/download/updates/219/$sourcefile";
116 my $hash = "6a7e1e2f2644b162ff0502367553c72d"; 116 my $hash = "6a7e1e2f2644b162ff0502367553c72d";
117 my $outfile = "dvb-fe-tda10046.fw"; 117 my $outfile = "dvb-fe-tda10046.fw";
118 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1); 118 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
@@ -129,8 +129,8 @@ sub tda10046 {
129} 129}
130 130
131sub tda10046lifeview { 131sub tda10046lifeview {
132 my $sourcefile = "Drv_2.11.02.zip"; 132 my $sourcefile = "7%5Cdrv_2.11.02.zip";
133 my $url = "http://www.lifeview.com.tw/drivers/pci_card/FlyDVB-T/$sourcefile"; 133 my $url = "http://www.lifeview.hk/dbimages/document/$sourcefile";
134 my $hash = "1ea24dee4eea8fe971686981f34fd2e0"; 134 my $hash = "1ea24dee4eea8fe971686981f34fd2e0";
135 my $outfile = "dvb-fe-tda10046.fw"; 135 my $outfile = "dvb-fe-tda10046.fw";
136 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1); 136 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
@@ -317,7 +317,7 @@ sub nxt2002 {
317 317
318sub nxt2004 { 318sub nxt2004 {
319 my $sourcefile = "AVerTVHD_MCE_A180_Drv_v1.2.2.16.zip"; 319 my $sourcefile = "AVerTVHD_MCE_A180_Drv_v1.2.2.16.zip";
320 my $url = "http://www.aver.com/support/Drivers/$sourcefile"; 320 my $url = "http://www.avermedia-usa.com/support/Drivers/$sourcefile";
321 my $hash = "111cb885b1e009188346d72acfed024c"; 321 my $hash = "111cb885b1e009188346d72acfed024c";
322 my $outfile = "dvb-fe-nxt2004.fw"; 322 my $outfile = "dvb-fe-nxt2004.fw";
323 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1); 323 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
diff --git a/Documentation/edac.txt b/Documentation/edac.txt
index 8eda3fb66416..06f8f46692dc 100644
--- a/Documentation/edac.txt
+++ b/Documentation/edac.txt
@@ -23,8 +23,8 @@ first time, it was renamed to 'EDAC'.
23The bluesmoke project at sourceforge.net is now utilized as a 'staging area' 23The bluesmoke project at sourceforge.net is now utilized as a 'staging area'
24for EDAC development, before it is sent upstream to kernel.org 24for EDAC development, before it is sent upstream to kernel.org
25 25
26At the bluesmoke/EDAC project site, is a series of quilt patches against 26At the bluesmoke/EDAC project site is a series of quilt patches against
27recent kernels, stored in a SVN respository. For easier downloading, there 27recent kernels, stored in a SVN repository. For easier downloading, there
28is also a tarball snapshot available. 28is also a tarball snapshot available.
29 29
30============================================================================ 30============================================================================
@@ -73,9 +73,9 @@ the vendor should tie the parity status bits to 0 if they do not intend
73to generate parity. Some vendors do not do this, and thus the parity bit 73to generate parity. Some vendors do not do this, and thus the parity bit
74can "float" giving false positives. 74can "float" giving false positives.
75 75
76In the kernel there is a pci device attribute located in sysfs that is 76In the kernel there is a PCI device attribute located in sysfs that is
77checked by the EDAC PCI scanning code. If that attribute is set, 77checked by the EDAC PCI scanning code. If that attribute is set,
78PCI parity/error scannining is skipped for that device. The attribute 78PCI parity/error scanning is skipped for that device. The attribute
79is: 79is:
80 80
81 broken_parity_status 81 broken_parity_status
diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt
index 4bc374a14345..079305640790 100644
--- a/Documentation/fault-injection/fault-injection.txt
+++ b/Documentation/fault-injection/fault-injection.txt
@@ -29,16 +29,16 @@ o debugfs entries
29fault-inject-debugfs kernel module provides some debugfs entries for runtime 29fault-inject-debugfs kernel module provides some debugfs entries for runtime
30configuration of fault-injection capabilities. 30configuration of fault-injection capabilities.
31 31
32- /debug/fail*/probability: 32- /sys/kernel/debug/fail*/probability:
33 33
34 likelihood of failure injection, in percent. 34 likelihood of failure injection, in percent.
35 Format: <percent> 35 Format: <percent>
36 36
37 Note that one-failure-per-hundred is a very high error rate 37 Note that one-failure-per-hundred is a very high error rate
38 for some testcases. Consider setting probability=100 and configure 38 for some testcases. Consider setting probability=100 and configure
39 /debug/fail*/interval for such testcases. 39 /sys/kernel/debug/fail*/interval for such testcases.
40 40
41- /debug/fail*/interval: 41- /sys/kernel/debug/fail*/interval:
42 42
43 specifies the interval between failures, for calls to 43 specifies the interval between failures, for calls to
44 should_fail() that pass all the other tests. 44 should_fail() that pass all the other tests.
@@ -46,18 +46,18 @@ configuration of fault-injection capabilities.
46 Note that if you enable this, by setting interval>1, you will 46 Note that if you enable this, by setting interval>1, you will
47 probably want to set probability=100. 47 probably want to set probability=100.
48 48
49- /debug/fail*/times: 49- /sys/kernel/debug/fail*/times:
50 50
51 specifies how many times failures may happen at most. 51 specifies how many times failures may happen at most.
52 A value of -1 means "no limit". 52 A value of -1 means "no limit".
53 53
54- /debug/fail*/space: 54- /sys/kernel/debug/fail*/space:
55 55
56 specifies an initial resource "budget", decremented by "size" 56 specifies an initial resource "budget", decremented by "size"
57 on each call to should_fail(,size). Failure injection is 57 on each call to should_fail(,size). Failure injection is
58 suppressed until "space" reaches zero. 58 suppressed until "space" reaches zero.
59 59
60- /debug/fail*/verbose 60- /sys/kernel/debug/fail*/verbose
61 61
62 Format: { 0 | 1 | 2 } 62 Format: { 0 | 1 | 2 }
63 specifies the verbosity of the messages when failure is 63 specifies the verbosity of the messages when failure is
@@ -65,17 +65,17 @@ configuration of fault-injection capabilities.
65 log line per failure; '2' will print a call trace too -- useful 65 log line per failure; '2' will print a call trace too -- useful
66 to debug the problems revealed by fault injection. 66 to debug the problems revealed by fault injection.
67 67
68- /debug/fail*/task-filter: 68- /sys/kernel/debug/fail*/task-filter:
69 69
70 Format: { 'Y' | 'N' } 70 Format: { 'Y' | 'N' }
71 A value of 'N' disables filtering by process (default). 71 A value of 'N' disables filtering by process (default).
72 Any positive value limits failures to only processes indicated by 72 Any positive value limits failures to only processes indicated by
73 /proc/<pid>/make-it-fail==1. 73 /proc/<pid>/make-it-fail==1.
74 74
75- /debug/fail*/require-start: 75- /sys/kernel/debug/fail*/require-start:
76- /debug/fail*/require-end: 76- /sys/kernel/debug/fail*/require-end:
77- /debug/fail*/reject-start: 77- /sys/kernel/debug/fail*/reject-start:
78- /debug/fail*/reject-end: 78- /sys/kernel/debug/fail*/reject-end:
79 79
80 specifies the range of virtual addresses tested during 80 specifies the range of virtual addresses tested during
81 stacktrace walking. Failure is injected only if some caller 81 stacktrace walking. Failure is injected only if some caller
@@ -84,26 +84,26 @@ configuration of fault-injection capabilities.
84 Default required range is [0,ULONG_MAX) (whole of virtual address space). 84 Default required range is [0,ULONG_MAX) (whole of virtual address space).
85 Default rejected range is [0,0). 85 Default rejected range is [0,0).
86 86
87- /debug/fail*/stacktrace-depth: 87- /sys/kernel/debug/fail*/stacktrace-depth:
88 88
89 specifies the maximum stacktrace depth walked during search 89 specifies the maximum stacktrace depth walked during search
90 for a caller within [require-start,require-end) OR 90 for a caller within [require-start,require-end) OR
91 [reject-start,reject-end). 91 [reject-start,reject-end).
92 92
93- /debug/fail_page_alloc/ignore-gfp-highmem: 93- /sys/kernel/debug/fail_page_alloc/ignore-gfp-highmem:
94 94
95 Format: { 'Y' | 'N' } 95 Format: { 'Y' | 'N' }
96 default is 'N', setting it to 'Y' won't inject failures into 96 default is 'N', setting it to 'Y' won't inject failures into
97 highmem/user allocations. 97 highmem/user allocations.
98 98
99- /debug/failslab/ignore-gfp-wait: 99- /sys/kernel/debug/failslab/ignore-gfp-wait:
100- /debug/fail_page_alloc/ignore-gfp-wait: 100- /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait:
101 101
102 Format: { 'Y' | 'N' } 102 Format: { 'Y' | 'N' }
103 default is 'N', setting it to 'Y' will inject failures 103 default is 'N', setting it to 'Y' will inject failures
104 only into non-sleep allocations (GFP_ATOMIC allocations). 104 only into non-sleep allocations (GFP_ATOMIC allocations).
105 105
106- /debug/fail_page_alloc/min-order: 106- /sys/kernel/debug/fail_page_alloc/min-order:
107 107
108 specifies the minimum page allocation order to be injected 108 specifies the minimum page allocation order to be injected
109 failures. 109 failures.
@@ -166,13 +166,13 @@ o Inject slab allocation failures into module init/exit code
166#!/bin/bash 166#!/bin/bash
167 167
168FAILTYPE=failslab 168FAILTYPE=failslab
169echo Y > /debug/$FAILTYPE/task-filter 169echo Y > /sys/kernel/debug/$FAILTYPE/task-filter
170echo 10 > /debug/$FAILTYPE/probability 170echo 10 > /sys/kernel/debug/$FAILTYPE/probability
171echo 100 > /debug/$FAILTYPE/interval 171echo 100 > /sys/kernel/debug/$FAILTYPE/interval
172echo -1 > /debug/$FAILTYPE/times 172echo -1 > /sys/kernel/debug/$FAILTYPE/times
173echo 0 > /debug/$FAILTYPE/space 173echo 0 > /sys/kernel/debug/$FAILTYPE/space
174echo 2 > /debug/$FAILTYPE/verbose 174echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
175echo 1 > /debug/$FAILTYPE/ignore-gfp-wait 175echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
176 176
177faulty_system() 177faulty_system()
178{ 178{
@@ -217,20 +217,20 @@ then
217 exit 1 217 exit 1
218fi 218fi
219 219
220cat /sys/module/$module/sections/.text > /debug/$FAILTYPE/require-start 220cat /sys/module/$module/sections/.text > /sys/kernel/debug/$FAILTYPE/require-start
221cat /sys/module/$module/sections/.data > /debug/$FAILTYPE/require-end 221cat /sys/module/$module/sections/.data > /sys/kernel/debug/$FAILTYPE/require-end
222 222
223echo N > /debug/$FAILTYPE/task-filter 223echo N > /sys/kernel/debug/$FAILTYPE/task-filter
224echo 10 > /debug/$FAILTYPE/probability 224echo 10 > /sys/kernel/debug/$FAILTYPE/probability
225echo 100 > /debug/$FAILTYPE/interval 225echo 100 > /sys/kernel/debug/$FAILTYPE/interval
226echo -1 > /debug/$FAILTYPE/times 226echo -1 > /sys/kernel/debug/$FAILTYPE/times
227echo 0 > /debug/$FAILTYPE/space 227echo 0 > /sys/kernel/debug/$FAILTYPE/space
228echo 2 > /debug/$FAILTYPE/verbose 228echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
229echo 1 > /debug/$FAILTYPE/ignore-gfp-wait 229echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
230echo 1 > /debug/$FAILTYPE/ignore-gfp-highmem 230echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-highmem
231echo 10 > /debug/$FAILTYPE/stacktrace-depth 231echo 10 > /sys/kernel/debug/$FAILTYPE/stacktrace-depth
232 232
233trap "echo 0 > /debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT 233trap "echo 0 > /sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT
234 234
235echo "Injecting errors into the module $module... (interrupt to stop)" 235echo "Injecting errors into the module $module... (interrupt to stop)"
236sleep 1000000 236sleep 1000000
diff --git a/Documentation/fb/sh7760fb.txt b/Documentation/fb/sh7760fb.txt
index c87bfe5c630a..b994c3b10549 100644
--- a/Documentation/fb/sh7760fb.txt
+++ b/Documentation/fb/sh7760fb.txt
@@ -1,7 +1,7 @@
1SH7760/SH7763 integrated LCDC Framebuffer driver 1SH7760/SH7763 integrated LCDC Framebuffer driver
2================================================ 2================================================
3 3
40. Overwiew 40. Overview
5----------- 5-----------
6The SH7760/SH7763 have an integrated LCD Display controller (LCDC) which 6The SH7760/SH7763 have an integrated LCD Display controller (LCDC) which
7supports (in theory) resolutions ranging from 1x1 to 1024x1024, 7supports (in theory) resolutions ranging from 1x1 to 1024x1024,
diff --git a/Documentation/fb/vesafb.txt b/Documentation/fb/vesafb.txt
index ee277dd204b0..950d5a658cb3 100644
--- a/Documentation/fb/vesafb.txt
+++ b/Documentation/fb/vesafb.txt
@@ -95,7 +95,7 @@ There is no way to change the vesafb video mode and/or timings after
95booting linux. If you are not happy with the 60 Hz refresh rate, you 95booting linux. If you are not happy with the 60 Hz refresh rate, you
96have these options: 96have these options:
97 97
98 * configure and load the DOS-Tools for your the graphics board (if 98 * configure and load the DOS-Tools for the graphics board (if
99 available) and boot linux with loadlin. 99 available) and boot linux with loadlin.
100 use a native driver (matroxfb/atyfb) instead of vesafb. If none 100 use a native driver (matroxfb/atyfb) instead of vesafb. If none
101 is available, write a new one! 101 is available, write a new one!
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index de491a3e2313..7129846a2785 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -437,3 +437,20 @@ Why: Superseded by tdfxfb. I2C/DDC support used to live in a separate
437 driver but this caused driver conflicts. 437 driver but this caused driver conflicts.
438Who: Jean Delvare <khali@linux-fr.org> 438Who: Jean Delvare <khali@linux-fr.org>
439 Krzysztof Helt <krzysztof.h1@wp.pl> 439 Krzysztof Helt <krzysztof.h1@wp.pl>
440
441---------------------------
442
443What: CONFIG_RFKILL_INPUT
444When: 2.6.33
445Why: Should be implemented in userspace, policy daemon.
446Who: Johannes Berg <johannes@sipsolutions.net>
447
448----------------------------
449
450What: CONFIG_X86_OLD_MCE
451When: 2.6.32
452Why: Remove the old legacy 32bit machine check code. This has been
453 superseded by the newer machine check code from the 64bit port,
454 but the old version has been kept around for easier testing. Note this
455 doesn't impact the old P5 and WinChip machine check handlers.
456Who: Andi Kleen <andi@firstfloor.org>
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 3120f8dd2c31..229d7b7c50a3 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -187,7 +187,7 @@ readpages: no
187write_begin: no locks the page yes 187write_begin: no locks the page yes
188write_end: no yes, unlocks yes 188write_end: no yes, unlocks yes
189perform_write: no n/a yes 189perform_write: no n/a yes
190bmap: yes 190bmap: no
191invalidatepage: no yes 191invalidatepage: no yes
192releasepage: no yes 192releasepage: no yes
193direct_IO: no 193direct_IO: no
diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt
index c6341745df37..8f78ded4b648 100644
--- a/Documentation/filesystems/autofs4-mount-control.txt
+++ b/Documentation/filesystems/autofs4-mount-control.txt
@@ -369,7 +369,7 @@ The call requires an initialized struct autofs_dev_ioctl. There are two
369possible variations. Both use the path field set to the path of the mount 369possible variations. Both use the path field set to the path of the mount
370point to check and the size field adjusted appropriately. One uses the 370point to check and the size field adjusted appropriately. One uses the
371ioctlfd field to identify a specific mount point to check while the other 371ioctlfd field to identify a specific mount point to check while the other
372variation uses the path and optionaly arg1 set to an autofs mount type. 372variation uses the path and optionally arg1 set to an autofs mount type.
373The call returns 1 if this is a mount point and sets arg1 to the device 373The call returns 1 if this is a mount point and sets arg1 to the device
374number of the mount and field arg2 to the relevant super block magic 374number of the mount and field arg2 to the relevant super block magic
375number (described below) or 0 if it isn't a mountpoint. In both cases 375number (described below) or 0 if it isn't a mountpoint. In both cases
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index 4db125b3a5c6..2666b1ed5e9e 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -184,7 +184,7 @@ This has the following fields:
184 have index children. 184 have index children.
185 185
186 If this function is not supplied or if it returns NULL then the first 186 If this function is not supplied or if it returns NULL then the first
187 cache in the parent's list will be chosed, or failing that, the first 187 cache in the parent's list will be chosen, or failing that, the first
188 cache in the master list. 188 cache in the master list.
189 189
190 (4) A function to retrieve an object's key from the netfs [mandatory]. 190 (4) A function to retrieve an object's key from the netfs [mandatory].
diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
new file mode 100644
index 000000000000..ed52af60c2d8
--- /dev/null
+++ b/Documentation/filesystems/debugfs.txt
@@ -0,0 +1,158 @@
1Copyright 2009 Jonathan Corbet <corbet@lwn.net>
2
3Debugfs exists as a simple way for kernel developers to make information
4available to user space. Unlike /proc, which is only meant for information
5about a process, or sysfs, which has strict one-value-per-file rules,
6debugfs has no rules at all. Developers can put any information they want
7there. The debugfs filesystem is also intended to not serve as a stable
8ABI to user space; in theory, there are no stability constraints placed on
9files exported there. The real world is not always so simple, though [1];
10even debugfs interfaces are best designed with the idea that they will need
11to be maintained forever.
12
13Debugfs is typically mounted with a command like:
14
15 mount -t debugfs none /sys/kernel/debug
16
17(Or an equivalent /etc/fstab line).
18
19Note that the debugfs API is exported GPL-only to modules.
20
21Code using debugfs should include <linux/debugfs.h>. Then, the first order
22of business will be to create at least one directory to hold a set of
23debugfs files:
24
25 struct dentry *debugfs_create_dir(const char *name, struct dentry *parent);
26
27This call, if successful, will make a directory called name underneath the
28indicated parent directory. If parent is NULL, the directory will be
29created in the debugfs root. On success, the return value is a struct
30dentry pointer which can be used to create files in the directory (and to
31clean it up at the end). A NULL return value indicates that something went
32wrong. If ERR_PTR(-ENODEV) is returned, that is an indication that the
33kernel has been built without debugfs support and none of the functions
34described below will work.
35
36The most general way to create a file within a debugfs directory is with:
37
38 struct dentry *debugfs_create_file(const char *name, mode_t mode,
39 struct dentry *parent, void *data,
40 const struct file_operations *fops);
41
42Here, name is the name of the file to create, mode describes the access
43permissions the file should have, parent indicates the directory which
44should hold the file, data will be stored in the i_private field of the
45resulting inode structure, and fops is a set of file operations which
46implement the file's behavior. At a minimum, the read() and/or write()
47operations should be provided; others can be included as needed. Again,
48the return value will be a dentry pointer to the created file, NULL for
49error, or ERR_PTR(-ENODEV) if debugfs support is missing.
50
51In a number of cases, the creation of a set of file operations is not
52actually necessary; the debugfs code provides a number of helper functions
53for simple situations. Files containing a single integer value can be
54created with any of:
55
56 struct dentry *debugfs_create_u8(const char *name, mode_t mode,
57 struct dentry *parent, u8 *value);
58 struct dentry *debugfs_create_u16(const char *name, mode_t mode,
59 struct dentry *parent, u16 *value);
60 struct dentry *debugfs_create_u32(const char *name, mode_t mode,
61 struct dentry *parent, u32 *value);
62 struct dentry *debugfs_create_u64(const char *name, mode_t mode,
63 struct dentry *parent, u64 *value);
64
65These files support both reading and writing the given value; if a specific
66file should not be written to, simply set the mode bits accordingly. The
67values in these files are in decimal; if hexadecimal is more appropriate,
68the following functions can be used instead:
69
70 struct dentry *debugfs_create_x8(const char *name, mode_t mode,
71 struct dentry *parent, u8 *value);
72 struct dentry *debugfs_create_x16(const char *name, mode_t mode,
73 struct dentry *parent, u16 *value);
74 struct dentry *debugfs_create_x32(const char *name, mode_t mode,
75 struct dentry *parent, u32 *value);
76
77Note that there is no debugfs_create_x64().
78
79These functions are useful as long as the developer knows the size of the
80value to be exported. Some types can have different widths on different
81architectures, though, complicating the situation somewhat. There is a
82function meant to help out in one special case:
83
84 struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
85 struct dentry *parent,
86 size_t *value);
87
88As might be expected, this function will create a debugfs file to represent
89a variable of type size_t.
90
91Boolean values can be placed in debugfs with:
92
93 struct dentry *debugfs_create_bool(const char *name, mode_t mode,
94 struct dentry *parent, u32 *value);
95
96A read on the resulting file will yield either Y (for non-zero values) or
97N, followed by a newline. If written to, it will accept either upper- or
98lower-case values, or 1 or 0. Any other input will be silently ignored.
99
100Finally, a block of arbitrary binary data can be exported with:
101
102 struct debugfs_blob_wrapper {
103 void *data;
104 unsigned long size;
105 };
106
107 struct dentry *debugfs_create_blob(const char *name, mode_t mode,
108 struct dentry *parent,
109 struct debugfs_blob_wrapper *blob);
110
111A read of this file will return the data pointed to by the
112debugfs_blob_wrapper structure. Some drivers use "blobs" as a simple way
113to return several lines of (static) formatted text output. This function
114can be used to export binary information, but there does not appear to be
115any code which does so in the mainline. Note that all files created with
116debugfs_create_blob() are read-only.
117
118There are a couple of other directory-oriented helper functions:
119
120 struct dentry *debugfs_rename(struct dentry *old_dir,
121 struct dentry *old_dentry,
122 struct dentry *new_dir,
123 const char *new_name);
124
125 struct dentry *debugfs_create_symlink(const char *name,
126 struct dentry *parent,
127 const char *target);
128
129A call to debugfs_rename() will give a new name to an existing debugfs
130file, possibly in a different directory. The new_name must not exist prior
131to the call; the return value is old_dentry with updated information.
132Symbolic links can be created with debugfs_create_symlink().
133
134There is one important thing that all debugfs users must take into account:
135there is no automatic cleanup of any directories created in debugfs. If a
136module is unloaded without explicitly removing debugfs entries, the result
137will be a lot of stale pointers and no end of highly antisocial behavior.
138So all debugfs users - at least those which can be built as modules - must
139be prepared to remove all files and directories they create there. A file
140can be removed with:
141
142 void debugfs_remove(struct dentry *dentry);
143
144The dentry value can be NULL, in which case nothing will be removed.
145
146Once upon a time, debugfs users were required to remember the dentry
147pointer for every debugfs file they created so that all files could be
148cleaned up. We live in more civilized times now, though, and debugfs users
149can call:
150
151 void debugfs_remove_recursive(struct dentry *dentry);
152
153If this function is passed a pointer for the dentry corresponding to the
154top-level directory, the entire hierarchy below that directory will be
155removed.
156
157Notes:
158 [1] http://lwn.net/Articles/309298/
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 97882df04865..608fdba97b72 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -294,7 +294,7 @@ max_batch_time=usec Maximum amount of time ext4 should wait for
294 amount of time (on average) that it takes to 294 amount of time (on average) that it takes to
295 finish committing a transaction. Call this time 295 finish committing a transaction. Call this time
296 the "commit time". If the time that the 296 the "commit time". If the time that the
297 transactoin has been running is less than the 297 transaction has been running is less than the
298 commit time, ext4 will try sleeping for the 298 commit time, ext4 will try sleeping for the
299 commit time to see if other operations will join 299 commit time to see if other operations will join
300 the transaction. The commit time is capped by 300 the transaction. The commit time is capped by
@@ -328,7 +328,7 @@ noauto_da_alloc replacing existing files via patterns such as
328 journal commit, in the default data=ordered 328 journal commit, in the default data=ordered
329 mode, the data blocks of the new file are forced 329 mode, the data blocks of the new file are forced
330 to disk before the rename() operation is 330 to disk before the rename() operation is
331 commited. This provides roughly the same level 331 committed. This provides roughly the same level
332 of guarantees as ext3, and avoids the 332 of guarantees as ext3, and avoids the
333 "zero-length" problem that can happen when a 333 "zero-length" problem that can happen when a
334 system crashes before the delayed allocation 334 system crashes before the delayed allocation
@@ -358,7 +358,7 @@ written to the journal first, and then to its final location.
358In the event of a crash, the journal can be replayed, bringing both data and 358In the event of a crash, the journal can be replayed, bringing both data and
359metadata into a consistent state. This mode is the slowest except when data 359metadata into a consistent state. This mode is the slowest except when data
360needs to be read from and written to disk at the same time where it 360needs to be read from and written to disk at the same time where it
361outperforms all others modes. Curently ext4 does not have delayed 361outperforms all others modes. Currently ext4 does not have delayed
362allocation support if this data journalling mode is selected. 362allocation support if this data journalling mode is selected.
363 363
364References 364References
diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt
index 1e3defcfe50b..606233cd4618 100644
--- a/Documentation/filesystems/fiemap.txt
+++ b/Documentation/filesystems/fiemap.txt
@@ -204,7 +204,7 @@ fiemap_check_flags() helper:
204 204
205int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); 205int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
206 206
207The struct fieinfo should be passed in as recieved from ioctl_fiemap(). The 207The struct fieinfo should be passed in as received from ioctl_fiemap(). The
208set of fiemap flags which the fs understands should be passed via fs_flags. If 208set of fiemap flags which the fs understands should be passed via fs_flags. If
209fiemap_check_flags finds invalid user flags, it will place the bad values in 209fiemap_check_flags finds invalid user flags, it will place the bad values in
210fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from 210fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from
diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.txt
index 4dae9a3840bf..0494f78d87e4 100644
--- a/Documentation/filesystems/gfs2-glocks.txt
+++ b/Documentation/filesystems/gfs2-glocks.txt
@@ -60,7 +60,7 @@ go_lock | Called for the first local holder of a lock
60go_unlock | Called on the final local unlock of a lock 60go_unlock | Called on the final local unlock of a lock
61go_dump | Called to print content of object for debugfs file, or on 61go_dump | Called to print content of object for debugfs file, or on
62 | error to dump glock to the log. 62 | error to dump glock to the log.
63go_type; | The type of the glock, LM_TYPE_..... 63go_type | The type of the glock, LM_TYPE_.....
64go_min_hold_time | The minimum hold time 64go_min_hold_time | The minimum hold time
65 65
66The minimum hold time for each lock is the time after a remote lock 66The minimum hold time for each lock is the time after a remote lock
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
index 593004b6bbab..5e3ab8f3beff 100644
--- a/Documentation/filesystems/gfs2.txt
+++ b/Documentation/filesystems/gfs2.txt
@@ -11,18 +11,15 @@ their I/O so file system consistency is maintained. One of the nifty
11features of GFS is perfect consistency -- changes made to the file system 11features of GFS is perfect consistency -- changes made to the file system
12on one machine show up immediately on all other machines in the cluster. 12on one machine show up immediately on all other machines in the cluster.
13 13
14GFS uses interchangable inter-node locking mechanisms. Different lock 14GFS uses interchangable inter-node locking mechanisms, the currently
15modules can plug into GFS and each file system selects the appropriate 15supported mechanisms are:
16lock module at mount time. Lock modules include:
17 16
18 lock_nolock -- allows gfs to be used as a local file system 17 lock_nolock -- allows gfs to be used as a local file system
19 18
20 lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking 19 lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking
21 The dlm is found at linux/fs/dlm/ 20 The dlm is found at linux/fs/dlm/
22 21
23In addition to interfacing with an external locking manager, a gfs lock 22Lock_dlm depends on user space cluster management systems found
24module is responsible for interacting with external cluster management
25systems. Lock_dlm depends on user space cluster management systems found
26at the URL above. 23at the URL above.
27 24
28To use gfs as a local file system, no external clustering systems are 25To use gfs as a local file system, no external clustering systems are
@@ -31,13 +28,19 @@ needed, simply:
31 $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device 28 $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device
32 $ mount -t gfs2 /dev/block_device /dir 29 $ mount -t gfs2 /dev/block_device /dir
33 30
34GFS2 is not on-disk compatible with previous versions of GFS. 31If you are using Fedora, you need to install the gfs2-utils package
32and, for lock_dlm, you will also need to install the cman package
33and write a cluster.conf as per the documentation.
34
35GFS2 is not on-disk compatible with previous versions of GFS, but it
36is pretty close.
35 37
36The following man pages can be found at the URL above: 38The following man pages can be found at the URL above:
37 gfs2_fsck to repair a filesystem 39 fsck.gfs2 to repair a filesystem
38 gfs2_grow to expand a filesystem online 40 gfs2_grow to expand a filesystem online
39 gfs2_jadd to add journals to a filesystem online 41 gfs2_jadd to add journals to a filesystem online
40 gfs2_tool to manipulate, examine and tune a filesystem 42 gfs2_tool to manipulate, examine and tune a filesystem
41 gfs2_quota to examine and change quota values in a filesystem 43 gfs2_quota to examine and change quota values in a filesystem
44 gfs2_convert to convert a gfs filesystem to gfs2 in-place
42 mount.gfs2 to help mount(8) mount a filesystem 45 mount.gfs2 to help mount(8) mount a filesystem
43 mkfs.gfs2 to make a filesystem 46 mkfs.gfs2 to make a filesystem
diff --git a/Documentation/filesystems/nfs-rdma.txt b/Documentation/filesystems/nfs-rdma.txt
index 85eaeaddd27c..e386f7e4bcee 100644
--- a/Documentation/filesystems/nfs-rdma.txt
+++ b/Documentation/filesystems/nfs-rdma.txt
@@ -100,7 +100,7 @@ Installation
100 $ sudo cp utils/mount/mount.nfs /sbin/mount.nfs 100 $ sudo cp utils/mount/mount.nfs /sbin/mount.nfs
101 101
102 In this location, mount.nfs will be invoked automatically for NFS mounts 102 In this location, mount.nfs will be invoked automatically for NFS mounts
103 by the system mount commmand. 103 by the system mount command.
104 104
105 NOTE: mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed 105 NOTE: mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed
106 on the NFS client machine. You do not need this specific version of 106 on the NFS client machine. You do not need this specific version of
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index 55c4300abfcb..01539f410676 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -39,9 +39,8 @@ Features which NILFS2 does not support yet:
39 - extended attributes 39 - extended attributes
40 - POSIX ACLs 40 - POSIX ACLs
41 - quotas 41 - quotas
42 - writable snapshots 42 - fsck
43 - remote backup (CDP) 43 - resize
44 - data integrity
45 - defragmentation 44 - defragmentation
46 45
47Mount options 46Mount options
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index ce84cfc9eae0..ebff3c10a07f 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -366,7 +366,7 @@ just those considered 'most important'. The new vectors are:
366 RES, CAL, TLB -- rescheduling, call and TLB flush interrupts are 366 RES, CAL, TLB -- rescheduling, call and TLB flush interrupts are
367 sent from one CPU to another per the needs of the OS. Typically, 367 sent from one CPU to another per the needs of the OS. Typically,
368 their statistics are used by kernel developers and interested users to 368 their statistics are used by kernel developers and interested users to
369 determine the occurance of interrupt of the given type. 369 determine the occurrence of interrupts of the given type.
370 370
371The above IRQ vectors are displayed only when relevant. For example, 371The above IRQ vectors are displayed only when relevant. For example,
372the threshold vector does not exist on x86_64 platforms. Others are 372the threshold vector does not exist on x86_64 platforms. Others are
@@ -551,7 +551,7 @@ Committed_AS: The amount of memory presently allocated on the system.
551 memory once that memory has been successfully allocated. 551 memory once that memory has been successfully allocated.
552VmallocTotal: total size of vmalloc memory area 552VmallocTotal: total size of vmalloc memory area
553 VmallocUsed: amount of vmalloc area which is used 553 VmallocUsed: amount of vmalloc area which is used
554VmallocChunk: largest contigious block of vmalloc area which is free 554VmallocChunk: largest contiguous block of vmalloc area which is free
555 555
556.............................................................................. 556..............................................................................
557 557
@@ -1003,11 +1003,13 @@ CHAPTER 3: PER-PROCESS PARAMETERS
10033.1 /proc/<pid>/oom_adj - Adjust the oom-killer score 10033.1 /proc/<pid>/oom_adj - Adjust the oom-killer score
1004------------------------------------------------------ 1004------------------------------------------------------
1005 1005
1006This file can be used to adjust the score used to select which processes 1006This file can be used to adjust the score used to select which processes should
1007should be killed in an out-of-memory situation. Giving it a high score will 1007be killed in an out-of-memory situation. The oom_adj value is a characteristic
1008increase the likelihood of this process being killed by the oom-killer. Valid 1008of the task's mm, so all threads that share an mm with pid will have the same
1009values are in the range -16 to +15, plus the special value -17, which disables 1009oom_adj value. A high value will increase the likelihood of this process being
1010oom-killing altogether for this process. 1010killed by the oom-killer. Valid values are in the range -16 to +15 as
1011explained below and a special value of -17, which disables oom-killing
1012altogether for threads sharing pid's mm.
1011 1013
1012The process to be killed in an out-of-memory situation is selected among all others 1014The process to be killed in an out-of-memory situation is selected among all others
1013based on its badness score. This value equals the original memory size of the process 1015based on its badness score. This value equals the original memory size of the process
@@ -1021,6 +1023,9 @@ the parent's score if they do not share the same memory. Thus forking servers
1021are the prime candidates to be killed. Having only one 'hungry' child will make 1023are the prime candidates to be killed. Having only one 'hungry' child will make
1022parent less preferable than the child. 1024parent less preferable than the child.
1023 1025
1026/proc/<pid>/oom_adj cannot be changed for kthreads since they are immune from
1027oom-killing already.
1028
1024/proc/<pid>/oom_score shows process' current badness score. 1029/proc/<pid>/oom_score shows process' current badness score.
1025 1030
1026The following heuristics are then applied: 1031The following heuristics are then applied:
diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt
index 26e4b8bc53ee..85354b32d731 100644
--- a/Documentation/filesystems/sysfs-pci.txt
+++ b/Documentation/filesystems/sysfs-pci.txt
@@ -72,7 +72,7 @@ The 'rom' file is special in that it provides read-only access to the device's
72ROM file, if available. It's disabled by default, however, so applications 72ROM file, if available. It's disabled by default, however, so applications
73should write the string "1" to the file to enable it before attempting a read 73should write the string "1" to the file to enable it before attempting a read
74call, and disable it following the access by writing "0" to the file. Note 74call, and disable it following the access by writing "0" to the file. Note
75that the device must be enabled for a rom read to return data succesfully. 75that the device must be enabled for a rom read to return data successfully.
76In the event a driver is not bound to the device, it can be enabled using the 76In the event a driver is not bound to the device, it can be enabled using the
77'enable' file, documented above. 77'enable' file, documented above.
78 78
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index 222437efd75a..3015da0c6b2a 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -133,4 +133,4 @@ RAM/SWAP in 10240 inodes and it is only accessible by root.
133Author: 133Author:
134 Christoph Rohland <cr@sap.com>, 1.12.01 134 Christoph Rohland <cr@sap.com>, 1.12.01
135Updated: 135Updated:
136 Hugh Dickins <hugh@veritas.com>, 4 June 2007 136 Hugh Dickins, 4 June 2007
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index 3a5ddc96901a..b58b84b50fa2 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -124,14 +124,19 @@ sys_immutable -- If set, ATTR_SYS attribute on FAT is handled as
124flush -- If set, the filesystem will try to flush to disk more 124flush -- If set, the filesystem will try to flush to disk more
125 early than normal. Not set by default. 125 early than normal. Not set by default.
126 126
127rodir -- FAT has the ATTR_RO (read-only) attribute. But on Windows, 127rodir -- FAT has the ATTR_RO (read-only) attribute. On Windows,
128 the ATTR_RO of the directory will be just ignored actually, 128 the ATTR_RO of the directory will just be ignored,
129 and is used by only applications as flag. E.g. it's setted 129 and is used only by applications as a flag (e.g. it's set
130 for the customized folder. 130 for the customized folder).
131 131
132 If you want to use ATTR_RO as read-only flag even for 132 If you want to use ATTR_RO as read-only flag even for
133 the directory, set this option. 133 the directory, set this option.
134 134
135errors=panic|continue|remount-ro
136 -- specify FAT behavior on critical errors: panic, continue
137 without doing anything or remount the partition in
138 read-only mode (default behavior).
139
135<bool>: 0,1,yes,no,true,false 140<bool>: 0,1,yes,no,true,false
136 141
137TODO 142TODO
diff --git a/Documentation/firmware_class/README b/Documentation/firmware_class/README
index c3480aa66ba8..7eceaff63f5f 100644
--- a/Documentation/firmware_class/README
+++ b/Documentation/firmware_class/README
@@ -77,7 +77,8 @@
77 seconds for the whole load operation. 77 seconds for the whole load operation.
78 78
79 - request_firmware_nowait() is also provided for convenience in 79 - request_firmware_nowait() is also provided for convenience in
80 non-user contexts. 80 user contexts to request firmware asynchronously, but can't be called
81 in atomic contexts.
81 82
82 83
83 about in-kernel persistence: 84 about in-kernel persistence:
diff --git a/Documentation/futex-requeue-pi.txt b/Documentation/futex-requeue-pi.txt
new file mode 100644
index 000000000000..9dc1ff4fd536
--- /dev/null
+++ b/Documentation/futex-requeue-pi.txt
@@ -0,0 +1,131 @@
1Futex Requeue PI
2----------------
3
4Requeueing of tasks from a non-PI futex to a PI futex requires
5special handling in order to ensure the underlying rt_mutex is never
6left without an owner if it has waiters; doing so would break the PI
7boosting logic [see rt-mutex-design.txt]. For the purposes of
8brevity, this action will be referred to as "requeue_pi" throughout
9this document. Priority inheritance is abbreviated throughout as
10"PI".
11
12Motivation
13----------
14
15Without requeue_pi, the glibc implementation of
16pthread_cond_broadcast() must resort to waking all the tasks waiting
17on a pthread_condvar and letting them try to sort out which task
18gets to run first in classic thundering-herd formation. An ideal
19implementation would wake the highest-priority waiter, and leave the
20rest to the natural wakeup inherent in unlocking the mutex
21associated with the condvar.
22
23Consider the simplified glibc calls:
24
25/* caller must lock mutex */
26pthread_cond_wait(cond, mutex)
27{
28 lock(cond->__data.__lock);
29 unlock(mutex);
30 do {
31 unlock(cond->__data.__lock);
32 futex_wait(cond->__data.__futex);
33 lock(cond->__data.__lock);
34 } while(...)
35 unlock(cond->__data.__lock);
36 lock(mutex);
37}
38
39pthread_cond_broadcast(cond)
40{
41 lock(cond->__data.__lock);
42 unlock(cond->__data.__lock);
43 futex_requeue(cond->__data.__futex, cond->mutex);
44}
45
46Once pthread_cond_broadcast() requeues the tasks, the cond->mutex
47has waiters. Note that pthread_cond_wait() attempts to lock the
48mutex only after it has returned to user space. This will leave the
49underlying rt_mutex with waiters, and no owner, breaking the
50previously mentioned PI-boosting algorithms.
51
52In order to support PI-aware pthread_condvar's, the kernel needs to
53be able to requeue tasks to PI futexes. This support implies that
54upon a successful futex_wait system call, the caller would return to
55user space already holding the PI futex. The glibc implementation
56would be modified as follows:
57
58
59/* caller must lock mutex */
60pthread_cond_wait_pi(cond, mutex)
61{
62 lock(cond->__data.__lock);
63 unlock(mutex);
64 do {
65 unlock(cond->__data.__lock);
66 futex_wait_requeue_pi(cond->__data.__futex);
67 lock(cond->__data.__lock);
68 } while(...)
69 unlock(cond->__data.__lock);
70 /* the kernel acquired the mutex for us */
71}
72
73pthread_cond_broadcast_pi(cond)
74{
75 lock(cond->__data.__lock);
76 unlock(cond->__data.__lock);
77 futex_requeue_pi(cond->__data.__futex, cond->mutex);
78}
79
80The actual glibc implementation will likely test for PI and make the
81necessary changes inside the existing calls rather than creating new
82calls for the PI cases. Similar changes are needed for
83pthread_cond_timedwait() and pthread_cond_signal().
84
85Implementation
86--------------
87
88In order to ensure the rt_mutex has an owner if it has waiters, it
89is necessary for both the requeue code, as well as the waiting code,
90to be able to acquire the rt_mutex before returning to user space.
91The requeue code cannot simply wake the waiter and leave it to
92acquire the rt_mutex as it would open a race window between the
93requeue call returning to user space and the waiter waking and
94starting to run. This is especially true in the uncontended case.
95
96The solution involves two new rt_mutex helper routines,
97rt_mutex_start_proxy_lock() and rt_mutex_finish_proxy_lock(), which
98allow the requeue code to acquire an uncontended rt_mutex on behalf
99of the waiter and to enqueue the waiter on a contended rt_mutex.
100Two new system calls provide the kernel<->user interface to
101requeue_pi: FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI.
102
103FUTEX_WAIT_REQUEUE_PI is called by the waiter (pthread_cond_wait()
104and pthread_cond_timedwait()) to block on the initial futex and wait
105to be requeued to a PI-aware futex. The implementation is the
106result of a high-speed collision between futex_wait() and
107futex_lock_pi(), with some extra logic to check for the additional
108wake-up scenarios.
109
110FUTEX_CMP_REQUEUE_PI is called by the waker
111(pthread_cond_broadcast() and pthread_cond_signal()) to requeue and
112possibly wake the waiting tasks. Internally, this system call is
113still handled by futex_requeue (by passing requeue_pi=1). Before
114requeueing, futex_requeue() attempts to acquire the requeue target
115PI futex on behalf of the top waiter. If it can, this waiter is
116woken. futex_requeue() then proceeds to requeue the remaining
117nr_wake+nr_requeue tasks to the PI futex, calling
118rt_mutex_start_proxy_lock() prior to each requeue to prepare the
119task as a waiter on the underlying rt_mutex. It is possible that
120the lock can be acquired at this stage as well; if so, the next
121waiter is woken to finish the acquisition of the lock.
122
123FUTEX_CMP_REQUEUE_PI accepts nr_wake and nr_requeue as arguments, but
124their sum is all that really matters. futex_requeue() will wake or
125requeue up to nr_wake + nr_requeue tasks. It will wake only as many
126tasks as it can acquire the lock for, which in the majority of cases
127should be 0 as good programming practice dictates that the caller of
128either pthread_cond_broadcast() or pthread_cond_signal() acquire the
129mutex prior to making the call. FUTEX_CMP_REQUEUE_PI requires that
130nr_wake=1. nr_requeue should be INT_MAX for broadcast and 0 for
131signal.
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index 145c25a170c7..e4b6985044a2 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -458,7 +458,7 @@ debugfs interface, since it provides control over GPIO direction and
458value instead of just showing a gpio state summary. Plus, it could be 458value instead of just showing a gpio state summary. Plus, it could be
459present on production systems without debugging support. 459present on production systems without debugging support.
460 460
461Given approprate hardware documentation for the system, userspace could 461Given appropriate hardware documentation for the system, userspace could
462know for example that GPIO #23 controls the write protect line used to 462know for example that GPIO #23 controls the write protect line used to
463protect boot loader segments in flash memory. System upgrade procedures 463protect boot loader segments in flash memory. System upgrade procedures
464may need to temporarily remove that protection, first importing a GPIO, 464may need to temporarily remove that protection, first importing a GPIO,
diff --git a/Documentation/hwmon/f71882fg b/Documentation/hwmon/f71882fg
index a8321267b5b6..bee4c30bc1e2 100644
--- a/Documentation/hwmon/f71882fg
+++ b/Documentation/hwmon/f71882fg
@@ -2,14 +2,18 @@ Kernel driver f71882fg
2====================== 2======================
3 3
4Supported chips: 4Supported chips:
5 * Fintek F71882FG and F71883FG 5 * Fintek F71858FG
6 Prefix: 'f71882fg' 6 Prefix: 'f71858fg'
7 Addresses scanned: none, address read from Super I/O config space 7 Addresses scanned: none, address read from Super I/O config space
8 Datasheet: Available from the Fintek website 8 Datasheet: Available from the Fintek website
9 * Fintek F71862FG and F71863FG 9 * Fintek F71862FG and F71863FG
10 Prefix: 'f71862fg' 10 Prefix: 'f71862fg'
11 Addresses scanned: none, address read from Super I/O config space 11 Addresses scanned: none, address read from Super I/O config space
12 Datasheet: Available from the Fintek website 12 Datasheet: Available from the Fintek website
13 * Fintek F71882FG and F71883FG
14 Prefix: 'f71882fg'
15 Addresses scanned: none, address read from Super I/O config space
16 Datasheet: Available from the Fintek website
13 * Fintek F8000 17 * Fintek F8000
14 Prefix: 'f8000' 18 Prefix: 'f8000'
15 Addresses scanned: none, address read from Super I/O config space 19 Addresses scanned: none, address read from Super I/O config space
@@ -66,13 +70,13 @@ printed when loading the driver.
66 70
67Three different fan control modes are supported; the mode number is written 71Three different fan control modes are supported; the mode number is written
68to the pwm#_enable file. Note that not all modes are supported on all 72to the pwm#_enable file. Note that not all modes are supported on all
69chips, and some modes may only be available in RPM / PWM mode on the F8000. 73chips, and some modes may only be available in RPM / PWM mode.
70Writing an unsupported mode will result in an invalid parameter error. 74Writing an unsupported mode will result in an invalid parameter error.
71 75
72* 1: Manual mode 76* 1: Manual mode
73 You ask for a specific PWM duty cycle / DC voltage or a specific % of 77 You ask for a specific PWM duty cycle / DC voltage or a specific % of
74 fan#_full_speed by writing to the pwm# file. This mode is only 78 fan#_full_speed by writing to the pwm# file. This mode is only
75 available on the F8000 if the fan channel is in RPM mode. 79 available on the F71858FG / F8000 if the fan channel is in RPM mode.
76 80
77* 2: Normal auto mode 81* 2: Normal auto mode
78 You can define a number of temperature/fan speed trip points, which % the 82 You can define a number of temperature/fan speed trip points, which % the
diff --git a/Documentation/hwmon/ibmaem b/Documentation/hwmon/ibmaem
index e98bdfea3467..1e0d59e000b4 100644
--- a/Documentation/hwmon/ibmaem
+++ b/Documentation/hwmon/ibmaem
@@ -7,7 +7,7 @@ henceforth as AEM.
7Supported systems: 7Supported systems:
8 * Any recent IBM System X server with AEM support. 8 * Any recent IBM System X server with AEM support.
9 This includes the x3350, x3550, x3650, x3655, x3755, x3850 M2, 9 This includes the x3350, x3550, x3650, x3655, x3755, x3850 M2,
10 x3950 M2, and certain HS2x/LS2x/QS2x blades. The IPMI host interface 10 x3950 M2, and certain HC10/HS2x/LS2x/QS2x blades. The IPMI host interface
11 driver ("ipmi-si") needs to be loaded for this driver to do anything. 11 driver ("ipmi-si") needs to be loaded for this driver to do anything.
12 Prefix: 'ibmaem' 12 Prefix: 'ibmaem'
13 Datasheet: Not available 13 Datasheet: Not available
diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface
index 2f10ce6a879f..dcbd502c8792 100644
--- a/Documentation/hwmon/sysfs-interface
+++ b/Documentation/hwmon/sysfs-interface
@@ -70,6 +70,7 @@ are interpreted as 0! For more on how written strings are interpreted see the
70[0-*] denotes any positive number starting from 0 70[0-*] denotes any positive number starting from 0
71[1-*] denotes any positive number starting from 1 71[1-*] denotes any positive number starting from 1
72RO read only value 72RO read only value
73WO write only value
73RW read/write value 74RW read/write value
74 75
75Read/write values may be read-only for some chips, depending on the 76Read/write values may be read-only for some chips, depending on the
@@ -150,6 +151,11 @@ fan[1-*]_min Fan minimum value
150 Unit: revolution/min (RPM) 151 Unit: revolution/min (RPM)
151 RW 152 RW
152 153
154fan[1-*]_max Fan maximum value
155 Unit: revolution/min (RPM)
156 Only rarely supported by the hardware.
157 RW
158
153fan[1-*]_input Fan input value. 159fan[1-*]_input Fan input value.
154 Unit: revolution/min (RPM) 160 Unit: revolution/min (RPM)
155 RO 161 RO
@@ -290,6 +296,24 @@ temp[1-*]_label Suggested temperature channel label.
290 user-space. 296 user-space.
291 RO 297 RO
292 298
299temp[1-*]_lowest
300 Historical minimum temperature
301 Unit: millidegree Celsius
302 RO
303
304temp[1-*]_highest
305 Historical maximum temperature
306 Unit: millidegree Celsius
307 RO
308
309temp[1-*]_reset_history
310 Reset temp_lowest and temp_highest
311 WO
312
313temp_reset_history
314 Reset temp_lowest and temp_highest for all sensors
315 WO
316
293Some chips measure temperature using external thermistors and an ADC, and 317Some chips measure temperature using external thermistors and an ADC, and
294report the temperature measurement as a voltage. Converting this voltage 318report the temperature measurement as a voltage. Converting this voltage
295back to a temperature (or the other way around for limits) requires 319back to a temperature (or the other way around for limits) requires
@@ -390,6 +414,7 @@ OR
390in[0-*]_min_alarm 414in[0-*]_min_alarm
391in[0-*]_max_alarm 415in[0-*]_max_alarm
392fan[1-*]_min_alarm 416fan[1-*]_min_alarm
417fan[1-*]_max_alarm
393temp[1-*]_min_alarm 418temp[1-*]_min_alarm
394temp[1-*]_max_alarm 419temp[1-*]_max_alarm
395temp[1-*]_crit_alarm 420temp[1-*]_crit_alarm
diff --git a/Documentation/hwmon/tmp401 b/Documentation/hwmon/tmp401
new file mode 100644
index 000000000000..9fc447249212
--- /dev/null
+++ b/Documentation/hwmon/tmp401
@@ -0,0 +1,42 @@
1Kernel driver tmp401
2====================
3
4Supported chips:
5 * Texas Instruments TMP401
6 Prefix: 'tmp401'
7 Addresses scanned: I2C 0x4c
8 Datasheet: http://focus.ti.com/docs/prod/folders/print/tmp401.html
9 * Texas Instruments TMP411
10 Prefix: 'tmp411'
11 Addresses scanned: I2C 0x4c
12 Datasheet: http://focus.ti.com/docs/prod/folders/print/tmp411.html
13
14Authors:
15 Hans de Goede <hdegoede@redhat.com>
16 Andre Prendel <andre.prendel@gmx.de>
17
18Description
19-----------
20
21This driver implements support for Texas Instruments TMP401 and
22TMP411 chips. These chips implement one remote and one local
23temperature sensor. Temperature is measured in degrees
24Celsius. Resolution of the remote sensor is 0.0625 degree. Local
25sensor resolution can be set to 0.5, 0.25, 0.125 or 0.0625 degree (not
26supported by the driver so far, so using the default resolution of 0.5
27degree).
28
29The driver provides the common sysfs-interface for temperatures (see
30/Documentation/hwmon/sysfs-interface under Temperatures).
31
32The TMP411 chip is compatible with TMP401. It provides some additional
33features.
34
35* Minimum and Maximum temperature measured since power-on, chip-reset
36
37 Exported via sysfs attributes tempX_lowest and tempX_highest.
38
39* Reset of historical minimum/maximum temperature measurements
40
41 Exported via sysfs attribute temp_reset_history. Writing 1 to this
42 file triggers a reset.
diff --git a/Documentation/hwmon/w83627ehf b/Documentation/hwmon/w83627ehf
index b6eb59384bb3..02b74899edaf 100644
--- a/Documentation/hwmon/w83627ehf
+++ b/Documentation/hwmon/w83627ehf
@@ -12,6 +12,10 @@ Supported chips:
12 Addresses scanned: ISA address retrieved from Super I/O registers 12 Addresses scanned: ISA address retrieved from Super I/O registers
13 Datasheet: 13 Datasheet:
14 http://www.nuvoton.com.tw/NR/rdonlyres/7885623D-A487-4CF9-A47F-30C5F73D6FE6/0/W83627DHG.pdf 14 http://www.nuvoton.com.tw/NR/rdonlyres/7885623D-A487-4CF9-A47F-30C5F73D6FE6/0/W83627DHG.pdf
15 * Winbond W83627DHG-P
16 Prefix: 'w83627dhg'
17 Addresses scanned: ISA address retrieved from Super I/O registers
18 Datasheet: not available
15 * Winbond W83667HG 19 * Winbond W83667HG
16 Prefix: 'w83667hg' 20 Prefix: 'w83667hg'
17 Addresses scanned: ISA address retrieved from Super I/O registers 21 Addresses scanned: ISA address retrieved from Super I/O registers
@@ -28,8 +32,8 @@ Description
28----------- 32-----------
29 33
30This driver implements support for the Winbond W83627EHF, W83627EHG, 34This driver implements support for the Winbond W83627EHF, W83627EHG,
31W83627DHG and W83667HG super I/O chips. We will refer to them collectively 35W83627DHG, W83627DHG-P and W83667HG super I/O chips. We will refer to them
32as Winbond chips. 36collectively as Winbond chips.
33 37
34The chips implement three temperature sensors, five fan rotation 38The chips implement three temperature sensors, five fan rotation
35speed sensors, ten analog voltage sensors (only nine for the 627DHG), one 39speed sensors, ten analog voltage sensors (only nine for the 627DHG), one
@@ -135,3 +139,6 @@ done in the driver for all register addresses.
135The DHG also supports PECI, where the DHG queries Intel CPU temperatures, and 139The DHG also supports PECI, where the DHG queries Intel CPU temperatures, and
136the ICH8 southbridge gets that data via PECI from the DHG, so that the 140the ICH8 southbridge gets that data via PECI from the DHG, so that the
137southbridge drives the fans. And the DHG supports SST, a one-wire serial bus. 141southbridge drives the fans. And the DHG supports SST, a one-wire serial bus.
142
143The DHG-P has an additional automatic fan speed control mode named Smart Fan
144(TM) III+. This mode is not yet supported by the driver.
diff --git a/Documentation/i2c/busses/i2c-ocores b/Documentation/i2c/busses/i2c-ocores
index cfcebb10d14e..c269aaa2f26a 100644
--- a/Documentation/i2c/busses/i2c-ocores
+++ b/Documentation/i2c/busses/i2c-ocores
@@ -20,6 +20,8 @@ platform_device with the base address and interrupt number. The
20dev.platform_data of the device should also point to a struct 20dev.platform_data of the device should also point to a struct
21ocores_i2c_platform_data (see linux/i2c-ocores.h) describing the 21ocores_i2c_platform_data (see linux/i2c-ocores.h) describing the
22distance between registers and the input clock speed. 22distance between registers and the input clock speed.
23There is also a possibility to attach a list of i2c_board_info which
24the i2c-ocores driver will add to the bus upon creation.
23 25
24E.G. something like: 26E.G. something like:
25 27
@@ -36,9 +38,24 @@ static struct resource ocores_resources[] = {
36 }, 38 },
37}; 39};
38 40
41/* optional board info */
42struct i2c_board_info ocores_i2c_board_info[] = {
43 {
44 I2C_BOARD_INFO("tsc2003", 0x48),
45 .platform_data = &tsc2003_platform_data,
46 .irq = TSC_IRQ
47 },
48 {
49 I2C_BOARD_INFO("adv7180", 0x42 >> 1),
50 .irq = ADV_IRQ
51 }
52};
53
39static struct ocores_i2c_platform_data myi2c_data = { 54static struct ocores_i2c_platform_data myi2c_data = {
40 .regstep = 2, /* two bytes between registers */ 55 .regstep = 2, /* two bytes between registers */
41 .clock_khz = 50000, /* input clock of 50MHz */ 56 .clock_khz = 50000, /* input clock of 50MHz */
57 .devices = ocores_i2c_board_info, /* optional table of devices */
58 .num_devices = ARRAY_SIZE(ocores_i2c_board_info), /* table size */
42}; 59};
43 60
44static struct platform_device myi2c = { 61static struct platform_device myi2c = {
diff --git a/Documentation/i2c/busses/i2c-viapro b/Documentation/i2c/busses/i2c-viapro
index 22efedf60c87..2e758b0e9456 100644
--- a/Documentation/i2c/busses/i2c-viapro
+++ b/Documentation/i2c/busses/i2c-viapro
@@ -19,6 +19,9 @@ Supported adapters:
19 * VIA Technologies, Inc. VX800/VX820 19 * VIA Technologies, Inc. VX800/VX820
20 Datasheet: available on http://linux.via.com.tw 20 Datasheet: available on http://linux.via.com.tw
21 21
22 * VIA Technologies, Inc. VX855/VX875
23 Datasheet: Availability unknown
24
22Authors: 25Authors:
23 Kyösti Mälkki <kmalkki@cc.hut.fi>, 26 Kyösti Mälkki <kmalkki@cc.hut.fi>,
24 Mark D. Studebaker <mdsxyz123@yahoo.com>, 27 Mark D. Studebaker <mdsxyz123@yahoo.com>,
@@ -53,6 +56,7 @@ Your lspci -n listing must show one of these :
53 device 1106:3287 (VT8251) 56 device 1106:3287 (VT8251)
54 device 1106:8324 (CX700) 57 device 1106:8324 (CX700)
55 device 1106:8353 (VX800/VX820) 58 device 1106:8353 (VX800/VX820)
59 device 1106:8409 (VX855/VX875)
56 60
57If none of these show up, you should look in the BIOS for settings like 61If none of these show up, you should look in the BIOS for settings like
58enable ACPI / SMBus or even USB. 62enable ACPI / SMBus or even USB.
diff --git a/Documentation/ide/ide.txt b/Documentation/ide/ide.txt
index 0c78f4b1d9d9..e77bebfa7b0d 100644
--- a/Documentation/ide/ide.txt
+++ b/Documentation/ide/ide.txt
@@ -216,6 +216,8 @@ Other kernel parameters for ide_core are:
216 216
217* "noflush=[interface_number.device_number]" to disable flush requests 217* "noflush=[interface_number.device_number]" to disable flush requests
218 218
219* "nohpa=[interface_number.device_number]" to disable Host Protected Area
220
219* "noprobe=[interface_number.device_number]" to skip probing 221* "noprobe=[interface_number.device_number]" to skip probing
220 222
221* "nowerr=[interface_number.device_number]" to ignore the WRERR_STAT bit 223* "nowerr=[interface_number.device_number]" to ignore the WRERR_STAT bit
diff --git a/Documentation/input/multi-touch-protocol.txt b/Documentation/input/multi-touch-protocol.txt
index 9f09557aea39..a12ea3b586e6 100644
--- a/Documentation/input/multi-touch-protocol.txt
+++ b/Documentation/input/multi-touch-protocol.txt
@@ -18,8 +18,12 @@ Usage
18Anonymous finger details are sent sequentially as separate packets of ABS 18Anonymous finger details are sent sequentially as separate packets of ABS
19events. Only the ABS_MT events are recognized as part of a finger 19events. Only the ABS_MT events are recognized as part of a finger
20packet. The end of a packet is marked by calling the input_mt_sync() 20packet. The end of a packet is marked by calling the input_mt_sync()
21function, which generates a SYN_MT_REPORT event. The end of multi-touch 21function, which generates a SYN_MT_REPORT event. This instructs the
22transfer is marked by calling the usual input_sync() function. 22receiver to accept the data for the current finger and prepare to receive
23another. The end of a multi-touch transfer is marked by calling the usual
24input_sync() function. This instructs the receiver to act upon events
25accumulated since last EV_SYN/SYN_REPORT and prepare to receive a new
26set of events/packets.
23 27
24A set of ABS_MT events with the desired properties is defined. The events 28A set of ABS_MT events with the desired properties is defined. The events
25are divided into categories, to allow for partial implementation. The 29are divided into categories, to allow for partial implementation. The
@@ -27,11 +31,26 @@ minimum set consists of ABS_MT_TOUCH_MAJOR, ABS_MT_POSITION_X and
27ABS_MT_POSITION_Y, which allows for multiple fingers to be tracked. If the 31ABS_MT_POSITION_Y, which allows for multiple fingers to be tracked. If the
28device supports it, the ABS_MT_WIDTH_MAJOR may be used to provide the size 32device supports it, the ABS_MT_WIDTH_MAJOR may be used to provide the size
29of the approaching finger. Anisotropy and direction may be specified with 33of the approaching finger. Anisotropy and direction may be specified with
30ABS_MT_TOUCH_MINOR, ABS_MT_WIDTH_MINOR and ABS_MT_ORIENTATION. Devices with 34ABS_MT_TOUCH_MINOR, ABS_MT_WIDTH_MINOR and ABS_MT_ORIENTATION. The
31more granular information may specify general shapes as blobs, i.e., as a 35ABS_MT_TOOL_TYPE may be used to specify whether the touching tool is a
32sequence of rectangular shapes grouped together by an 36finger or a pen or something else. Devices with more granular information
33ABS_MT_BLOB_ID. Finally, the ABS_MT_TOOL_TYPE may be used to specify 37may specify general shapes as blobs, i.e., as a sequence of rectangular
34whether the touching tool is a finger or a pen or something else. 38shapes grouped together by an ABS_MT_BLOB_ID. Finally, for the few devices
39that currently support it, the ABS_MT_TRACKING_ID event may be used to
40report finger tracking from hardware [5].
41
42Here is what a minimal event sequence for a two-finger touch would look
43like:
44
45 ABS_MT_TOUCH_MAJOR
46 ABS_MT_POSITION_X
47 ABS_MT_POSITION_Y
48 SYN_MT_REPORT
49 ABS_MT_TOUCH_MAJOR
50 ABS_MT_POSITION_X
51 ABS_MT_POSITION_Y
52 SYN_MT_REPORT
53 SYN_REPORT
35 54
36 55
37Event Semantics 56Event Semantics
@@ -44,24 +63,24 @@ ABS_MT_TOUCH_MAJOR
44 63
45The length of the major axis of the contact. The length should be given in 64The length of the major axis of the contact. The length should be given in
46surface units. If the surface has an X times Y resolution, the largest 65surface units. If the surface has an X times Y resolution, the largest
47possible value of ABS_MT_TOUCH_MAJOR is sqrt(X^2 + Y^2), the diagonal. 66possible value of ABS_MT_TOUCH_MAJOR is sqrt(X^2 + Y^2), the diagonal [4].
48 67
49ABS_MT_TOUCH_MINOR 68ABS_MT_TOUCH_MINOR
50 69
51The length, in surface units, of the minor axis of the contact. If the 70The length, in surface units, of the minor axis of the contact. If the
52contact is circular, this event can be omitted. 71contact is circular, this event can be omitted [4].
53 72
54ABS_MT_WIDTH_MAJOR 73ABS_MT_WIDTH_MAJOR
55 74
56The length, in surface units, of the major axis of the approaching 75The length, in surface units, of the major axis of the approaching
57tool. This should be understood as the size of the tool itself. The 76tool. This should be understood as the size of the tool itself. The
58orientation of the contact and the approaching tool are assumed to be the 77orientation of the contact and the approaching tool are assumed to be the
59same. 78same [4].
60 79
61ABS_MT_WIDTH_MINOR 80ABS_MT_WIDTH_MINOR
62 81
63The length, in surface units, of the minor axis of the approaching 82The length, in surface units, of the minor axis of the approaching
64tool. Omit if circular. 83tool. Omit if circular [4].
65 84
66The above four values can be used to derive additional information about 85The above four values can be used to derive additional information about
67the contact. The ratio ABS_MT_TOUCH_MAJOR / ABS_MT_WIDTH_MAJOR approximates 86the contact. The ratio ABS_MT_TOUCH_MAJOR / ABS_MT_WIDTH_MAJOR approximates
@@ -70,14 +89,17 @@ different characteristic widths [1].
70 89
71ABS_MT_ORIENTATION 90ABS_MT_ORIENTATION
72 91
73The orientation of the ellipse. The value should describe half a revolution 92The orientation of the ellipse. The value should describe a signed quarter
74clockwise around the touch center. The scale of the value is arbitrary, but 93of a revolution clockwise around the touch center. The signed value range
75zero should be returned for an ellipse aligned along the Y axis of the 94is arbitrary, but zero should be returned for a finger aligned along the Y
76surface. As an example, an index finger placed straight onto the axis could 95axis of the surface, a negative value when the finger is turned to the left, and
77return zero orientation, something negative when twisted to the left, and 96a positive value when the finger is turned to the right. When completely aligned with
78something positive when twisted to the right. This value can be omitted if 97the X axis, the range max should be returned. Orientation can be omitted
79the touching object is circular, or if the information is not available in 98if the touching object is circular, or if the information is not available
80the kernel driver. 99in the kernel driver. Partial orientation support is possible if the device
100can distinguish between the two axes, but not (uniquely) any values in
101between. In such cases, the range of ABS_MT_ORIENTATION should be [0, 1]
102[4].
81 103
82ABS_MT_POSITION_X 104ABS_MT_POSITION_X
83 105
@@ -98,8 +120,35 @@ ABS_MT_BLOB_ID
98 120
99The BLOB_ID groups several packets together into one arbitrarily shaped 121The BLOB_ID groups several packets together into one arbitrarily shaped
100contact. This is a low-level anonymous grouping, and should not be confused 122contact. This is a low-level anonymous grouping, and should not be confused
101with the high-level contactID, explained below. Most kernel drivers will 123with the high-level trackingID [5]. Most kernel drivers will not have blob
102not have this capability, and can safely omit the event. 124capability, and can safely omit the event.
125
126ABS_MT_TRACKING_ID
127
128The TRACKING_ID identifies an initiated contact throughout its life cycle
129[5]. There are currently only a few devices that support it, so this event
130should normally be omitted.
131
132
133Event Computation
134-----------------
135
136The flora of different hardware unavoidably leads to some devices fitting
137better to the MT protocol than others. To simplify and unify the mapping,
138this section gives recipes for how to compute certain events.
139
140For devices reporting contacts as rectangular shapes, signed orientation
141cannot be obtained. Assuming X and Y are the lengths of the sides of the
142touching rectangle, here is a simple formula that retains the most
143information possible:
144
145 ABS_MT_TOUCH_MAJOR := max(X, Y)
146 ABS_MT_TOUCH_MINOR := min(X, Y)
147 ABS_MT_ORIENTATION := bool(X > Y)
148
149The range of ABS_MT_ORIENTATION should be set to [0, 1], to indicate that
150the device can distinguish between a finger along the Y axis (0) and a
151finger along the X axis (1).
103 152
104 153
105Finger Tracking 154Finger Tracking
@@ -109,14 +158,18 @@ The kernel driver should generate an arbitrary enumeration of the set of
109anonymous contacts currently on the surface. The order in which the packets 158anonymous contacts currently on the surface. The order in which the packets
110appear in the event stream is not important. 159appear in the event stream is not important.
111 160
112The process of finger tracking, i.e., to assign a unique contactID to each 161The process of finger tracking, i.e., to assign a unique trackingID to each
113initiated contact on the surface, is left to user space; preferably the 162initiated contact on the surface, is left to user space; preferably the
114multi-touch X driver [3]. In that driver, the contactID stays the same and 163multi-touch X driver [3]. In that driver, the trackingID stays the same and
115unique until the contact vanishes (when the finger leaves the surface). The 164unique until the contact vanishes (when the finger leaves the surface). The
116problem of assigning a set of anonymous fingers to a set of identified 165problem of assigning a set of anonymous fingers to a set of identified
117fingers is a Euclidean bipartite matching problem at each event update, and 166fingers is a Euclidean bipartite matching problem at each event update, and
118relies on a sufficiently rapid update rate. 167relies on a sufficiently rapid update rate.
119 168
169There are a few devices that support trackingID in hardware. User space can
170make use of these native identifiers to reduce bandwidth and cpu usage.
171
172
120Notes 173Notes
121----- 174-----
122 175
@@ -136,5 +189,7 @@ could be used to derive tilt.
136time of writing (April 2009), the MT protocol is not yet merged, and the 189time of writing (April 2009), the MT protocol is not yet merged, and the
137prototype implements finger matching, basic mouse support and two-finger 190prototype implements finger matching, basic mouse support and two-finger
138scrolling. The project aims at improving the quality of current multi-touch 191scrolling. The project aims at improving the quality of current multi-touch
139functionality available in the synaptics X driver, and in addition 192functionality available in the Synaptics X driver, and in addition
140implement more advanced gestures. 193implement more advanced gestures.
194[4] See the section on event computation.
195[5] See the section on finger tracking.
diff --git a/Documentation/isdn/00-INDEX b/Documentation/isdn/00-INDEX
index 5a2d69989a8c..f6010a536590 100644
--- a/Documentation/isdn/00-INDEX
+++ b/Documentation/isdn/00-INDEX
@@ -22,16 +22,11 @@ README.gigaset
22 - info on the drivers for Siemens Gigaset ISDN adapters. 22 - info on the drivers for Siemens Gigaset ISDN adapters.
23README.icn 23README.icn
24 - info on the ICN-ISDN-card and its driver. 24 - info on the ICN-ISDN-card and its driver.
25>>>>>>> 93af7aca44f0e82e67bda10a0fb73d383edcc8bd:Documentation/isdn/00-INDEX
25README.HiSax 26README.HiSax
26 - info on the HiSax driver which replaces the old teles. 27 - info on the HiSax driver which replaces the old teles.
27README.hfc-pci 28README.audio
28 - info on hfc-pci based cards. 29 - info for running audio over ISDN.
29README.pcbit
30 - info on the PCBIT-D ISDN adapter and driver.
31README.syncppp
32 - info on running Sync PPP over ISDN.
33syncPPP.FAQ
34 - frequently asked questions about running PPP over ISDN.
35README.avmb1 30README.avmb1
36 - info on driver for AVM-B1 ISDN card. 31 - info on driver for AVM-B1 ISDN card.
37README.act2000 32README.act2000
@@ -42,10 +37,28 @@ README.concap
42 - info on "CONCAP" encapsulation protocol interface used for X.25. 37 - info on "CONCAP" encapsulation protocol interface used for X.25.
43README.diversion 38README.diversion
44 - info on module for isdn diversion services. 39 - info on module for isdn diversion services.
40README.fax
41 - info for using Fax over ISDN.
42README.gigaset
43 - info on the drivers for Siemens Gigaset ISDN adapters
44README.hfc-pci
45 - info on hfc-pci based cards.
46README.hysdn
47 - info on driver for Hypercope active HYSDN cards
48README.icn
49 - info on the ICN-ISDN-card and its driver.
50README.mISDN
51 - info on the Modular ISDN subsystem (mISDN)
52README.pcbit
53 - info on the PCBIT-D ISDN adapter and driver.
45README.sc 54README.sc
46 - info on driver for Spellcaster cards. 55 - info on driver for Spellcaster cards.
56README.syncppp
57 - info on running Sync PPP over ISDN.
47README.x25 58README.x25
48 - info for running X.25 over ISDN. 59 - info for running X.25 over ISDN.
60syncPPP.FAQ
61 - frequently asked questions about running PPP over ISDN.
49README.hysdn 62README.hysdn
50 - info on driver for Hypercope active HYSDN cards 63 - info on driver for Hypercope active HYSDN cards
51README.mISDN 64README.mISDN
diff --git a/Documentation/isdn/INTERFACE.CAPI b/Documentation/isdn/INTERFACE.CAPI
index 786d619b36e5..686e107923ec 100644
--- a/Documentation/isdn/INTERFACE.CAPI
+++ b/Documentation/isdn/INTERFACE.CAPI
@@ -45,7 +45,7 @@ From then on, Kernel CAPI may call the registered callback functions for the
45device. 45device.
46 46
47If the device becomes unusable for any reason (shutdown, disconnect ...), the 47If the device becomes unusable for any reason (shutdown, disconnect ...), the
48driver has to call capi_ctr_reseted(). This will prevent further calls to the 48driver has to call capi_ctr_down(). This will prevent further calls to the
49callback functions by Kernel CAPI. 49callback functions by Kernel CAPI.
50 50
51 51
@@ -114,20 +114,36 @@ char *driver_name
114int (*load_firmware)(struct capi_ctr *ctrlr, capiloaddata *ldata) 114int (*load_firmware)(struct capi_ctr *ctrlr, capiloaddata *ldata)
115 (optional) pointer to a callback function for sending firmware and 115 (optional) pointer to a callback function for sending firmware and
116 configuration data to the device 116 configuration data to the device
117 Return value: 0 on success, error code on error
118 Called in process context.
117 119
118void (*reset_ctr)(struct capi_ctr *ctrlr) 120void (*reset_ctr)(struct capi_ctr *ctrlr)
119 pointer to a callback function for performing a reset on the device, 121 (optional) pointer to a callback function for performing a reset on
120 releasing all registered applications 122 the device, releasing all registered applications
123 Called in process context.
121 124
122void (*register_appl)(struct capi_ctr *ctrlr, u16 applid, 125void (*register_appl)(struct capi_ctr *ctrlr, u16 applid,
123 capi_register_params *rparam) 126 capi_register_params *rparam)
124void (*release_appl)(struct capi_ctr *ctrlr, u16 applid) 127void (*release_appl)(struct capi_ctr *ctrlr, u16 applid)
125 pointers to callback functions for registration and deregistration of 128 pointers to callback functions for registration and deregistration of
126 applications with the device 129 applications with the device
130 Calls to these functions are serialized by Kernel CAPI so that only
131 one call to any of them is active at any time.
127 132
128u16 (*send_message)(struct capi_ctr *ctrlr, struct sk_buff *skb) 133u16 (*send_message)(struct capi_ctr *ctrlr, struct sk_buff *skb)
129 pointer to a callback function for sending a CAPI message to the 134 pointer to a callback function for sending a CAPI message to the
130 device 135 device
136 Return value: CAPI error code
137 If the method returns 0 (CAPI_NOERROR) the driver has taken ownership
138 of the skb and the caller may no longer access it. If it returns a
139 non-zero (error) value then ownership of the skb returns to the caller
140 who may reuse or free it.
141 The return value should only be used to signal problems with respect
142 to accepting or queueing the message. Errors occurring during the
143 actual processing of the message should be signaled with an
144 appropriate reply message.
145 Calls to this function are not serialized by Kernel CAPI, i.e. it must
146 be prepared to be re-entered.
131 147
132char *(*procinfo)(struct capi_ctr *ctrlr) 148char *(*procinfo)(struct capi_ctr *ctrlr)
133 pointer to a callback function returning the entry for the device in 149 pointer to a callback function returning the entry for the device in
@@ -138,6 +154,8 @@ read_proc_t *ctr_read_proc
138 system entry, /proc/capi/controllers/<n>; will be called with a 154 system entry, /proc/capi/controllers/<n>; will be called with a
139 pointer to the device's capi_ctr structure as the last (data) argument 155 pointer to the device's capi_ctr structure as the last (data) argument
140 156
157Note: Callback functions are never called in interrupt context.
158
141- to be filled in before calling capi_ctr_ready(): 159- to be filled in before calling capi_ctr_ready():
142 160
143u8 manu[CAPI_MANUFACTURER_LEN] 161u8 manu[CAPI_MANUFACTURER_LEN]
@@ -153,6 +171,45 @@ u8 serial[CAPI_SERIAL_LEN]
153 value to return for CAPI_GET_SERIAL 171 value to return for CAPI_GET_SERIAL
154 172
155 173
1744.3 The _cmsg Structure
175
176(declared in <linux/isdn/capiutil.h>)
177
178The _cmsg structure stores the contents of a CAPI 2.0 message in an easily
179accessible form. It contains members for all possible CAPI 2.0 parameters, of
180which only those appearing in the message type currently being processed are
181actually used. Unused members should be set to zero.
182
183Members are named after the CAPI 2.0 standard names of the parameters they
184represent. See <linux/isdn/capiutil.h> for the exact spelling. Member data
185types are:
186
187u8 for CAPI parameters of type 'byte'
188
189u16 for CAPI parameters of type 'word'
190
191u32 for CAPI parameters of type 'dword'
192
193_cstruct for CAPI parameters of type 'struct' not containing any
194 variably-sized (struct) subparameters (e.g. 'Called Party Number')
195 The member is a pointer to a buffer containing the parameter in
196 CAPI encoding (length + content). It may also be NULL, which will
197 be taken to represent an empty (zero length) parameter.
198
199_cmstruct for CAPI parameters of type 'struct' containing 'struct'
200 subparameters ('Additional Info' and 'B Protocol')
201 The representation is a single byte containing one of the values:
202 CAPI_DEFAULT: the parameter is empty
203 CAPI_COMPOSE: the values of the subparameters are stored
204 individually in the corresponding _cmsg structure members
205
206Functions capi_cmsg2message() and capi_message2cmsg() are provided to convert
207messages between their transport encoding described in the CAPI 2.0 standard
208and their _cmsg structure representation. Note that capi_cmsg2message() does
209not know or check the size of its destination buffer. The caller must make
210sure it is big enough to accommodate the resulting CAPI message.
211
212
1565. Lower Layer Interface Functions 2135. Lower Layer Interface Functions
157 214
158(declared in <linux/isdn/capilli.h>) 215(declared in <linux/isdn/capilli.h>)
@@ -166,7 +223,7 @@ int detach_capi_ctr(struct capi_ctr *ctrlr)
166 register/unregister a device (controller) with Kernel CAPI 223 register/unregister a device (controller) with Kernel CAPI
167 224
168void capi_ctr_ready(struct capi_ctr *ctrlr) 225void capi_ctr_ready(struct capi_ctr *ctrlr)
169void capi_ctr_reseted(struct capi_ctr *ctrlr) 226void capi_ctr_down(struct capi_ctr *ctrlr)
170 signal controller ready/not ready 227 signal controller ready/not ready
171 228
172void capi_ctr_suspend_output(struct capi_ctr *ctrlr) 229void capi_ctr_suspend_output(struct capi_ctr *ctrlr)
@@ -211,3 +268,32 @@ CAPIMSG_CONTROL(m) CAPIMSG_SETCONTROL(m, contr) Controller/PLCI/NCCI
211 (u32) 268 (u32)
212CAPIMSG_DATALEN(m) CAPIMSG_SETDATALEN(m, len) Data Length (u16) 269CAPIMSG_DATALEN(m) CAPIMSG_SETDATALEN(m, len) Data Length (u16)
213 270
271
272Library functions for working with _cmsg structures
273(from <linux/isdn/capiutil.h>):
274
275unsigned capi_cmsg2message(_cmsg *cmsg, u8 *msg)
276 Assembles a CAPI 2.0 message from the parameters in *cmsg, storing the
277 result in *msg.
278
279unsigned capi_message2cmsg(_cmsg *cmsg, u8 *msg)
280 Disassembles the CAPI 2.0 message in *msg, storing the parameters in
281 *cmsg.
282
283unsigned capi_cmsg_header(_cmsg *cmsg, u16 ApplId, u8 Command, u8 Subcommand,
284 u16 Messagenumber, u32 Controller)
285 Fills the header part and address field of the _cmsg structure *cmsg
286 with the given values, zeroing the remainder of the structure so only
287 parameters with non-default values need to be changed before sending
288 the message.
289
290void capi_cmsg_answer(_cmsg *cmsg)
291 Sets the low bit of the Subcommand field in *cmsg, thereby converting
292 _REQ to _CONF and _IND to _RESP.
293
294char *capi_cmd2str(u8 Command, u8 Subcommand)
295 Returns the CAPI 2.0 message name corresponding to the given command
296 and subcommand values, as a static ASCII string. The return value may
297 be NULL if the command/subcommand is not one of those defined in the
298 CAPI 2.0 standard.
299
diff --git a/Documentation/isdn/README.gigaset b/Documentation/isdn/README.gigaset
index 02c0e9341dd8..f9963103ae3d 100644
--- a/Documentation/isdn/README.gigaset
+++ b/Documentation/isdn/README.gigaset
@@ -149,10 +149,8 @@ GigaSet 307x Device Driver
149 configuration files and chat scripts in the gigaset-VERSION/ppp directory 149 configuration files and chat scripts in the gigaset-VERSION/ppp directory
150 in the driver packages from http://sourceforge.net/projects/gigaset307x/. 150 in the driver packages from http://sourceforge.net/projects/gigaset307x/.
151 Please note that the USB drivers are not able to change the state of the 151 Please note that the USB drivers are not able to change the state of the
152 control lines (the M105 driver can be configured to use some undocumented 152 control lines. This means you must use "Stupid Mode" if you are using
153 control requests, if you really need the control lines, though). This means 153 wvdial or you should use the nocrtscts option of pppd.
154 you must use "Stupid Mode" if you are using wvdial or you should use the
155 nocrtscts option of pppd.
156 You must also ensure that the ppp_async module is loaded with the parameter 154 You must also ensure that the ppp_async module is loaded with the parameter
157 flag_time=0. You can do this e.g. by adding a line like 155 flag_time=0. You can do this e.g. by adding a line like
158 156
@@ -190,20 +188,19 @@ GigaSet 307x Device Driver
190 You can also use /sys/class/tty/ttyGxy/cidmode for changing the CID mode 188 You can also use /sys/class/tty/ttyGxy/cidmode for changing the CID mode
191 setting (ttyGxy is ttyGU0 or ttyGB0). 189 setting (ttyGxy is ttyGU0 or ttyGB0).
192 190
1932.6. M105 Undocumented USB Requests 1912.6. Unregistered Wireless Devices (M101/M105)
194 ------------------------------ 192 -----------------------------------------
195 193 The main purpose of the ser_gigaset and usb_gigaset drivers is to allow
196 The Gigaset M105 USB data box understands a couple of useful, but 194 the M101 and M105 wireless devices to be used as ISDN devices for ISDN
197 undocumented USB commands. These requests are not used in normal 195 connections through a Gigaset base. Therefore they assume that the device
198 operation (for wireless access to the base), but are needed for access 196 is registered to a DECT base.
199 to the M105's own configuration mode (registration to the base, baudrate 197
200 and line format settings, device status queries) via the gigacontr 198 If the M101/M105 device is not registered to a base, initialization of
201 utility. Their use is controlled by the kernel configuration option 199 the device fails, and a corresponding error message is logged by the
202 "Support for undocumented USB requests" (CONFIG_GIGASET_UNDOCREQ). If you 200 driver. In that situation, a restricted set of functions is available
203 encounter error code -ENOTTY when trying to use some features of the 201 which includes, in particular, those necessary for registering the device
204 M105, try setting that option to "y" via 'make {x,menu}config' and 202 to a base or for switching it between Fixed Part and Portable Part
205 recompiling the driver. 203 modes.
206
207 204
2083. Troubleshooting 2053. Troubleshooting
209 --------------- 206 ---------------
@@ -234,11 +231,12 @@ GigaSet 307x Device Driver
234 Select Unimodem mode for all DECT data adapters. (see section 2.4.) 231 Select Unimodem mode for all DECT data adapters. (see section 2.4.)
235 232
236 Problem: 233 Problem:
237 You want to configure your USB DECT data adapter (M105) but gigacontr 234 Messages like this:
238 reports an error: "/dev/ttyGU0: Inappropriate ioctl for device". 235 usb_gigaset 3-2:1.0: Could not initialize the device.
236 appear in your syslog.
239 Solution: 237 Solution:
240 Recompile the usb_gigaset driver with the kernel configuration option 238 Check whether your M10x wireless device is correctly registered to the
241 CONFIG_GIGASET_UNDOCREQ set to 'y'. (see section 2.6.) 239 Gigaset base. (see section 2.6.)
242 240
2433.2. Telling the driver to provide more information 2413.2. Telling the driver to provide more information
244 ---------------------------------------------- 242 ----------------------------------------------
diff --git a/Documentation/kbuild/kconfig.txt b/Documentation/kbuild/kconfig.txt
index 26a7c0a93193..849b5e56d06f 100644
--- a/Documentation/kbuild/kconfig.txt
+++ b/Documentation/kbuild/kconfig.txt
@@ -35,48 +35,26 @@ new .config files to see the differences:
35 35
36(Yes, we need something better here.) 36(Yes, we need something better here.)
37 37
38
39======================================================================
40menuconfig
41--------------------------------------------------
42
43SEARCHING for CONFIG symbols
44
45Searching in menuconfig:
46
47 The Search function searches for kernel configuration symbol
48 names, so you have to know something close to what you are
49 looking for.
50
51 Example:
52 /hotplug
53 This lists all config symbols that contain "hotplug",
54 e.g., HOTPLUG, HOTPLUG_CPU, MEMORY_HOTPLUG.
55
56 For search help, enter / followed TAB-TAB-TAB (to highlight
57 <Help>) and Enter. This will tell you that you can also use
58 regular expressions (regexes) in the search string, so if you
59 are not interested in MEMORY_HOTPLUG, you could try
60
61 /^hotplug
62
63
64______________________________________________________________________ 38______________________________________________________________________
65Color Themes for 'menuconfig' 39Environment variables for '*config'
66 40
67It is possible to select different color themes using the variable 41KCONFIG_CONFIG
68MENUCONFIG_COLOR. To select a theme use: 42--------------------------------------------------
43This environment variable can be used to specify a default kernel config
44file name to override the default name of ".config".
69 45
70 make MENUCONFIG_COLOR=<theme> menuconfig 46KCONFIG_OVERWRITECONFIG
47--------------------------------------------------
48If you set KCONFIG_OVERWRITECONFIG in the environment, Kconfig will not
49break symlinks when .config is a symlink to somewhere else.
71 50
72Available themes are: 51KCONFIG_NOTIMESTAMP
73 mono => selects colors suitable for monochrome displays 52--------------------------------------------------
74 blackbg => selects a color scheme with black background 53If this environment variable exists and is non-null, the timestamp line
75 classic => theme with blue background. The classic look 54in generated .config files is omitted.
76 bluetitle => a LCD friendly version of classic. (default)
77 55
78______________________________________________________________________ 56______________________________________________________________________
79Environment variables in 'menuconfig' 57Environment variables for '{allyes/allmod/allno/rand}config'
80 58
81KCONFIG_ALLCONFIG 59KCONFIG_ALLCONFIG
82-------------------------------------------------- 60--------------------------------------------------
@@ -95,8 +73,7 @@ values.
95This enables you to create "miniature" config (miniconfig) or custom 73This enables you to create "miniature" config (miniconfig) or custom
96config files containing just the config symbols that you are interested 74config files containing just the config symbols that you are interested
97in. Then the kernel config system generates the full .config file, 75in. Then the kernel config system generates the full .config file,
98including dependencies of your miniconfig file, based on the miniconfig 76including symbols of your miniconfig file.
99file.
100 77
101This 'KCONFIG_ALLCONFIG' file is a config file which contains 78This 'KCONFIG_ALLCONFIG' file is a config file which contains
102(usually a subset of all) preset config symbols. These variable 79(usually a subset of all) preset config symbols. These variable
@@ -113,26 +90,14 @@ These examples will disable most options (allnoconfig) but enable or
113disable the options that are explicitly listed in the specified 90disable the options that are explicitly listed in the specified
114mini-config files. 91mini-config files.
115 92
93______________________________________________________________________
94Environment variables for 'silentoldconfig'
95
116KCONFIG_NOSILENTUPDATE 96KCONFIG_NOSILENTUPDATE
117-------------------------------------------------- 97--------------------------------------------------
118If this variable has a non-blank value, it prevents silent kernel 98If this variable has a non-blank value, it prevents silent kernel
119config updates (requires explicit updates). 99config updates (requires explicit updates).
120 100
121KCONFIG_CONFIG
122--------------------------------------------------
123This environment variable can be used to specify a default kernel config
124file name to override the default name of ".config".
125
126KCONFIG_OVERWRITECONFIG
127--------------------------------------------------
128If you set KCONFIG_OVERWRITECONFIG in the environment, Kconfig will not
129break symlinks when .config is a symlink to somewhere else.
130
131KCONFIG_NOTIMESTAMP
132--------------------------------------------------
133If this environment variable exists and is non-null, the timestamp line
134in generated .config files is omitted.
135
136KCONFIG_AUTOCONFIG 101KCONFIG_AUTOCONFIG
137-------------------------------------------------- 102--------------------------------------------------
138This environment variable can be set to specify the path & name of the 103This environment variable can be set to specify the path & name of the
@@ -143,15 +108,54 @@ KCONFIG_AUTOHEADER
143This environment variable can be set to specify the path & name of the 108This environment variable can be set to specify the path & name of the
144"autoconf.h" (header) file. Its default value is "include/linux/autoconf.h". 109"autoconf.h" (header) file. Its default value is "include/linux/autoconf.h".
145 110
111
112======================================================================
113menuconfig
114--------------------------------------------------
115
116SEARCHING for CONFIG symbols
117
118Searching in menuconfig:
119
120 The Search function searches for kernel configuration symbol
121 names, so you have to know something close to what you are
122 looking for.
123
124 Example:
125 /hotplug
126 This lists all config symbols that contain "hotplug",
127 e.g., HOTPLUG, HOTPLUG_CPU, MEMORY_HOTPLUG.
128
129 For search help, enter / followed by TAB-TAB-TAB (to highlight
130 <Help>) and Enter. This will tell you that you can also use
131 regular expressions (regexes) in the search string, so if you
132 are not interested in MEMORY_HOTPLUG, you could try
133
134 /^hotplug
135
146______________________________________________________________________ 136______________________________________________________________________
147menuconfig User Interface Options 137User interface options for 'menuconfig'
148---------------------------------------------------------------------- 138
139MENUCONFIG_COLOR
140--------------------------------------------------
141It is possible to select different color themes using the variable
142MENUCONFIG_COLOR. To select a theme use:
143
144 make MENUCONFIG_COLOR=<theme> menuconfig
145
146Available themes are:
147 mono => selects colors suitable for monochrome displays
148 blackbg => selects a color scheme with black background
149 classic => theme with blue background. The classic look
150 bluetitle => a LCD friendly version of classic. (default)
151
149MENUCONFIG_MODE 152MENUCONFIG_MODE
150-------------------------------------------------- 153--------------------------------------------------
151This mode shows all sub-menus in one large tree. 154This mode shows all sub-menus in one large tree.
152 155
153Example: 156Example:
154 MENUCONFIG_MODE=single_menu make menuconfig 157 make MENUCONFIG_MODE=single_menu menuconfig
158
155 159
156====================================================================== 160======================================================================
157xconfig 161xconfig
diff --git a/Documentation/kbuild/modules.txt b/Documentation/kbuild/modules.txt
index b1096da953c8..0767cf69c69e 100644
--- a/Documentation/kbuild/modules.txt
+++ b/Documentation/kbuild/modules.txt
@@ -275,7 +275,7 @@ following files:
275 275
276 KERNELDIR := /lib/modules/`uname -r`/build 276 KERNELDIR := /lib/modules/`uname -r`/build
277 all:: 277 all::
278 $(MAKE) -C $KERNELDIR M=`pwd` $@ 278 $(MAKE) -C $(KERNELDIR) M=`pwd` $@
279 279
280 # Module specific targets 280 # Module specific targets
281 genbin: 281 genbin:
diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt
index 3f4bc840da8b..cab61d842259 100644
--- a/Documentation/kdump/kdump.txt
+++ b/Documentation/kdump/kdump.txt
@@ -108,7 +108,7 @@ There are two possible methods of using Kdump.
108 108
1092) Or use the system kernel binary itself as dump-capture kernel and there is 1092) Or use the system kernel binary itself as dump-capture kernel and there is
110 no need to build a separate dump-capture kernel. This is possible 110 no need to build a separate dump-capture kernel. This is possible
111 only with the architecutres which support a relocatable kernel. As 111 only with the architectures which support a relocatable kernel. As
112 of today, i386, x86_64, ppc64 and ia64 architectures support relocatable 112 of today, i386, x86_64, ppc64 and ia64 architectures support relocatable
113 kernel. 113 kernel.
114 114
@@ -222,7 +222,7 @@ Dump-capture kernel config options (Arch Dependent, ia64)
222---------------------------------------------------------- 222----------------------------------------------------------
223 223
224- No specific options are required to create a dump-capture kernel 224- No specific options are required to create a dump-capture kernel
225 for ia64, other than those specified in the arch idependent section 225 for ia64, other than those specified in the arch independent section
226 above. This means that it is possible to use the system kernel 226 above. This means that it is possible to use the system kernel
227 as a dump-capture kernel if desired. 227 as a dump-capture kernel if desired.
228 228
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e87bdbfbcc75..5578248c18a4 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -56,7 +56,6 @@ parameter is applicable:
56 ISAPNP ISA PnP code is enabled. 56 ISAPNP ISA PnP code is enabled.
57 ISDN Appropriate ISDN support is enabled. 57 ISDN Appropriate ISDN support is enabled.
58 JOY Appropriate joystick support is enabled. 58 JOY Appropriate joystick support is enabled.
59 KMEMTRACE kmemtrace is enabled.
60 LIBATA Libata driver is enabled 59 LIBATA Libata driver is enabled
61 LP Printer support is enabled. 60 LP Printer support is enabled.
62 LOOP Loopback device support is enabled. 61 LOOP Loopback device support is enabled.
@@ -329,11 +328,6 @@ and is between 256 and 4096 characters. It is defined in the file
329 flushed before they will be reused, which 328 flushed before they will be reused, which
330 is a lot faster 329 is a lot faster
331 330
332 amd_iommu_size= [HW,X86-64]
333 Define the size of the aperture for the AMD IOMMU
334 driver. Possible values are:
335 '32M', '64M' (default), '128M', '256M', '512M', '1G'
336
337 amijoy.map= [HW,JOY] Amiga joystick support 331 amijoy.map= [HW,JOY] Amiga joystick support
338 Map of devices attached to JOY0DAT and JOY1DAT 332 Map of devices attached to JOY0DAT and JOY1DAT
339 Format: <a>,<b> 333 Format: <a>,<b>
@@ -497,6 +491,13 @@ and is between 256 and 4096 characters. It is defined in the file
497 Also note the kernel might malfunction if you disable 491 Also note the kernel might malfunction if you disable
498 some critical bits. 492 some critical bits.
499 493
494 cmo_free_hint= [PPC] Format: { yes | no }
495 Specify whether pages are marked as being inactive
496 when they are freed. This is used in CMO environments
497 to determine OS memory pressure for page stealing by
498 a hypervisor.
499 Default: yes
500
500 code_bytes [X86] How many bytes of object code to print 501 code_bytes [X86] How many bytes of object code to print
501 in an oops report. 502 in an oops report.
502 Range: 0 - 8192 503 Range: 0 - 8192
@@ -545,6 +546,10 @@ and is between 256 and 4096 characters. It is defined in the file
545 console=brl,ttyS0 546 console=brl,ttyS0
546 For now, only VisioBraille is supported. 547 For now, only VisioBraille is supported.
547 548
549 consoleblank= [KNL] The console blank (screen saver) timeout in
550 seconds. Defaults to 10*60 = 10mins. A value of 0
551 disables the blank timer.
552
548 coredump_filter= 553 coredump_filter=
549 [KNL] Change the default value for 554 [KNL] Change the default value for
550 /proc/<pid>/coredump_filter. 555 /proc/<pid>/coredump_filter.
@@ -646,6 +651,13 @@ and is between 256 and 4096 characters. It is defined in the file
646 DMA-API debugging code disables itself because the 651 DMA-API debugging code disables itself because the
647 architectural default is too low. 652 architectural default is too low.
648 653
654 dma_debug_driver=<driver_name>
655 With this option the DMA-API debugging driver
656 filter feature can be enabled at boot time. Just
657 pass the driver to filter for as the parameter.
658 The filter can be disabled or changed to another
659 driver later using sysfs.
660
649 dscc4.setup= [NET] 661 dscc4.setup= [NET]
650 662
651 dtc3181e= [HW,SCSI] 663 dtc3181e= [HW,SCSI]
@@ -752,12 +764,25 @@ and is between 256 and 4096 characters. It is defined in the file
752 ia64_pal_cache_flush instead of SAL_CACHE_FLUSH. 764 ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
753 765
754 ftrace=[tracer] 766 ftrace=[tracer]
755 [ftrace] will set and start the specified tracer 767 [FTRACE] will set and start the specified tracer
756 as early as possible in order to facilitate early 768 as early as possible in order to facilitate early
757 boot debugging. 769 boot debugging.
758 770
759 ftrace_dump_on_oops 771 ftrace_dump_on_oops
760 [ftrace] will dump the trace buffers on oops. 772 [FTRACE] will dump the trace buffers on oops.
773
774 ftrace_filter=[function-list]
775 [FTRACE] Limit the functions traced by the function
776 tracer at boot up. function-list is a comma separated
777 list of functions. This list can be changed at run
778 time by the set_ftrace_filter file in the debugfs
779 tracing directory.
780
781 ftrace_notrace=[function-list]
782 [FTRACE] Do not trace the functions specified in
783 function-list. This list can be changed at run time
784 by the set_ftrace_notrace file in the debugfs
785 tracing directory.
761 786
762 gamecon.map[2|3]= 787 gamecon.map[2|3]=
763 [HW,JOY] Multisystem joystick and NES/SNES/PSX pad 788 [HW,JOY] Multisystem joystick and NES/SNES/PSX pad
@@ -873,11 +898,8 @@ and is between 256 and 4096 characters. It is defined in the file
873 898
874 ide-core.nodma= [HW] (E)IDE subsystem 899 ide-core.nodma= [HW] (E)IDE subsystem
875 Format: =0.0 to prevent dma on hda, =0.1 hdb =1.0 hdc 900 Format: =0.0 to prevent dma on hda, =0.1 hdb =1.0 hdc
876 .vlb_clock .pci_clock .noflush .noprobe .nowerr .cdrom 901 .vlb_clock .pci_clock .noflush .nohpa .noprobe .nowerr
877 .chs .ignore_cable are additional options 902 .cdrom .chs .ignore_cable are additional options
878 See Documentation/ide/ide.txt.
879
880 idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed
881 See Documentation/ide/ide.txt. 903 See Documentation/ide/ide.txt.
882 904
883 ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem 905 ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
@@ -914,6 +936,12 @@ and is between 256 and 4096 characters. It is defined in the file
914 Format: { "sha1" | "md5" } 936 Format: { "sha1" | "md5" }
915 default: "sha1" 937 default: "sha1"
916 938
939 ima_tcb [IMA]
940 Load a policy which meets the needs of the Trusted
941 Computing Base. This means IMA will measure all
942 programs exec'd, files mmap'd for exec, and all files
943 opened for read by uid=0.
944
917 in2000= [HW,SCSI] 945 in2000= [HW,SCSI]
918 See header of drivers/scsi/in2000.c. 946 See header of drivers/scsi/in2000.c.
919 947
@@ -1054,24 +1082,19 @@ and is between 256 and 4096 characters. It is defined in the file
1054 use the HighMem zone if it exists, and the Normal 1082 use the HighMem zone if it exists, and the Normal
1055 zone if it does not. 1083 zone if it does not.
1056 1084
1057 kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no }
1058 Controls whether kmemtrace is enabled
1059 at boot-time.
1060
1061 kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of
1062 subbufs kmemtrace's relay channel has. Set this
1063 higher than default (KMEMTRACE_N_SUBBUFS in code) if
1064 you experience buffer overruns.
1065
1066 kgdboc= [HW] kgdb over consoles. 1085 kgdboc= [HW] kgdb over consoles.
1067 Requires a tty driver that supports console polling. 1086 Requires a tty driver that supports console polling.
1068 (only serial suported for now) 1087 (only serial supported for now)
1069 Format: <serial_device>[,baud] 1088 Format: <serial_device>[,baud]
1070 1089
1071 kmac= [MIPS] korina ethernet MAC address. 1090 kmac= [MIPS] korina ethernet MAC address.
1072 Configure the RouterBoard 532 series on-chip 1091 Configure the RouterBoard 532 series on-chip
1073 Ethernet adapter MAC address. 1092 Ethernet adapter MAC address.
1074 1093
1094 kmemleak= [KNL] Boot-time kmemleak enable/disable
1095 Valid arguments: on, off
1096 Default: on
1097
1075 kstack=N [X86] Print N words from the kernel stack 1098 kstack=N [X86] Print N words from the kernel stack
1076 in oops dumps. 1099 in oops dumps.
1077 1100
@@ -1390,7 +1413,7 @@ and is between 256 and 4096 characters. It is defined in the file
1390 ('y', default) or cooked coordinates ('n') 1413 ('y', default) or cooked coordinates ('n')
1391 1414
1392 mtrr_chunk_size=nn[KMG] [X86] 1415 mtrr_chunk_size=nn[KMG] [X86]
1393 used for mtrr cleanup. It is largest continous chunk 1416 used for mtrr cleanup. It is largest continuous chunk
1394 that could hold holes aka. UC entries. 1417 that could hold holes aka. UC entries.
1395 1418
1396 mtrr_gran_size=nn[KMG] [X86] 1419 mtrr_gran_size=nn[KMG] [X86]
@@ -1535,6 +1558,10 @@ and is between 256 and 4096 characters. It is defined in the file
1535 register save and restore. The kernel will only save 1558 register save and restore. The kernel will only save
1536 legacy floating-point registers on task switch. 1559 legacy floating-point registers on task switch.
1537 1560
1561 noxsave [BUGS=X86] Disables x86 extended register state save
1562 and restore using xsave. The kernel will fall back to
1563 enabling legacy floating-point and sse state.
1564
1538 nohlt [BUGS=ARM,SH] Tells the kernel that the sleep(SH) or 1565 nohlt [BUGS=ARM,SH] Tells the kernel that the sleep(SH) or
1539 wfi(ARM) instruction doesn't work correctly and not to 1566 wfi(ARM) instruction doesn't work correctly and not to
1540 use it. This is also useful when using JTAG debugger. 1567 use it. This is also useful when using JTAG debugger.
@@ -1571,6 +1598,9 @@ and is between 256 and 4096 characters. It is defined in the file
1571 noinitrd [RAM] Tells the kernel not to load any configured 1598 noinitrd [RAM] Tells the kernel not to load any configured
1572 initial RAM disk. 1599 initial RAM disk.
1573 1600
1601 nointremap [X86-64, Intel-IOMMU] Do not enable interrupt
1602 remapping.
1603
1574 nointroute [IA-64] 1604 nointroute [IA-64]
1575 1605
1576 nojitter [IA64] Disables jitter checking for ITC timers. 1606 nojitter [IA64] Disables jitter checking for ITC timers.
@@ -1656,6 +1686,14 @@ and is between 256 and 4096 characters. It is defined in the file
1656 oprofile.timer= [HW] 1686 oprofile.timer= [HW]
1657 Use timer interrupt instead of performance counters 1687 Use timer interrupt instead of performance counters
1658 1688
1689 oprofile.cpu_type= Force an oprofile cpu type
1690 This might be useful if you have an older oprofile
1691 userland or if you want common events.
1692 Format: { archperfmon }
1693 archperfmon: [X86] Force use of architectural
1694 perfmon on Intel CPUs instead of the
1695 CPU specific event set.
1696
1659 osst= [HW,SCSI] SCSI Tape Driver 1697 osst= [HW,SCSI] SCSI Tape Driver
1660 Format: <buffer_size>,<write_threshold> 1698 Format: <buffer_size>,<write_threshold>
1661 See also Documentation/scsi/st.txt. 1699 See also Documentation/scsi/st.txt.
diff --git a/Documentation/kmemcheck.txt b/Documentation/kmemcheck.txt
new file mode 100644
index 000000000000..363044609dad
--- /dev/null
+++ b/Documentation/kmemcheck.txt
@@ -0,0 +1,773 @@
1GETTING STARTED WITH KMEMCHECK
2==============================
3
4Vegard Nossum <vegardno@ifi.uio.no>
5
6
7Contents
8========
90. Introduction
101. Downloading
112. Configuring and compiling
123. How to use
133.1. Booting
143.2. Run-time enable/disable
153.3. Debugging
163.4. Annotating false positives
174. Reporting errors
185. Technical description
19
20
210. Introduction
22===============
23
24kmemcheck is a debugging feature for the Linux Kernel. More specifically, it
25is a dynamic checker that detects and warns about some uses of uninitialized
26memory.
27
28Userspace programmers might be familiar with Valgrind's memcheck. The main
29difference between memcheck and kmemcheck is that memcheck works for userspace
30programs only, and kmemcheck works for the kernel only. The implementations
31are of course vastly different. Because of this, kmemcheck is not as accurate
32as memcheck, but it turns out to be good enough in practice to discover real
33programmer errors that the compiler is not able to find through static
34analysis.
35
36Enabling kmemcheck on a kernel will probably slow it down to the extent that
37the machine will not be usable for normal workloads such as e.g. an
38interactive desktop. kmemcheck will also cause the kernel to use about twice
39as much memory as normal. For this reason, kmemcheck is strictly a debugging
40feature.
41
42
431. Downloading
44==============
45
46kmemcheck can only be downloaded using git. If you want to write patches
47against the current code, you should use the kmemcheck development branch of
48the tip tree. It is also possible to use the linux-next tree, which also
49includes the latest version of kmemcheck.
50
51Assuming that you've already cloned the linux-2.6.git repository, all you
52have to do is add the -tip tree as a remote, like this:
53
54 $ git remote add tip git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git
55
56To actually download the tree, fetch the remote:
57
58 $ git fetch tip
59
60And to check out a new local branch with the kmemcheck code:
61
62 $ git checkout -b kmemcheck tip/kmemcheck
63
64General instructions for the -tip tree can be found here:
65http://people.redhat.com/mingo/tip.git/readme.txt
66
67
682. Configuring and compiling
69============================
70
71kmemcheck only works for the x86 (both 32- and 64-bit) platform. A number of
72configuration variables must have specific settings in order for the kmemcheck
73menu to even appear in "menuconfig". These are:
74
75 o CONFIG_CC_OPTIMIZE_FOR_SIZE=n
76
77 This option is located under "General setup" / "Optimize for size".
78
79 Without this, gcc will use certain optimizations that usually lead to
80 false positive warnings from kmemcheck. An example of this is a 16-bit
81 field in a struct, where gcc may load 32 bits, then discard the upper
82 16 bits. kmemcheck sees only the 32-bit load, and may trigger a
83 warning for the upper 16 bits (if they're uninitialized).
84
85 o CONFIG_SLAB=y or CONFIG_SLUB=y
86
87 This option is located under "General setup" / "Choose SLAB
88 allocator".
89
90 o CONFIG_FUNCTION_TRACER=n
91
92 This option is located under "Kernel hacking" / "Tracers" / "Kernel
93 Function Tracer"
94
95 When function tracing is compiled in, gcc emits a call to another
96 function at the beginning of every function. This means that when the
97 page fault handler is called, the ftrace framework will be called
98 before kmemcheck has had a chance to handle the fault. If ftrace then
99 modifies memory that was tracked by kmemcheck, the result is an
100 endless recursive page fault.
101
102 o CONFIG_DEBUG_PAGEALLOC=n
103
104 This option is located under "Kernel hacking" / "Debug page memory
105 allocations".
106
107In addition, I highly recommend turning on CONFIG_DEBUG_INFO=y. This is also
108located under "Kernel hacking". With this, you will be able to get line number
109information from the kmemcheck warnings, which is extremely valuable in
110debugging a problem. This option is not mandatory, however, because it slows
111down the compilation process and produces a much bigger kernel image.
112
113Now the kmemcheck menu should be visible (under "Kernel hacking" / "kmemcheck:
114trap use of uninitialized memory"). Here follows a description of the
115kmemcheck configuration variables:
116
117 o CONFIG_KMEMCHECK
118
119 This must be enabled in order to use kmemcheck at all...
120
121 o CONFIG_KMEMCHECK_[DISABLED | ENABLED | ONESHOT]_BY_DEFAULT
122
123 This option controls the status of kmemcheck at boot-time. "Enabled"
124 will enable kmemcheck right from the start, "disabled" will boot the
125 kernel as normal (but with the kmemcheck code compiled in, so it can
126 be enabled at run-time after the kernel has booted), and "one-shot" is
127 a special mode which will turn kmemcheck off automatically after
128 detecting the first use of uninitialized memory.
129
130 If you are using kmemcheck to actively debug a problem, then you
131 probably want to choose "enabled" here.
132
133 The one-shot mode is mostly useful in automated test setups because it
134 can prevent floods of warnings and increase the chances of the machine
135 surviving in case something is really wrong. In other cases, the one-
136 shot mode could actually be counter-productive because it would turn
137 itself off at the very first error -- in the case of a false positive
138 too -- and this would come in the way of debugging the specific
139 problem you were interested in.
140
141 If you would like to use your kernel as normal, but with a chance to
142 enable kmemcheck in case of some problem, it might be a good idea to
143 choose "disabled" here. When kmemcheck is disabled, most of the run-
144 time overhead is not incurred, and the kernel will be almost as fast
145 as normal.
146
147 o CONFIG_KMEMCHECK_QUEUE_SIZE
148
149 Select the maximum number of error reports to store in an internal
150 (fixed-size) buffer. Since errors can occur virtually anywhere and in
151 any context, we need a temporary storage area which is guaranteed not
152 to generate any other page faults when accessed. The queue will be
153 emptied as soon as a tasklet may be scheduled. If the queue is full,
154 new error reports will be lost.
155
156 The default value of 64 is probably fine. If some code produces more
157 than 64 errors within an irqs-off section, then the code is likely to
158 produce many, many more, too, and these additional reports seldom give
159 any more information (the first report is usually the most valuable
160 anyway).
161
162 This number might have to be adjusted if you are not using serial
163 console or similar to capture the kernel log. If you are using the
164 "dmesg" command to save the log, then getting a lot of kmemcheck
165 warnings might overflow the kernel log itself, and the earlier reports
166 will get lost in that way instead. Try setting this to 10 or so on
167 such a setup.
168
169 o CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT
170
171 Select the number of shadow bytes to save along with each entry of the
172 error-report queue. These bytes indicate what parts of an allocation
173 are initialized, uninitialized, etc. and will be displayed when an
174 error is detected to help the debugging of a particular problem.
175
176 The number entered here is actually the logarithm of the number of
177 bytes that will be saved. So if you pick for example 5 here, kmemcheck
178 will save 2^5 = 32 bytes.
179
180 The default value should be fine for debugging most problems. It also
181 fits nicely within 80 columns.
182
183 o CONFIG_KMEMCHECK_PARTIAL_OK
184
185 This option (when enabled) works around certain GCC optimizations that
186 produce 32-bit reads from 16-bit variables where the upper 16 bits are
187 thrown away afterwards.
188
189 The default value (enabled) is recommended. This may of course hide
190 some real errors, but disabling it would probably produce a lot of
191 false positives.
192
193 o CONFIG_KMEMCHECK_BITOPS_OK
194
195 This option silences warnings that would be generated for bit-field
196 accesses where not all the bits are initialized at the same time. This
197 may also hide some real bugs.
198
199 This option is probably obsolete, or it should be replaced with
200 the kmemcheck-/bitfield-annotations for the code in question. The
201 default value is therefore fine.
202
203Now compile the kernel as usual.
204
205
2063. How to use
207=============
208
2093.1. Booting
210============
211
212First some information about the command-line options. There is only one
213option specific to kmemcheck, and this is called "kmemcheck". It can be used
214to override the default mode as chosen by the CONFIG_KMEMCHECK_*_BY_DEFAULT
215option. Its possible settings are:
216
217 o kmemcheck=0 (disabled)
218 o kmemcheck=1 (enabled)
219 o kmemcheck=2 (one-shot mode)
220
221If SLUB debugging has been enabled in the kernel, it may take precedence over
222kmemcheck in such a way that the slab caches which are under SLUB debugging
223will not be tracked by kmemcheck. In order to ensure that this doesn't happen
224(even though it shouldn't by default), use SLUB's boot option "slub_debug",
225like this: slub_debug=-
226
227In fact, this option may also be used for fine-grained control over SLUB vs.
228kmemcheck. For example, if the command line includes "kmemcheck=1
229slub_debug=,dentry", then SLUB debugging will be used only for the "dentry"
230slab cache, and with kmemcheck tracking all the other caches. This is advanced
231usage, however, and is not generally recommended.
232
233
2343.2. Run-time enable/disable
235============================
236
237When the kernel has booted, it is possible to enable or disable kmemcheck at
238run-time. WARNING: This feature is still experimental and may cause false
239positive warnings to appear. Therefore, try not to use this. If you find that
240it doesn't work properly (e.g. you see an unreasonable amount of warnings), I
241will be happy to take bug reports.
242
243Use the file /proc/sys/kernel/kmemcheck for this purpose, e.g.:
244
245 $ echo 0 > /proc/sys/kernel/kmemcheck # disables kmemcheck
246
247The numbers are the same as for the kmemcheck= command-line option.
248
249
2503.3. Debugging
251==============
252
253A typical report will look something like this:
254
255WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
25680000000000000000000000000000000000000000088ffff0000000000000000
257 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
258 ^
259
260Pid: 1856, comm: ntpdate Not tainted 2.6.29-rc5 #264 945P-A
261RIP: 0010:[<ffffffff8104ede8>] [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190
262RSP: 0018:ffff88003cdf7d98 EFLAGS: 00210002
263RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009
264RDX: ffff88003e5d6018 RSI: ffff88003e5d6024 RDI: ffff88003cdf7e84
265RBP: ffff88003cdf7db8 R08: ffff88003e5d6000 R09: 0000000000000000
266R10: 0000000000000080 R11: 0000000000000000 R12: 000000000000000e
267R13: ffff88003cdf7e78 R14: ffff88003d530710 R15: ffff88003d5a98c8
268FS: 0000000000000000(0000) GS:ffff880001982000(0063) knlGS:00000
269CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033
270CR2: ffff88003f806ea0 CR3: 000000003c036000 CR4: 00000000000006a0
271DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
272DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400
273 [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
274 [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
275 [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
276 [<ffffffff8100c7b5>] int_signal+0x12/0x17
277 [<ffffffffffffffff>] 0xffffffffffffffff
278
279The single most valuable information in this report is the RIP (or EIP on 32-
280bit) value. This will help us pinpoint exactly which instruction caused
281the warning.
282
283If your kernel was compiled with CONFIG_DEBUG_INFO=y, then all we have to do
284is give this address to the addr2line program, like this:
285
286 $ addr2line -e vmlinux -i ffffffff8104ede8
287 arch/x86/include/asm/string_64.h:12
288 include/asm-generic/siginfo.h:287
289 kernel/signal.c:380
290 kernel/signal.c:410
291
292The "-e vmlinux" tells addr2line which file to look in. IMPORTANT: This must
293be the vmlinux of the kernel that produced the warning in the first place! If
294not, the line number information will almost certainly be wrong.
295
296The "-i" tells addr2line to also print the line numbers of inlined functions.
297In this case, the flag was very important, because otherwise, it would only
298have printed the first line, which is just a call to memcpy(), which could be
299called from a thousand places in the kernel, and is therefore not very useful.
300These inlined functions would not show up in the stack trace above, simply
301because the kernel doesn't load the extra debugging information. This
302technique can of course be used with ordinary kernel oopses as well.
303
304In this case, it's the caller of memcpy() that is interesting, and it can be
305found in include/asm-generic/siginfo.h, line 287:
306
307281 static inline void copy_siginfo(struct siginfo *to, struct siginfo *from)
308282 {
309283 if (from->si_code < 0)
310284 memcpy(to, from, sizeof(*to));
311285 else
312286 /* _sigchld is currently the largest know union member */
313287 memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld));
314288 }
315
316Since this was a read (kmemcheck usually warns about reads only, though it can
317warn about writes to unallocated or freed memory as well), it was probably the
318"from" argument which contained some uninitialized bytes. Following the chain
319of calls, we move upwards to see where "from" was allocated or initialized,
320kernel/signal.c, line 380:
321
322359 static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
323360 {
324...
325367 list_for_each_entry(q, &list->list, list) {
326368 if (q->info.si_signo == sig) {
327369 if (first)
328370 goto still_pending;
329371 first = q;
330...
331377 if (first) {
332378 still_pending:
333379 list_del_init(&first->list);
334380 copy_siginfo(info, &first->info);
335381 __sigqueue_free(first);
336...
337392 }
338393 }
339
340Here, it is &first->info that is being passed on to copy_siginfo(). The
341variable "first" was found on a list -- passed in as the second argument to
342collect_signal(). We continue our journey through the stack, to figure out
343where the item on "list" was allocated or initialized. We move to line 410:
344
345395 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
346396 siginfo_t *info)
347397 {
348...
349410 collect_signal(sig, pending, info);
350...
351414 }
352
353Now we need to follow the "pending" pointer, since that is being passed on to
354collect_signal() as "list". At this point, we've run out of lines from the
355"addr2line" output. Not to worry, we just paste the next addresses from the
356kmemcheck stack dump, i.e.:
357
358 [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
359 [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
360 [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
361 [<ffffffff8100c7b5>] int_signal+0x12/0x17
362
363 $ addr2line -e vmlinux -i ffffffff8104f04e ffffffff81050bd8 \
364 ffffffff8100b87d ffffffff8100c7b5
365 kernel/signal.c:446
366 kernel/signal.c:1806
367 arch/x86/kernel/signal.c:805
368 arch/x86/kernel/signal.c:871
369 arch/x86/kernel/entry_64.S:694
370
371Remember that since these addresses were found on the stack and not as the
372RIP value, they actually point to the _next_ instruction (they are return
373addresses). This becomes obvious when we look at the code for line 446:
374
375422 int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
376423 {
377...
378431 signr = __dequeue_signal(&tsk->signal->shared_pending,
379432 mask, info);
380433 /*
381434 * itimer signal ?
382435 *
383436 * itimers are process shared and we restart periodic
384437 * itimers in the signal delivery path to prevent DoS
385438 * attacks in the high resolution timer case. This is
386439 * compliant with the old way of self restarting
387440 * itimers, as the SIGALRM is a legacy signal and only
388441 * queued once. Changing the restart behaviour to
389442 * restart the timer in the signal dequeue path is
390443 * reducing the timer noise on heavy loaded !highres
391444 * systems too.
392445 */
393446 if (unlikely(signr == SIGALRM)) {
394...
395489 }
396
397So instead of looking at 446, we should be looking at 431, which is the line
398that executes just before 446. Here we see that what we are looking for is
399&tsk->signal->shared_pending.
400
401Our next task is now to figure out which function puts items on this
402"shared_pending" list. A crude, but efficient tool, is git grep:
403
404 $ git grep -n 'shared_pending' kernel/
405 ...
406 kernel/signal.c:828: pending = group ? &t->signal->shared_pending : &t->pending;
407 kernel/signal.c:1339: pending = group ? &t->signal->shared_pending : &t->pending;
408 ...
409
410There were more results, but none of them were related to list operations,
411and these were the only assignments. We inspect the line numbers more closely
412and find that this is indeed where items are being added to the list:
413
414816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
415817 int group)
416818 {
417...
418828 pending = group ? &t->signal->shared_pending : &t->pending;
419...
420851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
421852 (is_si_special(info) ||
422853 info->si_code >= 0)));
423854 if (q) {
424855 list_add_tail(&q->list, &pending->list);
425...
426890 }
427
428and:
429
4301309 int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
4311310 {
432....
4331339 pending = group ? &t->signal->shared_pending : &t->pending;
4341340 list_add_tail(&q->list, &pending->list);
435....
4361347 }
437
438In the first case, the list element we are looking for, "q", is being returned
439from the function __sigqueue_alloc(), which looks like an allocation function.
440Let's take a look at it:
441
442187 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
443188 int override_rlimit)
444189 {
445190 struct sigqueue *q = NULL;
446191 struct user_struct *user;
447192
448193 /*
449194 * We won't get problems with the target's UID changing under us
450195 * because changing it requires RCU be used, and if t != current, the
451196 * caller must be holding the RCU readlock (by way of a spinlock) and
452197 * we use RCU protection here
453198 */
454199 user = get_uid(__task_cred(t)->user);
455200 atomic_inc(&user->sigpending);
456201 if (override_rlimit ||
457202 atomic_read(&user->sigpending) <=
458203 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
459204 q = kmem_cache_alloc(sigqueue_cachep, flags);
460205 if (unlikely(q == NULL)) {
461206 atomic_dec(&user->sigpending);
462207 free_uid(user);
463208 } else {
464209 INIT_LIST_HEAD(&q->list);
465210 q->flags = 0;
466211 q->user = user;
467212 }
468213
469214 return q;
470215 }
471
472We see that this function initializes q->list, q->flags, and q->user. It seems
473that now is the time to look at the definition of "struct sigqueue", e.g.:
474
47514 struct sigqueue {
47615 struct list_head list;
47716 int flags;
47817 siginfo_t info;
47918 struct user_struct *user;
48019 };
481
482And, you might remember, it was a memcpy() on &first->info that caused the
483warning, so this makes perfect sense. It also seems reasonable to assume that
484it is the caller of __sigqueue_alloc() that has the responsibility of filling
485out (initializing) this member.
486
487But just which fields of the struct were uninitialized? Let's look at
488kmemcheck's report again:
489
490WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
49180000000000000000000000000000000000000000088ffff0000000000000000
492 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
493 ^
494
495These first two lines are the memory dump of the memory object itself, and the
496shadow bytemap, respectively. The memory object itself is in this case
497&first->info. Just beware that the start of this dump is NOT the start of the
498object itself! The position of the caret (^) corresponds with the address of
499the read (ffff88003e4a2024).
500
501The shadow bytemap dump legend is as follows:
502
503 i - initialized
504 u - uninitialized
505 a - unallocated (memory has been allocated by the slab layer, but has not
506 yet been handed off to anybody)
507 f - freed (memory has been allocated by the slab layer, but has been freed
508 by the previous owner)
509
510In order to figure out where (relative to the start of the object) the
511uninitialized memory was located, we have to look at the disassembly. For
512that, we'll need the RIP address again:
513
514RIP: 0010:[<ffffffff8104ede8>] [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190
515
516 $ objdump -d --no-show-raw-insn vmlinux | grep -C 8 ffffffff8104ede8:
517 ffffffff8104edc8: mov %r8,0x8(%r8)
518 ffffffff8104edcc: test %r10d,%r10d
519 ffffffff8104edcf: js ffffffff8104ee88 <__dequeue_signal+0x168>
520 ffffffff8104edd5: mov %rax,%rdx
521 ffffffff8104edd8: mov $0xc,%ecx
522 ffffffff8104eddd: mov %r13,%rdi
523 ffffffff8104ede0: mov $0x30,%eax
524 ffffffff8104ede5: mov %rdx,%rsi
525 ffffffff8104ede8: rep movsl %ds:(%rsi),%es:(%rdi)
526 ffffffff8104edea: test $0x2,%al
527 ffffffff8104edec: je ffffffff8104edf0 <__dequeue_signal+0xd0>
528 ffffffff8104edee: movsw %ds:(%rsi),%es:(%rdi)
529 ffffffff8104edf0: test $0x1,%al
530 ffffffff8104edf2: je ffffffff8104edf5 <__dequeue_signal+0xd5>
531 ffffffff8104edf4: movsb %ds:(%rsi),%es:(%rdi)
532 ffffffff8104edf5: mov %r8,%rdi
533 ffffffff8104edf8: callq ffffffff8104de60 <__sigqueue_free>
534
535As expected, it's the "rep movsl" instruction from the memcpy() that causes
536the warning. We know about REP MOVSL that it uses the register RCX to count
537the number of remaining iterations. By taking a look at the register dump
538again (from the kmemcheck report), we can figure out how many bytes were left
539to copy:
540
541RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009
542
543By looking at the disassembly, we also see that %ecx is being loaded with the
544value $0xc just before (ffffffff8104edd8), so we are very lucky. Keep in mind
545that this is the number of iterations, not bytes. And since this is a "long"
546operation, we need to multiply by 4 to get the number of bytes. So this means
547that the uninitialized value was encountered at 4 * (0xc - 0x9) = 12 bytes
548from the start of the object.
549
550We can now try to figure out which field of the "struct siginfo" was not
551initialized. This is the beginning of the struct:
552
55340 typedef struct siginfo {
55441 int si_signo;
55542 int si_errno;
55643 int si_code;
55744
55845 union {
559..
56092 } _sifields;
56193 } siginfo_t;
562
563On 64-bit, the int is 4 bytes long, so it must be the union member that has
564not been initialized. We can verify this using gdb:
565
566 $ gdb vmlinux
567 ...
568 (gdb) p &((struct siginfo *) 0)->_sifields
569 $1 = (union {...} *) 0x10
570
571Actually, it seems that the union member is located at offset 0x10 -- which
572means that gcc has inserted 4 bytes of padding between the members si_code
573and _sifields. We can now get a fuller picture of the memory dump:
574
575 _----------------------------=> si_code
576 / _--------------------=> (padding)
577 | / _------------=> _sifields(._kill._pid)
578 | | / _----=> _sifields(._kill._uid)
579 | | | /
580-------|-------|-------|-------|
58180000000000000000000000000000000000000000088ffff0000000000000000
582 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
583
584This allows us to realize another important fact: si_code contains the value
5850x80. Remember that x86 is little endian, so the first 4 bytes "80000000" are
586really the number 0x00000080. With a bit of research, we find that this is
587actually the constant SI_KERNEL defined in include/asm-generic/siginfo.h:
588
589144 #define SI_KERNEL 0x80 /* sent by the kernel from somewhere */
590
591This macro is used in exactly one place in the x86 kernel: In send_signal()
592in kernel/signal.c:
593
594816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
595817 int group)
596818 {
597...
598828 pending = group ? &t->signal->shared_pending : &t->pending;
599...
600851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
601852 (is_si_special(info) ||
602853 info->si_code >= 0)));
603854 if (q) {
604855 list_add_tail(&q->list, &pending->list);
605856 switch ((unsigned long) info) {
606...
607865 case (unsigned long) SEND_SIG_PRIV:
608866 q->info.si_signo = sig;
609867 q->info.si_errno = 0;
610868 q->info.si_code = SI_KERNEL;
611869 q->info.si_pid = 0;
612870 q->info.si_uid = 0;
613871 break;
614...
615890 }
616
617Not only does this match with the .si_code member, it also matches the place
618we found earlier when looking for where siginfo_t objects are enqueued on the
619"shared_pending" list.
620
621So to sum up: It seems that it is the padding introduced by the compiler
622between two struct fields that is uninitialized, and this gets reported when
623we do a memcpy() on the struct. This means that we have identified a false
624positive warning.
625
626Normally, kmemcheck will not report uninitialized accesses in memcpy() calls
627when both the source and destination addresses are tracked. (Instead, we copy
628the shadow bytemap as well). In this case, the destination address clearly
629was not tracked. We can dig a little deeper into the stack trace from above:
630
631 arch/x86/kernel/signal.c:805
632 arch/x86/kernel/signal.c:871
633 arch/x86/kernel/entry_64.S:694
634
635And we clearly see that the destination siginfo object is located on the
636stack:
637
638782 static void do_signal(struct pt_regs *regs)
639783 {
640784 struct k_sigaction ka;
641785 siginfo_t info;
642...
643804 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
644...
645854 }
646
647And this &info is what eventually gets passed to copy_siginfo() as the
648destination argument.
649
650Now, even though we didn't find an actual error here, the example is still a
651good one, because it shows how one would go about finding out what the report
652was all about.
653
654
6553.4. Annotating false positives
656===============================
657
658There are a few different ways to make annotations in the source code that
659will keep kmemcheck from checking and reporting certain allocations. Here
660they are:
661
662 o __GFP_NOTRACK_FALSE_POSITIVE
663
664 This flag can be passed to kmalloc() or kmem_cache_alloc() (therefore
665 also to other functions that end up calling one of these) to indicate
666 that the allocation should not be tracked because it would lead to
667 a false positive report. This is a "big hammer" way of silencing
668 kmemcheck; after all, even if the false positive pertains to
669 a particular field in a struct, for example, we will now lose the
670 ability to find (real) errors in other parts of the same struct.
671
672 Example:
673
674 /* No warnings will ever trigger on accessing any part of x */
675 x = kmalloc(sizeof *x, GFP_KERNEL | __GFP_NOTRACK_FALSE_POSITIVE);
676
677 o kmemcheck_bitfield_begin(name)/kmemcheck_bitfield_end(name) and
678 kmemcheck_annotate_bitfield(ptr, name)
679
680 The first two of these three macros can be used inside struct
681 definitions to signal, respectively, the beginning and end of a
682 bitfield. Additionally, this will assign the bitfield a name, which
683 is given as an argument to the macros.
684
685 Having used these markers, one can later use
686 kmemcheck_annotate_bitfield() at the point of allocation, to indicate
687 which parts of the allocation are part of a bitfield.
688
689 Example:
690
691 struct foo {
692 int x;
693
694 kmemcheck_bitfield_begin(flags);
695 int flag_a:1;
696 int flag_b:1;
697 kmemcheck_bitfield_end(flags);
698
699 int y;
700 };
701
702 struct foo *x = kmalloc(sizeof *x);
703
704 /* No warnings will trigger on accessing the bitfield of x */
705 kmemcheck_annotate_bitfield(x, flags);
706
707 Note that kmemcheck_annotate_bitfield() can be used even before the
708 return value of kmalloc() is checked -- in other words, passing NULL
709 as the first argument is legal (and will do nothing).
710
711
7124. Reporting errors
713===================
714
715As we have seen, kmemcheck will produce false positive reports. Therefore, it
716is not very wise to blindly post kmemcheck warnings to mailing lists and
717maintainers. Instead, I encourage maintainers and developers to find errors
718in their own code. If you get a warning, you can try to work around it, try
719to figure out if it's a real error or not, or simply ignore it. Most
720developers know their own code and will quickly and efficiently determine the
721root cause of a kmemcheck report. This is therefore also the most efficient
722way to work with kmemcheck.
723
724That said, we (the kmemcheck maintainers) will always be on the lookout for
725false positives that we can annotate and silence. So whatever you find,
726please drop us a note privately! Kernel configs and steps to reproduce (if
727available) are of course a great help too.
728
729Happy hacking!
730
731
7325. Technical description
733========================
734
735kmemcheck works by marking memory pages non-present. This means that whenever
736somebody attempts to access the page, a page fault is generated. The page
737fault handler notices that the page was in fact only hidden, and so it calls
738on the kmemcheck code to make further investigations.
739
740When the investigations are completed, kmemcheck "shows" the page by marking
741it present (as it would be under normal circumstances). This way, the
742interrupted code can continue as usual.
743
744But after the instruction has been executed, we should hide the page again, so
745that we can catch the next access too! Now kmemcheck makes use of a debugging
746feature of the processor, namely single-stepping. When the processor has
747finished the one instruction that generated the memory access, a debug
748exception is raised. From here, we simply hide the page again and continue
749execution, this time with the single-stepping feature turned off.
750
751kmemcheck requires some assistance from the memory allocator in order to work.
752The memory allocator needs to
753
754 1. Tell kmemcheck about newly allocated pages and pages that are about to
755 be freed. This allows kmemcheck to set up and tear down the shadow memory
756 for the pages in question. The shadow memory stores the status of each
757 byte in the allocation proper, e.g. whether it is initialized or
758 uninitialized.
759
760 2. Tell kmemcheck which parts of memory should be marked uninitialized.
761 There are actually a few more states, such as "not yet allocated" and
762 "recently freed".
763
764If a slab cache is set up using the SLAB_NOTRACK flag, it will never return
765memory that can take page faults because of kmemcheck.
766
767If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still
768request memory with the __GFP_NOTRACK or __GFP_NOTRACK_FALSE_POSITIVE flags.
769This does not prevent the page faults from occurring, however, but marks the
770object in question as being initialized so that no warnings will ever be
771produced for this object.
772
773Currently, the SLAB and SLUB allocators are supported by kmemcheck.
diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt
new file mode 100644
index 000000000000..0112da3b9ab8
--- /dev/null
+++ b/Documentation/kmemleak.txt
@@ -0,0 +1,142 @@
1Kernel Memory Leak Detector
2===========================
3
4Introduction
5------------
6
7Kmemleak provides a way of detecting possible kernel memory leaks in a
8way similar to a tracing garbage collector
9(http://en.wikipedia.org/wiki/Garbage_collection_%28computer_science%29#Tracing_garbage_collectors),
10with the difference that the orphan objects are not freed but only
11reported via /sys/kernel/debug/kmemleak. A similar method is used by the
12Valgrind tool (memcheck --leak-check) to detect the memory leaks in
13user-space applications.
14
15Usage
16-----
17
18CONFIG_DEBUG_KMEMLEAK in "Kernel hacking" has to be enabled. A kernel
19thread scans the memory every 10 minutes (by default) and prints any new
20unreferenced objects found. To trigger an intermediate scan and display
21all the possible memory leaks:
22
23 # mount -t debugfs nodev /sys/kernel/debug/
24 # cat /sys/kernel/debug/kmemleak
25
26Note that the orphan objects are listed in the order they were allocated
27and one object at the beginning of the list may cause other subsequent
28objects to be reported as orphan.
29
30Memory scanning parameters can be modified at run-time by writing to the
31/sys/kernel/debug/kmemleak file. The following parameters are supported:
32
33 off - disable kmemleak (irreversible)
34 stack=on - enable the task stacks scanning
35 stack=off - disable the task stacks scanning
36 scan=on - start the automatic memory scanning thread
37 scan=off - stop the automatic memory scanning thread
38 scan=<secs> - set the automatic memory scanning period in seconds (0
39 to disable it)
40
41Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on
42the kernel command line.
43
44Basic Algorithm
45---------------
46
47The memory allocations via kmalloc, vmalloc, kmem_cache_alloc and
48friends are traced and the pointers, together with additional
49information like size and stack trace, are stored in a prio search tree.
50The corresponding freeing function calls are tracked and the pointers
51removed from the kmemleak data structures.
52
53An allocated block of memory is considered orphan if no pointer to its
54start address or to any location inside the block can be found by
55scanning the memory (including saved registers). This means that there
56might be no way for the kernel to pass the address of the allocated
57block to a freeing function and therefore the block is considered a
58memory leak.
59
60The scanning algorithm steps:
61
62 1. mark all objects as white (remaining white objects will later be
63 considered orphan)
64 2. scan the memory starting with the data section and stacks, checking
65 the values against the addresses stored in the prio search tree. If
66 a pointer to a white object is found, the object is added to the
67 gray list
68 3. scan the gray objects for matching addresses (some white objects
69 can become gray and be added at the end of the gray list) until the
70 gray set is finished
71 4. the remaining white objects are considered orphan and reported via
72 /sys/kernel/debug/kmemleak
73
74Some allocated memory blocks have pointers stored in the kernel's
75internal data structures and they cannot be detected as orphans. To
76avoid this, kmemleak can also store the number of values pointing to an
77address inside the block address range that need to be found so that the
78block is not considered a leak. One example is __vmalloc().
79
80Kmemleak API
81------------
82
83See the include/linux/kmemleak.h header for the function prototypes.
84
85kmemleak_init - initialize kmemleak
86kmemleak_alloc - notify of a memory block allocation
87kmemleak_free - notify of a memory block freeing
88kmemleak_not_leak - mark an object as not a leak
89kmemleak_ignore - do not scan or report an object as leak
90kmemleak_scan_area - add scan areas inside a memory block
91kmemleak_no_scan - do not scan a memory block
92kmemleak_erase - erase an old value in a pointer variable
93kmemleak_alloc_recursive - as kmemleak_alloc but checks the recursiveness
94kmemleak_free_recursive - as kmemleak_free but checks the recursiveness
95
96Dealing with false positives/negatives
97--------------------------------------
98
99The false negatives are real memory leaks (orphan objects) but not
100reported by kmemleak because values found during the memory scanning
101point to such objects. To reduce the number of false negatives, kmemleak
102provides the kmemleak_ignore, kmemleak_scan_area, kmemleak_no_scan and
103kmemleak_erase functions (see above). The task stacks also increase the
104amount of false negatives and their scanning is not enabled by default.
105
106The false positives are objects wrongly reported as being memory leaks
107(orphan). For objects known not to be leaks, kmemleak provides the
108kmemleak_not_leak function. The kmemleak_ignore could also be used if
109the memory block is known not to contain other pointers and it will no
110longer be scanned.
111
112Some of the reported leaks are only transient, especially on SMP
113systems, because of pointers temporarily stored in CPU registers or
114stacks. Kmemleak defines MSECS_MIN_AGE (defaulting to 1000) representing
115the minimum age of an object to be reported as a memory leak.
116
117Limitations and Drawbacks
118-------------------------
119
120The main drawback is the reduced performance of memory allocation and
121freeing. To avoid other penalties, the memory scanning is only performed
122when the /sys/kernel/debug/kmemleak file is read. Anyway, this tool is
123intended for debugging purposes where the performance might not be the
124most important requirement.
125
126To keep the algorithm simple, kmemleak scans for values pointing to any
127address inside a block's address range. This may lead to an increased
128number of false negatives. However, it is likely that a real memory leak
129will eventually become visible.
130
131Another source of false negatives is the data stored in non-pointer
132values. In a future version, kmemleak could only scan the pointer
133members in the allocated structures. This feature would solve many of
134the false negative cases described above.
135
136The tool can report false positives. These are cases where an allocated
137block doesn't need to be freed (some cases in the init_call functions),
138the pointer is calculated by other methods than the usual container_of
139macro or the pointer is stored in a location not scanned by kmemleak.
140
141Page allocations and ioremap are not tracked. Only the ARM and x86
142architectures are currently supported.
diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt
index b2e374586bd8..c79ab996dada 100644
--- a/Documentation/kobject.txt
+++ b/Documentation/kobject.txt
@@ -132,7 +132,7 @@ kobject_name():
132 const char *kobject_name(const struct kobject * kobj); 132 const char *kobject_name(const struct kobject * kobj);
133 133
134There is a helper function to both initialize and add the kobject to the 134There is a helper function to both initialize and add the kobject to the
135kernel at the same time, called supprisingly enough kobject_init_and_add(): 135kernel at the same time, called surprisingly enough kobject_init_and_add():
136 136
137 int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype, 137 int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype,
138 struct kobject *parent, const char *fmt, ...); 138 struct kobject *parent, const char *fmt, ...);
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 1e7a769a10f9..053037a1fe6d 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -507,9 +507,9 @@ http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115)
507Appendix A: The kprobes debugfs interface 507Appendix A: The kprobes debugfs interface
508 508
509With recent kernels (> 2.6.20) the list of registered kprobes is visible 509With recent kernels (> 2.6.20) the list of registered kprobes is visible
510under the /debug/kprobes/ directory (assuming debugfs is mounted at /debug). 510under the /sys/kernel/debug/kprobes/ directory (assuming debugfs is mounted at /sys/kernel/debug).
511 511
512/debug/kprobes/list: Lists all registered probes on the system 512/sys/kernel/debug/kprobes/list: Lists all registered probes on the system
513 513
514c015d71a k vfs_read+0x0 514c015d71a k vfs_read+0x0
515c011a316 j do_fork+0x0 515c011a316 j do_fork+0x0
@@ -525,7 +525,7 @@ virtual addresses that correspond to modules that've been unloaded),
525such probes are marked with [GONE]. If the probe is temporarily disabled, 525such probes are marked with [GONE]. If the probe is temporarily disabled,
526such probes are marked with [DISABLED]. 526such probes are marked with [DISABLED].
527 527
528/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly. 528/sys/kernel/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly.
529 529
530Provides a knob to globally and forcibly turn registered kprobes ON or OFF. 530Provides a knob to globally and forcibly turn registered kprobes ON or OFF.
531By default, all kprobes are enabled. By echoing "0" to this file, all 531By default, all kprobes are enabled. By echoing "0" to this file, all
diff --git a/Documentation/laptops/acer-wmi.txt b/Documentation/laptops/acer-wmi.txt
index 5ee2a02b3b40..0768fcc3ba3e 100644
--- a/Documentation/laptops/acer-wmi.txt
+++ b/Documentation/laptops/acer-wmi.txt
@@ -40,7 +40,7 @@ NOTE: The Acer Aspire One is not supported hardware. It cannot work with
40acer-wmi until Acer fix their ACPI-WMI implementation on them, so has been 40acer-wmi until Acer fix their ACPI-WMI implementation on them, so has been
41blacklisted until that happens. 41blacklisted until that happens.
42 42
43Please see the website for the current list of known working hardare: 43Please see the website for the current list of known working hardware:
44 44
45http://code.google.com/p/aceracpi/wiki/SupportedHardware 45http://code.google.com/p/aceracpi/wiki/SupportedHardware
46 46
diff --git a/Documentation/laptops/sony-laptop.txt b/Documentation/laptops/sony-laptop.txt
index 8b2bc1572d98..23ce7d350d1a 100644
--- a/Documentation/laptops/sony-laptop.txt
+++ b/Documentation/laptops/sony-laptop.txt
@@ -22,7 +22,7 @@ If your laptop model supports it, you will find sysfs files in the
22/sys/class/backlight/sony/ 22/sys/class/backlight/sony/
23directory. You will be able to query and set the current screen 23directory. You will be able to query and set the current screen
24brightness: 24brightness:
25 brightness get/set screen brightness (an iteger 25 brightness get/set screen brightness (an integer
26 between 0 and 7) 26 between 0 and 7)
27 actual_brightness reading from this file will query the HW 27 actual_brightness reading from this file will query the HW
28 to get real brightness value 28 to get real brightness value
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt
index e7e9a69069e1..78e354b42f67 100644
--- a/Documentation/laptops/thinkpad-acpi.txt
+++ b/Documentation/laptops/thinkpad-acpi.txt
@@ -506,7 +506,7 @@ generate input device EV_KEY events.
506In addition to the EV_KEY events, thinkpad-acpi may also issue EV_SW 506In addition to the EV_KEY events, thinkpad-acpi may also issue EV_SW
507events for switches: 507events for switches:
508 508
509SW_RFKILL_ALL T60 and later hardare rfkill rocker switch 509SW_RFKILL_ALL T60 and later hardware rfkill rocker switch
510SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A 510SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A
511 511
512Non hot-key ACPI HKEY event map: 512Non hot-key ACPI HKEY event map:
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
index 1f4f9e888bd1..28c8cdfcafd8 100644
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
@@ -1,6 +1,5 @@
1# This creates the demonstration utility "lguest" which runs a Linux guest. 1# This creates the demonstration utility "lguest" which runs a Linux guest.
2CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE 2CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE
3LDLIBS:=-lz
4 3
5all: lguest 4all: lguest
6 5
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index d36fcc0f2715..9ebcd6ef361b 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -16,6 +16,7 @@
16#include <sys/types.h> 16#include <sys/types.h>
17#include <sys/stat.h> 17#include <sys/stat.h>
18#include <sys/wait.h> 18#include <sys/wait.h>
19#include <sys/eventfd.h>
19#include <fcntl.h> 20#include <fcntl.h>
20#include <stdbool.h> 21#include <stdbool.h>
21#include <errno.h> 22#include <errno.h>
@@ -59,7 +60,6 @@ typedef uint8_t u8;
59/*:*/ 60/*:*/
60 61
61#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ 62#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
62#define NET_PEERNUM 1
63#define BRIDGE_PFX "bridge:" 63#define BRIDGE_PFX "bridge:"
64#ifndef SIOCBRADDIF 64#ifndef SIOCBRADDIF
65#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ 65#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
@@ -76,19 +76,12 @@ static bool verbose;
76 do { if (verbose) printf(args); } while(0) 76 do { if (verbose) printf(args); } while(0)
77/*:*/ 77/*:*/
78 78
79/* File descriptors for the Waker. */
80struct {
81 int pipe[2];
82 int lguest_fd;
83} waker_fds;
84
85/* The pointer to the start of guest memory. */ 79/* The pointer to the start of guest memory. */
86static void *guest_base; 80static void *guest_base;
87/* The maximum guest physical address allowed, and maximum possible. */ 81/* The maximum guest physical address allowed, and maximum possible. */
88static unsigned long guest_limit, guest_max; 82static unsigned long guest_limit, guest_max;
89/* The pipe for signal hander to write to. */ 83/* The /dev/lguest file descriptor. */
90static int timeoutpipe[2]; 84static int lguest_fd;
91static unsigned int timeout_usec = 500;
92 85
93/* a per-cpu variable indicating whose vcpu is currently running */ 86/* a per-cpu variable indicating whose vcpu is currently running */
94static unsigned int __thread cpu_id; 87static unsigned int __thread cpu_id;
@@ -96,11 +89,6 @@ static unsigned int __thread cpu_id;
96/* This is our list of devices. */ 89/* This is our list of devices. */
97struct device_list 90struct device_list
98{ 91{
99 /* Summary information about the devices in our list: ready to pass to
100 * select() to ask which need servicing.*/
101 fd_set infds;
102 int max_infd;
103
104 /* Counter to assign interrupt numbers. */ 92 /* Counter to assign interrupt numbers. */
105 unsigned int next_irq; 93 unsigned int next_irq;
106 94
@@ -126,22 +114,21 @@ struct device
126 /* The linked-list pointer. */ 114 /* The linked-list pointer. */
127 struct device *next; 115 struct device *next;
128 116
129 /* The this device's descriptor, as mapped into the Guest. */ 117 /* The device's descriptor, as mapped into the Guest. */
130 struct lguest_device_desc *desc; 118 struct lguest_device_desc *desc;
131 119
120 /* We can't trust desc values once Guest has booted: we use these. */
121 unsigned int feature_len;
122 unsigned int num_vq;
123
132 /* The name of this device, for --verbose. */ 124 /* The name of this device, for --verbose. */
133 const char *name; 125 const char *name;
134 126
135 /* If handle_input is set, it wants to be called when this file
136 * descriptor is ready. */
137 int fd;
138 bool (*handle_input)(int fd, struct device *me);
139
140 /* Any queues attached to this device */ 127 /* Any queues attached to this device */
141 struct virtqueue *vq; 128 struct virtqueue *vq;
142 129
143 /* Handle status being finalized (ie. feature bits stable). */ 130 /* Is it operational */
144 void (*ready)(struct device *me); 131 bool running;
145 132
146 /* Device-specific data. */ 133 /* Device-specific data. */
147 void *priv; 134 void *priv;
@@ -164,22 +151,28 @@ struct virtqueue
164 /* Last available index we saw. */ 151 /* Last available index we saw. */
165 u16 last_avail_idx; 152 u16 last_avail_idx;
166 153
167 /* The routine to call when the Guest pings us, or timeout. */ 154 /* How many are used since we sent last irq? */
168 void (*handle_output)(int fd, struct virtqueue *me, bool timeout); 155 unsigned int pending_used;
169 156
170 /* Outstanding buffers */ 157 /* Eventfd where Guest notifications arrive. */
171 unsigned int inflight; 158 int eventfd;
172 159
173 /* Is this blocked awaiting a timer? */ 160 /* Function for the thread which is servicing this virtqueue. */
174 bool blocked; 161 void (*service)(struct virtqueue *vq);
162 pid_t thread;
175}; 163};
176 164
177/* Remember the arguments to the program so we can "reboot" */ 165/* Remember the arguments to the program so we can "reboot" */
178static char **main_args; 166static char **main_args;
179 167
180/* Since guest is UP and we don't run at the same time, we don't need barriers. 168/* The original tty settings to restore on exit. */
181 * But I include them in the code in case others copy it. */ 169static struct termios orig_term;
182#define wmb() 170
171/* We have to be careful with barriers: our devices are all run in separate
172 * threads and so we need to make sure that changes visible to the Guest happen
173 * in precise order. */
174#define wmb() __asm__ __volatile__("" : : : "memory")
175#define mb() __asm__ __volatile__("" : : : "memory")
183 176
184/* Convert an iovec element to the given type. 177/* Convert an iovec element to the given type.
185 * 178 *
@@ -245,7 +238,7 @@ static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len)
245static u8 *get_feature_bits(struct device *dev) 238static u8 *get_feature_bits(struct device *dev)
246{ 239{
247 return (u8 *)(dev->desc + 1) 240 return (u8 *)(dev->desc + 1)
248 + dev->desc->num_vq * sizeof(struct lguest_vqconfig); 241 + dev->num_vq * sizeof(struct lguest_vqconfig);
249} 242}
250 243
251/*L:100 The Launcher code itself takes us out into userspace, that scary place 244/*L:100 The Launcher code itself takes us out into userspace, that scary place
@@ -505,99 +498,19 @@ static void concat(char *dst, char *args[])
505 * saw the arguments it expects when we looked at initialize() in lguest_user.c: 498 * saw the arguments it expects when we looked at initialize() in lguest_user.c:
506 * the base of Guest "physical" memory, the top physical page to allow and the 499 * the base of Guest "physical" memory, the top physical page to allow and the
507 * entry point for the Guest. */ 500 * entry point for the Guest. */
508static int tell_kernel(unsigned long start) 501static void tell_kernel(unsigned long start)
509{ 502{
510 unsigned long args[] = { LHREQ_INITIALIZE, 503 unsigned long args[] = { LHREQ_INITIALIZE,
511 (unsigned long)guest_base, 504 (unsigned long)guest_base,
512 guest_limit / getpagesize(), start }; 505 guest_limit / getpagesize(), start };
513 int fd;
514
515 verbose("Guest: %p - %p (%#lx)\n", 506 verbose("Guest: %p - %p (%#lx)\n",
516 guest_base, guest_base + guest_limit, guest_limit); 507 guest_base, guest_base + guest_limit, guest_limit);
517 fd = open_or_die("/dev/lguest", O_RDWR); 508 lguest_fd = open_or_die("/dev/lguest", O_RDWR);
518 if (write(fd, args, sizeof(args)) < 0) 509 if (write(lguest_fd, args, sizeof(args)) < 0)
519 err(1, "Writing to /dev/lguest"); 510 err(1, "Writing to /dev/lguest");
520
521 /* We return the /dev/lguest file descriptor to control this Guest */
522 return fd;
523} 511}
524/*:*/ 512/*:*/
525 513
526static void add_device_fd(int fd)
527{
528 FD_SET(fd, &devices.infds);
529 if (fd > devices.max_infd)
530 devices.max_infd = fd;
531}
532
533/*L:200
534 * The Waker.
535 *
536 * With console, block and network devices, we can have lots of input which we
537 * need to process. We could try to tell the kernel what file descriptors to
538 * watch, but handing a file descriptor mask through to the kernel is fairly
539 * icky.
540 *
541 * Instead, we clone off a thread which watches the file descriptors and writes
542 * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host
543 * stop running the Guest. This causes the Launcher to return from the
544 * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
545 * the LHREQ_BREAK and wake us up again.
546 *
547 * This, of course, is merely a different *kind* of icky.
548 *
549 * Given my well-known antipathy to threads, I'd prefer to use processes. But
550 * it's easier to share Guest memory with threads, and trivial to share the
551 * devices.infds as the Launcher changes it.
552 */
553static int waker(void *unused)
554{
555 /* Close the write end of the pipe: only the Launcher has it open. */
556 close(waker_fds.pipe[1]);
557
558 for (;;) {
559 fd_set rfds = devices.infds;
560 unsigned long args[] = { LHREQ_BREAK, 1 };
561 unsigned int maxfd = devices.max_infd;
562
563 /* We also listen to the pipe from the Launcher. */
564 FD_SET(waker_fds.pipe[0], &rfds);
565 if (waker_fds.pipe[0] > maxfd)
566 maxfd = waker_fds.pipe[0];
567
568 /* Wait until input is ready from one of the devices. */
569 select(maxfd+1, &rfds, NULL, NULL, NULL);
570
571 /* Message from Launcher? */
572 if (FD_ISSET(waker_fds.pipe[0], &rfds)) {
573 char c;
574 /* If this fails, then assume Launcher has exited.
575 * Don't do anything on exit: we're just a thread! */
576 if (read(waker_fds.pipe[0], &c, 1) != 1)
577 _exit(0);
578 continue;
579 }
580
581 /* Send LHREQ_BREAK command to snap the Launcher out of it. */
582 pwrite(waker_fds.lguest_fd, args, sizeof(args), cpu_id);
583 }
584 return 0;
585}
586
587/* This routine just sets up a pipe to the Waker process. */
588static void setup_waker(int lguest_fd)
589{
590 /* This pipe is closed when Launcher dies, telling Waker. */
591 if (pipe(waker_fds.pipe) != 0)
592 err(1, "Creating pipe for Waker");
593
594 /* Waker also needs to know the lguest fd */
595 waker_fds.lguest_fd = lguest_fd;
596
597 if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1)
598 err(1, "Creating Waker");
599}
600
601/* 514/*
602 * Device Handling. 515 * Device Handling.
603 * 516 *
@@ -623,49 +536,90 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
623/* Each buffer in the virtqueues is actually a chain of descriptors. This 536/* Each buffer in the virtqueues is actually a chain of descriptors. This
624 * function returns the next descriptor in the chain, or vq->vring.num if we're 537 * function returns the next descriptor in the chain, or vq->vring.num if we're
625 * at the end. */ 538 * at the end. */
626static unsigned next_desc(struct virtqueue *vq, unsigned int i) 539static unsigned next_desc(struct vring_desc *desc,
540 unsigned int i, unsigned int max)
627{ 541{
628 unsigned int next; 542 unsigned int next;
629 543
630 /* If this descriptor says it doesn't chain, we're done. */ 544 /* If this descriptor says it doesn't chain, we're done. */
631 if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT)) 545 if (!(desc[i].flags & VRING_DESC_F_NEXT))
632 return vq->vring.num; 546 return max;
633 547
634 /* Check they're not leading us off end of descriptors. */ 548 /* Check they're not leading us off end of descriptors. */
635 next = vq->vring.desc[i].next; 549 next = desc[i].next;
636 /* Make sure compiler knows to grab that: we don't want it changing! */ 550 /* Make sure compiler knows to grab that: we don't want it changing! */
637 wmb(); 551 wmb();
638 552
639 if (next >= vq->vring.num) 553 if (next >= max)
640 errx(1, "Desc next is %u", next); 554 errx(1, "Desc next is %u", next);
641 555
642 return next; 556 return next;
643} 557}
644 558
559/* This actually sends the interrupt for this virtqueue */
560static void trigger_irq(struct virtqueue *vq)
561{
562 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
563
564 /* Don't inform them if nothing used. */
565 if (!vq->pending_used)
566 return;
567 vq->pending_used = 0;
568
569 /* If they don't want an interrupt, don't send one, unless empty. */
570 if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
571 && lg_last_avail(vq) != vq->vring.avail->idx)
572 return;
573
574 /* Send the Guest an interrupt to tell them we used something up. */
575 if (write(lguest_fd, buf, sizeof(buf)) != 0)
576 err(1, "Triggering irq %i", vq->config.irq);
577}
578
645/* This looks in the virtqueue and for the first available buffer, and converts 579/* This looks in the virtqueue and for the first available buffer, and converts
646 * it to an iovec for convenient access. Since descriptors consist of some 580 * it to an iovec for convenient access. Since descriptors consist of some
647 * number of output then some number of input descriptors, it's actually two 581 * number of output then some number of input descriptors, it's actually two
648 * iovecs, but we pack them into one and note how many of each there were. 582 * iovecs, but we pack them into one and note how many of each there were.
649 * 583 *
650 * This function returns the descriptor number found, or vq->vring.num (which 584 * This function returns the descriptor number found. */
651 * is never a valid descriptor number) if none was found. */ 585static unsigned wait_for_vq_desc(struct virtqueue *vq,
652static unsigned get_vq_desc(struct virtqueue *vq, 586 struct iovec iov[],
653 struct iovec iov[], 587 unsigned int *out_num, unsigned int *in_num)
654 unsigned int *out_num, unsigned int *in_num)
655{ 588{
656 unsigned int i, head; 589 unsigned int i, head, max;
657 u16 last_avail; 590 struct vring_desc *desc;
591 u16 last_avail = lg_last_avail(vq);
592
593 while (last_avail == vq->vring.avail->idx) {
594 u64 event;
595
596 /* OK, tell Guest about progress up to now. */
597 trigger_irq(vq);
598
599 /* OK, now we need to know about added descriptors. */
600 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
601
602 /* They could have slipped one in as we were doing that: make
603 * sure it's written, then check again. */
604 mb();
605 if (last_avail != vq->vring.avail->idx) {
606 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
607 break;
608 }
609
610 /* Nothing new? Wait for eventfd to tell us they refilled. */
611 if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
612 errx(1, "Event read failed?");
613
614 /* We don't need to be notified again. */
615 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
616 }
658 617
659 /* Check it isn't doing very strange things with descriptor numbers. */ 618 /* Check it isn't doing very strange things with descriptor numbers. */
660 last_avail = lg_last_avail(vq);
661 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) 619 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
662 errx(1, "Guest moved used index from %u to %u", 620 errx(1, "Guest moved used index from %u to %u",
663 last_avail, vq->vring.avail->idx); 621 last_avail, vq->vring.avail->idx);
664 622
665 /* If there's nothing new since last we looked, return invalid. */
666 if (vq->vring.avail->idx == last_avail)
667 return vq->vring.num;
668
669 /* Grab the next descriptor number they're advertising, and increment 623 /* Grab the next descriptor number they're advertising, and increment
670 * the index we've seen. */ 624 * the index we've seen. */
671 head = vq->vring.avail->ring[last_avail % vq->vring.num]; 625 head = vq->vring.avail->ring[last_avail % vq->vring.num];
@@ -678,15 +632,28 @@ static unsigned get_vq_desc(struct virtqueue *vq,
678 /* When we start there are none of either input nor output. */ 632 /* When we start there are none of either input nor output. */
679 *out_num = *in_num = 0; 633 *out_num = *in_num = 0;
680 634
635 max = vq->vring.num;
636 desc = vq->vring.desc;
681 i = head; 637 i = head;
638
639 /* If this is an indirect entry, then this buffer contains a descriptor
640 * table which we handle as if it's any normal descriptor chain. */
641 if (desc[i].flags & VRING_DESC_F_INDIRECT) {
642 if (desc[i].len % sizeof(struct vring_desc))
643 errx(1, "Invalid size for indirect buffer table");
644
645 max = desc[i].len / sizeof(struct vring_desc);
646 desc = check_pointer(desc[i].addr, desc[i].len);
647 i = 0;
648 }
649
682 do { 650 do {
683 /* Grab the first descriptor, and check it's OK. */ 651 /* Grab the first descriptor, and check it's OK. */
684 iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len; 652 iov[*out_num + *in_num].iov_len = desc[i].len;
685 iov[*out_num + *in_num].iov_base 653 iov[*out_num + *in_num].iov_base
686 = check_pointer(vq->vring.desc[i].addr, 654 = check_pointer(desc[i].addr, desc[i].len);
687 vq->vring.desc[i].len);
688 /* If this is an input descriptor, increment that count. */ 655 /* If this is an input descriptor, increment that count. */
689 if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE) 656 if (desc[i].flags & VRING_DESC_F_WRITE)
690 (*in_num)++; 657 (*in_num)++;
691 else { 658 else {
692 /* If it's an output descriptor, they're all supposed 659 /* If it's an output descriptor, they're all supposed
@@ -697,11 +664,10 @@ static unsigned get_vq_desc(struct virtqueue *vq,
697 } 664 }
698 665
699 /* If we've got too many, that implies a descriptor loop. */ 666 /* If we've got too many, that implies a descriptor loop. */
700 if (*out_num + *in_num > vq->vring.num) 667 if (*out_num + *in_num > max)
701 errx(1, "Looped descriptor"); 668 errx(1, "Looped descriptor");
702 } while ((i = next_desc(vq, i)) != vq->vring.num); 669 } while ((i = next_desc(desc, i, max)) != max);
703 670
704 vq->inflight++;
705 return head; 671 return head;
706} 672}
707 673
@@ -719,44 +685,20 @@ static void add_used(struct virtqueue *vq, unsigned int head, int len)
719 /* Make sure buffer is written before we update index. */ 685 /* Make sure buffer is written before we update index. */
720 wmb(); 686 wmb();
721 vq->vring.used->idx++; 687 vq->vring.used->idx++;
722 vq->inflight--; 688 vq->pending_used++;
723}
724
725/* This actually sends the interrupt for this virtqueue */
726static void trigger_irq(int fd, struct virtqueue *vq)
727{
728 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
729
730 /* If they don't want an interrupt, don't send one, unless empty. */
731 if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
732 && vq->inflight)
733 return;
734
735 /* Send the Guest an interrupt tell them we used something up. */
736 if (write(fd, buf, sizeof(buf)) != 0)
737 err(1, "Triggering irq %i", vq->config.irq);
738} 689}
739 690
740/* And here's the combo meal deal. Supersize me! */ 691/* And here's the combo meal deal. Supersize me! */
741static void add_used_and_trigger(int fd, struct virtqueue *vq, 692static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
742 unsigned int head, int len)
743{ 693{
744 add_used(vq, head, len); 694 add_used(vq, head, len);
745 trigger_irq(fd, vq); 695 trigger_irq(vq);
746} 696}
747 697
748/* 698/*
749 * The Console 699 * The Console
750 * 700 *
751 * Here is the input terminal setting we save, and the routine to restore them 701 * We associate some data with the console for our exit hack. */
752 * on exit so the user gets their terminal back. */
753static struct termios orig_term;
754static void restore_term(void)
755{
756 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
757}
758
759/* We associate some data with the console for our exit hack. */
760struct console_abort 702struct console_abort
761{ 703{
762 /* How many times have they hit ^C? */ 704 /* How many times have they hit ^C? */
@@ -766,276 +708,275 @@ struct console_abort
766}; 708};
767 709
768/* This is the routine which handles console input (ie. stdin). */ 710/* This is the routine which handles console input (ie. stdin). */
769static bool handle_console_input(int fd, struct device *dev) 711static void console_input(struct virtqueue *vq)
770{ 712{
771 int len; 713 int len;
772 unsigned int head, in_num, out_num; 714 unsigned int head, in_num, out_num;
773 struct iovec iov[dev->vq->vring.num]; 715 struct console_abort *abort = vq->dev->priv;
774 struct console_abort *abort = dev->priv; 716 struct iovec iov[vq->vring.num];
775
776 /* First we need a console buffer from the Guests's input virtqueue. */
777 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
778
779 /* If they're not ready for input, stop listening to this file
780 * descriptor. We'll start again once they add an input buffer. */
781 if (head == dev->vq->vring.num)
782 return false;
783 717
718 /* Make sure there's a descriptor waiting. */
719 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
784 if (out_num) 720 if (out_num)
785 errx(1, "Output buffers in console in queue?"); 721 errx(1, "Output buffers in console in queue?");
786 722
787 /* This is why we convert to iovecs: the readv() call uses them, and so 723 /* Read it in. */
788 * it reads straight into the Guest's buffer. */ 724 len = readv(STDIN_FILENO, iov, in_num);
789 len = readv(dev->fd, iov, in_num);
790 if (len <= 0) { 725 if (len <= 0) {
791 /* This implies that the console is closed, is /dev/null, or 726 /* Ran out of input? */
792 * something went terribly wrong. */
793 warnx("Failed to get console input, ignoring console."); 727 warnx("Failed to get console input, ignoring console.");
794 /* Put the input terminal back. */ 728 /* For simplicity, dying threads kill the whole Launcher. So
795 restore_term(); 729 * just nap here. */
796 /* Remove callback from input vq, so it doesn't restart us. */ 730 for (;;)
797 dev->vq->handle_output = NULL; 731 pause();
798 /* Stop listening to this fd: don't call us again. */
799 return false;
800 } 732 }
801 733
802 /* Tell the Guest about the new input. */ 734 add_used_and_trigger(vq, head, len);
803 add_used_and_trigger(fd, dev->vq, head, len);
804 735
805 /* Three ^C within one second? Exit. 736 /* Three ^C within one second? Exit.
806 * 737 *
807 * This is such a hack, but works surprisingly well. Each ^C has to be 738 * This is such a hack, but works surprisingly well. Each ^C has to
808 * in a buffer by itself, so they can't be too fast. But we check that 739 * be in a buffer by itself, so they can't be too fast. But we check
809 * we get three within about a second, so they can't be too slow. */ 740 * that we get three within about a second, so they can't be too
810 if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { 741 * slow. */
811 if (!abort->count++) 742 if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
812 gettimeofday(&abort->start, NULL);
813 else if (abort->count == 3) {
814 struct timeval now;
815 gettimeofday(&now, NULL);
816 if (now.tv_sec <= abort->start.tv_sec+1) {
817 unsigned long args[] = { LHREQ_BREAK, 0 };
818 /* Close the fd so Waker will know it has to
819 * exit. */
820 close(waker_fds.pipe[1]);
821 /* Just in case Waker is blocked in BREAK, send
822 * unbreak now. */
823 write(fd, args, sizeof(args));
824 exit(2);
825 }
826 abort->count = 0;
827 }
828 } else
829 /* Any other key resets the abort counter. */
830 abort->count = 0; 743 abort->count = 0;
744 return;
745 }
831 746
832 /* Everything went OK! */ 747 abort->count++;
833 return true; 748 if (abort->count == 1)
749 gettimeofday(&abort->start, NULL);
750 else if (abort->count == 3) {
751 struct timeval now;
752 gettimeofday(&now, NULL);
753 /* Kill all Launcher processes with SIGINT, like normal ^C */
754 if (now.tv_sec <= abort->start.tv_sec+1)
755 kill(0, SIGINT);
756 abort->count = 0;
757 }
834} 758}
835 759
836/* Handling output for console is simple: we just get all the output buffers 760/* This is the routine which handles console output (ie. stdout). */
837 * and write them to stdout. */ 761static void console_output(struct virtqueue *vq)
838static void handle_console_output(int fd, struct virtqueue *vq, bool timeout)
839{ 762{
840 unsigned int head, out, in; 763 unsigned int head, out, in;
841 int len;
842 struct iovec iov[vq->vring.num]; 764 struct iovec iov[vq->vring.num];
843 765
844 /* Keep getting output buffers from the Guest until we run out. */ 766 head = wait_for_vq_desc(vq, iov, &out, &in);
845 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { 767 if (in)
846 if (in) 768 errx(1, "Input buffers in console output queue?");
847 errx(1, "Input buffers in output queue?"); 769 while (!iov_empty(iov, out)) {
848 len = writev(STDOUT_FILENO, iov, out); 770 int len = writev(STDOUT_FILENO, iov, out);
849 add_used_and_trigger(fd, vq, head, len); 771 if (len <= 0)
772 err(1, "Write to stdout gave %i", len);
773 iov_consume(iov, out, len);
850 } 774 }
851} 775 add_used(vq, head, 0);
852
853/* This is called when we no longer want to hear about Guest changes to a
854 * virtqueue. This is more efficient in high-traffic cases, but it means we
855 * have to set a timer to check if any more changes have occurred. */
856static void block_vq(struct virtqueue *vq)
857{
858 struct itimerval itm;
859
860 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
861 vq->blocked = true;
862
863 itm.it_interval.tv_sec = 0;
864 itm.it_interval.tv_usec = 0;
865 itm.it_value.tv_sec = 0;
866 itm.it_value.tv_usec = timeout_usec;
867
868 setitimer(ITIMER_REAL, &itm, NULL);
869} 776}
870 777
871/* 778/*
872 * The Network 779 * The Network
873 * 780 *
874 * Handling output for network is also simple: we get all the output buffers 781 * Handling output for network is also simple: we get all the output buffers
875 * and write them (ignoring the first element) to this device's file descriptor 782 * and write them to /dev/net/tun.
876 * (/dev/net/tun).
877 */ 783 */
878static void handle_net_output(int fd, struct virtqueue *vq, bool timeout) 784struct net_info {
785 int tunfd;
786};
787
788static void net_output(struct virtqueue *vq)
879{ 789{
880 unsigned int head, out, in, num = 0; 790 struct net_info *net_info = vq->dev->priv;
881 int len; 791 unsigned int head, out, in;
882 struct iovec iov[vq->vring.num]; 792 struct iovec iov[vq->vring.num];
883 static int last_timeout_num;
884
885 /* Keep getting output buffers from the Guest until we run out. */
886 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
887 if (in)
888 errx(1, "Input buffers in output queue?");
889 len = writev(vq->dev->fd, iov, out);
890 if (len < 0)
891 err(1, "Writing network packet to tun");
892 add_used_and_trigger(fd, vq, head, len);
893 num++;
894 }
895 793
896 /* Block further kicks and set up a timer if we saw anything. */ 794 head = wait_for_vq_desc(vq, iov, &out, &in);
897 if (!timeout && num) 795 if (in)
898 block_vq(vq); 796 errx(1, "Input buffers in net output queue?");
899 797 if (writev(net_info->tunfd, iov, out) < 0)
900 /* We never quite know how long should we wait before we check the 798 errx(1, "Write to tun failed?");
901 * queue again for more packets. We start at 500 microseconds, and if 799 add_used(vq, head, 0);
902 * we get fewer packets than last time, we assume we made the timeout 800}
903 * too small and increase it by 10 microseconds. Otherwise, we drop it 801
904 * by one microsecond every time. It seems to work well enough. */ 802/* Will reading from this file descriptor block? */
905 if (timeout) { 803static bool will_block(int fd)
906 if (num < last_timeout_num) 804{
907 timeout_usec += 10; 805 fd_set fdset;
908 else if (timeout_usec > 1) 806 struct timeval zero = { 0, 0 };
909 timeout_usec--; 807 FD_ZERO(&fdset);
910 last_timeout_num = num; 808 FD_SET(fd, &fdset);
911 } 809 return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
912} 810}
913 811
914/* This is where we handle a packet coming in from the tun device to our 812/* This is where we handle packets coming in from the tun device to our
915 * Guest. */ 813 * Guest. */
916static bool handle_tun_input(int fd, struct device *dev) 814static void net_input(struct virtqueue *vq)
917{ 815{
918 unsigned int head, in_num, out_num;
919 int len; 816 int len;
920 struct iovec iov[dev->vq->vring.num]; 817 unsigned int head, out, in;
921 818 struct iovec iov[vq->vring.num];
922 /* First we need a network buffer from the Guests's recv virtqueue. */ 819 struct net_info *net_info = vq->dev->priv;
923 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
924 if (head == dev->vq->vring.num) {
925 /* Now, it's expected that if we try to send a packet too
926 * early, the Guest won't be ready yet. Wait until the device
927 * status says it's ready. */
928 /* FIXME: Actually want DRIVER_ACTIVE here. */
929
930 /* Now tell it we want to know if new things appear. */
931 dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
932 wmb();
933
934 /* We'll turn this back on if input buffers are registered. */
935 return false;
936 } else if (out_num)
937 errx(1, "Output buffers in network recv queue?");
938
939 /* Read the packet from the device directly into the Guest's buffer. */
940 len = readv(dev->fd, iov, in_num);
941 if (len <= 0)
942 err(1, "reading network");
943 820
944 /* Tell the Guest about the new packet. */ 821 head = wait_for_vq_desc(vq, iov, &out, &in);
945 add_used_and_trigger(fd, dev->vq, head, len); 822 if (out)
823 errx(1, "Output buffers in net input queue?");
946 824
947 verbose("tun input packet len %i [%02x %02x] (%s)\n", len, 825 /* Deliver interrupt now, since we're about to sleep. */
948 ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], 826 if (vq->pending_used && will_block(net_info->tunfd))
949 head != dev->vq->vring.num ? "sent" : "discarded"); 827 trigger_irq(vq);
950 828
951 /* All good. */ 829 len = readv(net_info->tunfd, iov, in);
952 return true; 830 if (len <= 0)
831 err(1, "Failed to read from tun.");
832 add_used(vq, head, len);
953} 833}
954 834
955/*L:215 This is the callback attached to the network and console input 835/* This is the helper to create threads. */
956 * virtqueues: it ensures we try again, in case we stopped console or net 836static int do_thread(void *_vq)
957 * delivery because Guest didn't have any buffers. */
958static void enable_fd(int fd, struct virtqueue *vq, bool timeout)
959{ 837{
960 add_device_fd(vq->dev->fd); 838 struct virtqueue *vq = _vq;
961 /* Snap the Waker out of its select loop. */ 839
962 write(waker_fds.pipe[1], "", 1); 840 for (;;)
841 vq->service(vq);
842 return 0;
963} 843}
964 844
965static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout) 845/* When a child dies, we kill our entire process group with SIGTERM. This
846 * also has the side effect that the shell restores the console for us! */
847static void kill_launcher(int signal)
966{ 848{
967 /* We don't need to know again when Guest refills receive buffer. */ 849 kill(0, SIGTERM);
968 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
969 enable_fd(fd, vq, timeout);
970} 850}
971 851
972/* When the Guest tells us they updated the status field, we handle it. */ 852static void reset_device(struct device *dev)
973static void update_device_status(struct device *dev)
974{ 853{
975 struct virtqueue *vq; 854 struct virtqueue *vq;
976 855
977 /* This is a reset. */ 856 verbose("Resetting device %s\n", dev->name);
978 if (dev->desc->status == 0) {
979 verbose("Resetting device %s\n", dev->name);
980 857
981 /* Clear any features they've acked. */ 858 /* Clear any features they've acked. */
982 memset(get_feature_bits(dev) + dev->desc->feature_len, 0, 859 memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len);
983 dev->desc->feature_len);
984 860
985 /* Zero out the virtqueues. */ 861 /* We're going to be explicitly killing threads, so ignore them. */
986 for (vq = dev->vq; vq; vq = vq->next) { 862 signal(SIGCHLD, SIG_IGN);
987 memset(vq->vring.desc, 0, 863
988 vring_size(vq->config.num, LGUEST_VRING_ALIGN)); 864 /* Zero out the virtqueues, get rid of their threads */
989 lg_last_avail(vq) = 0; 865 for (vq = dev->vq; vq; vq = vq->next) {
866 if (vq->thread != (pid_t)-1) {
867 kill(vq->thread, SIGTERM);
868 waitpid(vq->thread, NULL, 0);
869 vq->thread = (pid_t)-1;
990 } 870 }
991 } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { 871 memset(vq->vring.desc, 0,
872 vring_size(vq->config.num, LGUEST_VRING_ALIGN));
873 lg_last_avail(vq) = 0;
874 }
875 dev->running = false;
876
877 /* Now we care if threads die. */
878 signal(SIGCHLD, (void *)kill_launcher);
879}
880
881static void create_thread(struct virtqueue *vq)
882{
883 /* Create stack for thread and run it. Since stack grows
884 * upwards, we point the stack pointer to the end of this
885 * region. */
886 char *stack = malloc(32768);
887 unsigned long args[] = { LHREQ_EVENTFD,
888 vq->config.pfn*getpagesize(), 0 };
889
890 /* Create a zero-initialized eventfd. */
891 vq->eventfd = eventfd(0, 0);
892 if (vq->eventfd < 0)
893 err(1, "Creating eventfd");
894 args[2] = vq->eventfd;
895
896 /* Attach an eventfd to this virtqueue: it will go off
897 * when the Guest does an LHCALL_NOTIFY for this vq. */
898 if (write(lguest_fd, &args, sizeof(args)) != 0)
899 err(1, "Attaching eventfd");
900
901 /* CLONE_VM: because it has to access the Guest memory, and
902 * SIGCHLD so we get a signal if it dies. */
903 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
904 if (vq->thread == (pid_t)-1)
905 err(1, "Creating clone");
906 /* We close our local copy, now the child has it. */
907 close(vq->eventfd);
908}
909
910static void start_device(struct device *dev)
911{
912 unsigned int i;
913 struct virtqueue *vq;
914
915 verbose("Device %s OK: offered", dev->name);
916 for (i = 0; i < dev->feature_len; i++)
917 verbose(" %02x", get_feature_bits(dev)[i]);
918 verbose(", accepted");
919 for (i = 0; i < dev->feature_len; i++)
920 verbose(" %02x", get_feature_bits(dev)
921 [dev->feature_len+i]);
922
923 for (vq = dev->vq; vq; vq = vq->next) {
924 if (vq->service)
925 create_thread(vq);
926 }
927 dev->running = true;
928}
929
930static void cleanup_devices(void)
931{
932 struct device *dev;
933
934 for (dev = devices.dev; dev; dev = dev->next)
935 reset_device(dev);
936
937 /* If we saved off the original terminal settings, restore them now. */
938 if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
939 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
940}
941
942/* When the Guest tells us they updated the status field, we handle it. */
943static void update_device_status(struct device *dev)
944{
945 /* A zero status is a reset, otherwise it's a set of flags. */
946 if (dev->desc->status == 0)
947 reset_device(dev);
948 else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
992 warnx("Device %s configuration FAILED", dev->name); 949 warnx("Device %s configuration FAILED", dev->name);
950 if (dev->running)
951 reset_device(dev);
993 } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { 952 } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
994 unsigned int i; 953 if (!dev->running)
995 954 start_device(dev);
996 verbose("Device %s OK: offered", dev->name);
997 for (i = 0; i < dev->desc->feature_len; i++)
998 verbose(" %02x", get_feature_bits(dev)[i]);
999 verbose(", accepted");
1000 for (i = 0; i < dev->desc->feature_len; i++)
1001 verbose(" %02x", get_feature_bits(dev)
1002 [dev->desc->feature_len+i]);
1003
1004 if (dev->ready)
1005 dev->ready(dev);
1006 } 955 }
1007} 956}
1008 957
1009/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ 958/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
1010static void handle_output(int fd, unsigned long addr) 959static void handle_output(unsigned long addr)
1011{ 960{
1012 struct device *i; 961 struct device *i;
1013 struct virtqueue *vq;
1014 962
1015 /* Check each device and virtqueue. */ 963 /* Check each device. */
1016 for (i = devices.dev; i; i = i->next) { 964 for (i = devices.dev; i; i = i->next) {
965 struct virtqueue *vq;
966
1017 /* Notifications to device descriptors update device status. */ 967 /* Notifications to device descriptors update device status. */
1018 if (from_guest_phys(addr) == i->desc) { 968 if (from_guest_phys(addr) == i->desc) {
1019 update_device_status(i); 969 update_device_status(i);
1020 return; 970 return;
1021 } 971 }
1022 972
1023 /* Notifications to virtqueues mean output has occurred. */ 973 /* Devices *can* be used before status is set to DRIVER_OK. */
1024 for (vq = i->vq; vq; vq = vq->next) { 974 for (vq = i->vq; vq; vq = vq->next) {
1025 if (vq->config.pfn != addr/getpagesize()) 975 if (addr != vq->config.pfn*getpagesize())
1026 continue; 976 continue;
1027 977 if (i->running)
1028 /* Guest should acknowledge (and set features!) before 978 errx(1, "Notification on running %s", i->name);
1029 * using the device. */ 979 start_device(i);
1030 if (i->desc->status == 0) {
1031 warnx("%s gave early output", i->name);
1032 return;
1033 }
1034
1035 if (strcmp(vq->dev->name, "console") != 0)
1036 verbose("Output to %s\n", vq->dev->name);
1037 if (vq->handle_output)
1038 vq->handle_output(fd, vq, false);
1039 return; 980 return;
1040 } 981 }
1041 } 982 }
@@ -1049,71 +990,6 @@ static void handle_output(int fd, unsigned long addr)
1049 strnlen(from_guest_phys(addr), guest_limit - addr)); 990 strnlen(from_guest_phys(addr), guest_limit - addr));
1050} 991}
1051 992
1052static void handle_timeout(int fd)
1053{
1054 char buf[32];
1055 struct device *i;
1056 struct virtqueue *vq;
1057
1058 /* Clear the pipe */
1059 read(timeoutpipe[0], buf, sizeof(buf));
1060
1061 /* Check each device and virtqueue: flush blocked ones. */
1062 for (i = devices.dev; i; i = i->next) {
1063 for (vq = i->vq; vq; vq = vq->next) {
1064 if (!vq->blocked)
1065 continue;
1066
1067 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
1068 vq->blocked = false;
1069 if (vq->handle_output)
1070 vq->handle_output(fd, vq, true);
1071 }
1072 }
1073}
1074
1075/* This is called when the Waker wakes us up: check for incoming file
1076 * descriptors. */
1077static void handle_input(int fd)
1078{
1079 /* select() wants a zeroed timeval to mean "don't wait". */
1080 struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
1081
1082 for (;;) {
1083 struct device *i;
1084 fd_set fds = devices.infds;
1085 int num;
1086
1087 num = select(devices.max_infd+1, &fds, NULL, NULL, &poll);
1088 /* Could get interrupted */
1089 if (num < 0)
1090 continue;
1091 /* If nothing is ready, we're done. */
1092 if (num == 0)
1093 break;
1094
1095 /* Otherwise, call the device(s) which have readable file
1096 * descriptors and a method of handling them. */
1097 for (i = devices.dev; i; i = i->next) {
1098 if (i->handle_input && FD_ISSET(i->fd, &fds)) {
1099 if (i->handle_input(fd, i))
1100 continue;
1101
1102 /* If handle_input() returns false, it means we
1103 * should no longer service it. Networking and
1104 * console do this when there's no input
1105 * buffers to deliver into. Console also uses
1106 * it when it discovers that stdin is closed. */
1107 FD_CLR(i->fd, &devices.infds);
1108 }
1109 }
1110
1111 /* Is this the timeout fd? */
1112 if (FD_ISSET(timeoutpipe[0], &fds))
1113 handle_timeout(fd);
1114 }
1115}
1116
1117/*L:190 993/*L:190
1118 * Device Setup 994 * Device Setup
1119 * 995 *
@@ -1129,8 +1005,8 @@ static void handle_input(int fd)
1129static u8 *device_config(const struct device *dev) 1005static u8 *device_config(const struct device *dev)
1130{ 1006{
1131 return (void *)(dev->desc + 1) 1007 return (void *)(dev->desc + 1)
1132 + dev->desc->num_vq * sizeof(struct lguest_vqconfig) 1008 + dev->num_vq * sizeof(struct lguest_vqconfig)
1133 + dev->desc->feature_len * 2; 1009 + dev->feature_len * 2;
1134} 1010}
1135 1011
1136/* This routine allocates a new "struct lguest_device_desc" from descriptor 1012/* This routine allocates a new "struct lguest_device_desc" from descriptor
@@ -1159,7 +1035,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type)
1159/* Each device descriptor is followed by the description of its virtqueues. We 1035/* Each device descriptor is followed by the description of its virtqueues. We
1160 * specify how many descriptors the virtqueue is to have. */ 1036 * specify how many descriptors the virtqueue is to have. */
1161static void add_virtqueue(struct device *dev, unsigned int num_descs, 1037static void add_virtqueue(struct device *dev, unsigned int num_descs,
1162 void (*handle_output)(int, struct virtqueue *, bool)) 1038 void (*service)(struct virtqueue *))
1163{ 1039{
1164 unsigned int pages; 1040 unsigned int pages;
1165 struct virtqueue **i, *vq = malloc(sizeof(*vq)); 1041 struct virtqueue **i, *vq = malloc(sizeof(*vq));
@@ -1174,8 +1050,8 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1174 vq->next = NULL; 1050 vq->next = NULL;
1175 vq->last_avail_idx = 0; 1051 vq->last_avail_idx = 0;
1176 vq->dev = dev; 1052 vq->dev = dev;
1177 vq->inflight = 0; 1053 vq->service = service;
1178 vq->blocked = false; 1054 vq->thread = (pid_t)-1;
1179 1055
1180 /* Initialize the configuration. */ 1056 /* Initialize the configuration. */
1181 vq->config.num = num_descs; 1057 vq->config.num = num_descs;
@@ -1191,6 +1067,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1191 * yet, otherwise we'd be overwriting them. */ 1067 * yet, otherwise we'd be overwriting them. */
1192 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); 1068 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
1193 memcpy(device_config(dev), &vq->config, sizeof(vq->config)); 1069 memcpy(device_config(dev), &vq->config, sizeof(vq->config));
1070 dev->num_vq++;
1194 dev->desc->num_vq++; 1071 dev->desc->num_vq++;
1195 1072
1196 verbose("Virtqueue page %#lx\n", to_guest_phys(p)); 1073 verbose("Virtqueue page %#lx\n", to_guest_phys(p));
@@ -1199,15 +1076,6 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1199 * second. */ 1076 * second. */
1200 for (i = &dev->vq; *i; i = &(*i)->next); 1077 for (i = &dev->vq; *i; i = &(*i)->next);
1201 *i = vq; 1078 *i = vq;
1202
1203 /* Set the routine to call when the Guest does something to this
1204 * virtqueue. */
1205 vq->handle_output = handle_output;
1206
1207 /* As an optimization, set the advisory "Don't Notify Me" flag if we
1208 * don't have a handler */
1209 if (!handle_output)
1210 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
1211} 1079}
1212 1080
1213/* The first half of the feature bitmask is for us to advertise features. The 1081/* The first half of the feature bitmask is for us to advertise features. The
@@ -1219,7 +1087,7 @@ static void add_feature(struct device *dev, unsigned bit)
1219 /* We can't extend the feature bits once we've added config bytes */ 1087 /* We can't extend the feature bits once we've added config bytes */
1220 if (dev->desc->feature_len <= bit / CHAR_BIT) { 1088 if (dev->desc->feature_len <= bit / CHAR_BIT) {
1221 assert(dev->desc->config_len == 0); 1089 assert(dev->desc->config_len == 0);
1222 dev->desc->feature_len = (bit / CHAR_BIT) + 1; 1090 dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1;
1223 } 1091 }
1224 1092
1225 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); 1093 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
@@ -1243,22 +1111,17 @@ static void set_config(struct device *dev, unsigned len, const void *conf)
1243 * calling new_dev_desc() to allocate the descriptor and device memory. 1111 * calling new_dev_desc() to allocate the descriptor and device memory.
1244 * 1112 *
1245 * See what I mean about userspace being boring? */ 1113 * See what I mean about userspace being boring? */
1246static struct device *new_device(const char *name, u16 type, int fd, 1114static struct device *new_device(const char *name, u16 type)
1247 bool (*handle_input)(int, struct device *))
1248{ 1115{
1249 struct device *dev = malloc(sizeof(*dev)); 1116 struct device *dev = malloc(sizeof(*dev));
1250 1117
1251 /* Now we populate the fields one at a time. */ 1118 /* Now we populate the fields one at a time. */
1252 dev->fd = fd;
1253 /* If we have an input handler for this file descriptor, then we add it
1254 * to the device_list's fdset and maxfd. */
1255 if (handle_input)
1256 add_device_fd(dev->fd);
1257 dev->desc = new_dev_desc(type); 1119 dev->desc = new_dev_desc(type);
1258 dev->handle_input = handle_input;
1259 dev->name = name; 1120 dev->name = name;
1260 dev->vq = NULL; 1121 dev->vq = NULL;
1261 dev->ready = NULL; 1122 dev->feature_len = 0;
1123 dev->num_vq = 0;
1124 dev->running = false;
1262 1125
1263 /* Append to device list. Prepending to a single-linked list is 1126 /* Append to device list. Prepending to a single-linked list is
1264 * easier, but the user expects the devices to be arranged on the bus 1127 * easier, but the user expects the devices to be arranged on the bus
@@ -1286,13 +1149,10 @@ static void setup_console(void)
1286 * raw input stream to the Guest. */ 1149 * raw input stream to the Guest. */
1287 term.c_lflag &= ~(ISIG|ICANON|ECHO); 1150 term.c_lflag &= ~(ISIG|ICANON|ECHO);
1288 tcsetattr(STDIN_FILENO, TCSANOW, &term); 1151 tcsetattr(STDIN_FILENO, TCSANOW, &term);
1289 /* If we exit gracefully, the original settings will be
1290 * restored so the user can see what they're typing. */
1291 atexit(restore_term);
1292 } 1152 }
1293 1153
1294 dev = new_device("console", VIRTIO_ID_CONSOLE, 1154 dev = new_device("console", VIRTIO_ID_CONSOLE);
1295 STDIN_FILENO, handle_console_input); 1155
1296 /* We store the console state in dev->priv, and initialize it. */ 1156 /* We store the console state in dev->priv, and initialize it. */
1297 dev->priv = malloc(sizeof(struct console_abort)); 1157 dev->priv = malloc(sizeof(struct console_abort));
1298 ((struct console_abort *)dev->priv)->count = 0; 1158 ((struct console_abort *)dev->priv)->count = 0;
@@ -1301,31 +1161,13 @@ static void setup_console(void)
1301 * they put something the input queue, we make sure we're listening to 1161 * they put something the input queue, we make sure we're listening to
1302 * stdin. When they put something in the output queue, we write it to 1162 * stdin. When they put something in the output queue, we write it to
1303 * stdout. */ 1163 * stdout. */
1304 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); 1164 add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
1305 add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); 1165 add_virtqueue(dev, VIRTQUEUE_NUM, console_output);
1306 1166
1307 verbose("device %u: console\n", devices.device_num++); 1167 verbose("device %u: console\n", ++devices.device_num);
1308} 1168}
1309/*:*/ 1169/*:*/
1310 1170
1311static void timeout_alarm(int sig)
1312{
1313 write(timeoutpipe[1], "", 1);
1314}
1315
1316static void setup_timeout(void)
1317{
1318 if (pipe(timeoutpipe) != 0)
1319 err(1, "Creating timeout pipe");
1320
1321 if (fcntl(timeoutpipe[1], F_SETFL,
1322 fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0)
1323 err(1, "Making timeout pipe nonblocking");
1324
1325 add_device_fd(timeoutpipe[0]);
1326 signal(SIGALRM, timeout_alarm);
1327}
1328
1329/*M:010 Inter-guest networking is an interesting area. Simplest is to have a 1171/*M:010 Inter-guest networking is an interesting area. Simplest is to have a
1330 * --sharenet=<name> option which opens or creates a named pipe. This can be 1172 * --sharenet=<name> option which opens or creates a named pipe. This can be
1331 * used to send packets to another guest in a 1:1 manner. 1173 * used to send packets to another guest in a 1:1 manner.
@@ -1447,21 +1289,23 @@ static int get_tun_device(char tapif[IFNAMSIZ])
1447static void setup_tun_net(char *arg) 1289static void setup_tun_net(char *arg)
1448{ 1290{
1449 struct device *dev; 1291 struct device *dev;
1450 int netfd, ipfd; 1292 struct net_info *net_info = malloc(sizeof(*net_info));
1293 int ipfd;
1451 u32 ip = INADDR_ANY; 1294 u32 ip = INADDR_ANY;
1452 bool bridging = false; 1295 bool bridging = false;
1453 char tapif[IFNAMSIZ], *p; 1296 char tapif[IFNAMSIZ], *p;
1454 struct virtio_net_config conf; 1297 struct virtio_net_config conf;
1455 1298
1456 netfd = get_tun_device(tapif); 1299 net_info->tunfd = get_tun_device(tapif);
1457 1300
1458 /* First we create a new network device. */ 1301 /* First we create a new network device. */
1459 dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); 1302 dev = new_device("net", VIRTIO_ID_NET);
1303 dev->priv = net_info;
1460 1304
1461 /* Network devices need a receive and a send queue, just like 1305 /* Network devices need a receive and a send queue, just like
1462 * console. */ 1306 * console. */
1463 add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd); 1307 add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
1464 add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); 1308 add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
1465 1309
1466 /* We need a socket to perform the magic network ioctls to bring up the 1310 /* We need a socket to perform the magic network ioctls to bring up the
1467 * tap interface, connect to the bridge etc. Any socket will do! */ 1311 * tap interface, connect to the bridge etc. Any socket will do! */
@@ -1502,6 +1346,8 @@ static void setup_tun_net(char *arg)
1502 add_feature(dev, VIRTIO_NET_F_HOST_TSO4); 1346 add_feature(dev, VIRTIO_NET_F_HOST_TSO4);
1503 add_feature(dev, VIRTIO_NET_F_HOST_TSO6); 1347 add_feature(dev, VIRTIO_NET_F_HOST_TSO6);
1504 add_feature(dev, VIRTIO_NET_F_HOST_ECN); 1348 add_feature(dev, VIRTIO_NET_F_HOST_ECN);
1349 /* We handle indirect ring entries */
1350 add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
1505 set_config(dev, sizeof(conf), &conf); 1351 set_config(dev, sizeof(conf), &conf);
1506 1352
1507 /* We don't need the socket any more; setup is done. */ 1353 /* We don't need the socket any more; setup is done. */
@@ -1550,20 +1396,18 @@ struct vblk_info
1550 * Remember that the block device is handled by a separate I/O thread. We head 1396 * Remember that the block device is handled by a separate I/O thread. We head
1551 * straight into the core of that thread here: 1397 * straight into the core of that thread here:
1552 */ 1398 */
1553static bool service_io(struct device *dev) 1399static void blk_request(struct virtqueue *vq)
1554{ 1400{
1555 struct vblk_info *vblk = dev->priv; 1401 struct vblk_info *vblk = vq->dev->priv;
1556 unsigned int head, out_num, in_num, wlen; 1402 unsigned int head, out_num, in_num, wlen;
1557 int ret; 1403 int ret;
1558 u8 *in; 1404 u8 *in;
1559 struct virtio_blk_outhdr *out; 1405 struct virtio_blk_outhdr *out;
1560 struct iovec iov[dev->vq->vring.num]; 1406 struct iovec iov[vq->vring.num];
1561 off64_t off; 1407 off64_t off;
1562 1408
1563 /* See if there's a request waiting. If not, nothing to do. */ 1409 /* Get the next request. */
1564 head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 1410 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1565 if (head == dev->vq->vring.num)
1566 return false;
1567 1411
1568 /* Every block request should contain at least one output buffer 1412 /* Every block request should contain at least one output buffer
1569 * (detailing the location on disk and the type of request) and one 1413 * (detailing the location on disk and the type of request) and one
@@ -1637,83 +1481,21 @@ static bool service_io(struct device *dev)
1637 if (out->type & VIRTIO_BLK_T_BARRIER) 1481 if (out->type & VIRTIO_BLK_T_BARRIER)
1638 fdatasync(vblk->fd); 1482 fdatasync(vblk->fd);
1639 1483
1640 /* We can't trigger an IRQ, because we're not the Launcher. It does 1484 add_used(vq, head, wlen);
1641 * that when we tell it we're done. */
1642 add_used(dev->vq, head, wlen);
1643 return true;
1644}
1645
1646/* This is the thread which actually services the I/O. */
1647static int io_thread(void *_dev)
1648{
1649 struct device *dev = _dev;
1650 struct vblk_info *vblk = dev->priv;
1651 char c;
1652
1653 /* Close other side of workpipe so we get 0 read when main dies. */
1654 close(vblk->workpipe[1]);
1655 /* Close the other side of the done_fd pipe. */
1656 close(dev->fd);
1657
1658 /* When this read fails, it means Launcher died, so we follow. */
1659 while (read(vblk->workpipe[0], &c, 1) == 1) {
1660 /* We acknowledge each request immediately to reduce latency,
1661 * rather than waiting until we've done them all. I haven't
1662 * measured to see if it makes any difference.
1663 *
1664 * That would be an interesting test, wouldn't it? You could
1665 * also try having more than one I/O thread. */
1666 while (service_io(dev))
1667 write(vblk->done_fd, &c, 1);
1668 }
1669 return 0;
1670}
1671
1672/* Now we've seen the I/O thread, we return to the Launcher to see what happens
1673 * when that thread tells us it's completed some I/O. */
1674static bool handle_io_finish(int fd, struct device *dev)
1675{
1676 char c;
1677
1678 /* If the I/O thread died, presumably it printed the error, so we
1679 * simply exit. */
1680 if (read(dev->fd, &c, 1) != 1)
1681 exit(1);
1682
1683 /* It did some work, so trigger the irq. */
1684 trigger_irq(fd, dev->vq);
1685 return true;
1686}
1687
1688/* When the Guest submits some I/O, we just need to wake the I/O thread. */
1689static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout)
1690{
1691 struct vblk_info *vblk = vq->dev->priv;
1692 char c = 0;
1693
1694 /* Wake up I/O thread and tell it to go to work! */
1695 if (write(vblk->workpipe[1], &c, 1) != 1)
1696 /* Presumably it indicated why it died. */
1697 exit(1);
1698} 1485}
1699 1486
1700/*L:198 This actually sets up a virtual block device. */ 1487/*L:198 This actually sets up a virtual block device. */
1701static void setup_block_file(const char *filename) 1488static void setup_block_file(const char *filename)
1702{ 1489{
1703 int p[2];
1704 struct device *dev; 1490 struct device *dev;
1705 struct vblk_info *vblk; 1491 struct vblk_info *vblk;
1706 void *stack;
1707 struct virtio_blk_config conf; 1492 struct virtio_blk_config conf;
1708 1493
1709 /* This is the pipe the I/O thread will use to tell us I/O is done. */
1710 pipe(p);
1711
1712 /* The device responds to return from I/O thread. */ 1494 /* The device responds to return from I/O thread. */
1713 dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); 1495 dev = new_device("block", VIRTIO_ID_BLOCK);
1714 1496
1715 /* The device has one virtqueue, where the Guest places requests. */ 1497 /* The device has one virtqueue, where the Guest places requests. */
1716 add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); 1498 add_virtqueue(dev, VIRTQUEUE_NUM, blk_request);
1717 1499
1718 /* Allocate the room for our own bookkeeping */ 1500 /* Allocate the room for our own bookkeeping */
1719 vblk = dev->priv = malloc(sizeof(*vblk)); 1501 vblk = dev->priv = malloc(sizeof(*vblk));
@@ -1735,49 +1517,29 @@ static void setup_block_file(const char *filename)
1735 1517
1736 set_config(dev, sizeof(conf), &conf); 1518 set_config(dev, sizeof(conf), &conf);
1737 1519
1738 /* The I/O thread writes to this end of the pipe when done. */
1739 vblk->done_fd = p[1];
1740
1741 /* This is the second pipe, which is how we tell the I/O thread about
1742 * more work. */
1743 pipe(vblk->workpipe);
1744
1745 /* Create stack for thread and run it. Since stack grows upwards, we
1746 * point the stack pointer to the end of this region. */
1747 stack = malloc(32768);
1748 /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from
1749 * becoming a zombie. */
1750 if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
1751 err(1, "Creating clone");
1752
1753 /* We don't need to keep the I/O thread's end of the pipes open. */
1754 close(vblk->done_fd);
1755 close(vblk->workpipe[0]);
1756
1757 verbose("device %u: virtblock %llu sectors\n", 1520 verbose("device %u: virtblock %llu sectors\n",
1758 devices.device_num, le64_to_cpu(conf.capacity)); 1521 ++devices.device_num, le64_to_cpu(conf.capacity));
1759} 1522}
1760 1523
1524struct rng_info {
1525 int rfd;
1526};
1527
1761/* Our random number generator device reads from /dev/random into the Guest's 1528/* Our random number generator device reads from /dev/random into the Guest's
1762 * input buffers. The usual case is that the Guest doesn't want random numbers 1529 * input buffers. The usual case is that the Guest doesn't want random numbers
1763 * and so has no buffers although /dev/random is still readable, whereas 1530 * and so has no buffers although /dev/random is still readable, whereas
1764 * console is the reverse. 1531 * console is the reverse.
1765 * 1532 *
1766 * The same logic applies, however. */ 1533 * The same logic applies, however. */
1767static bool handle_rng_input(int fd, struct device *dev) 1534static void rng_input(struct virtqueue *vq)
1768{ 1535{
1769 int len; 1536 int len;
1770 unsigned int head, in_num, out_num, totlen = 0; 1537 unsigned int head, in_num, out_num, totlen = 0;
1771 struct iovec iov[dev->vq->vring.num]; 1538 struct rng_info *rng_info = vq->dev->priv;
1539 struct iovec iov[vq->vring.num];
1772 1540
1773 /* First we need a buffer from the Guests's virtqueue. */ 1541 /* First we need a buffer from the Guests's virtqueue. */
1774 head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 1542 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1775
1776 /* If they're not ready for input, stop listening to this file
1777 * descriptor. We'll start again once they add an input buffer. */
1778 if (head == dev->vq->vring.num)
1779 return false;
1780
1781 if (out_num) 1543 if (out_num)
1782 errx(1, "Output buffers in rng?"); 1544 errx(1, "Output buffers in rng?");
1783 1545
@@ -1785,7 +1547,7 @@ static bool handle_rng_input(int fd, struct device *dev)
1785 * it reads straight into the Guest's buffer. We loop to make sure we 1547 * it reads straight into the Guest's buffer. We loop to make sure we
1786 * fill it. */ 1548 * fill it. */
1787 while (!iov_empty(iov, in_num)) { 1549 while (!iov_empty(iov, in_num)) {
1788 len = readv(dev->fd, iov, in_num); 1550 len = readv(rng_info->rfd, iov, in_num);
1789 if (len <= 0) 1551 if (len <= 0)
1790 err(1, "Read from /dev/random gave %i", len); 1552 err(1, "Read from /dev/random gave %i", len);
1791 iov_consume(iov, in_num, len); 1553 iov_consume(iov, in_num, len);
@@ -1793,25 +1555,23 @@ static bool handle_rng_input(int fd, struct device *dev)
1793 } 1555 }
1794 1556
1795 /* Tell the Guest about the new input. */ 1557 /* Tell the Guest about the new input. */
1796 add_used_and_trigger(fd, dev->vq, head, totlen); 1558 add_used(vq, head, totlen);
1797
1798 /* Everything went OK! */
1799 return true;
1800} 1559}
1801 1560
1802/* And this creates a "hardware" random number device for the Guest. */ 1561/* And this creates a "hardware" random number device for the Guest. */
1803static void setup_rng(void) 1562static void setup_rng(void)
1804{ 1563{
1805 struct device *dev; 1564 struct device *dev;
1806 int fd; 1565 struct rng_info *rng_info = malloc(sizeof(*rng_info));
1807 1566
1808 fd = open_or_die("/dev/random", O_RDONLY); 1567 rng_info->rfd = open_or_die("/dev/random", O_RDONLY);
1809 1568
1810 /* The device responds to return from I/O thread. */ 1569 /* The device responds to return from I/O thread. */
1811 dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input); 1570 dev = new_device("rng", VIRTIO_ID_RNG);
1571 dev->priv = rng_info;
1812 1572
1813 /* The device has one virtqueue, where the Guest places inbufs. */ 1573 /* The device has one virtqueue, where the Guest places inbufs. */
1814 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); 1574 add_virtqueue(dev, VIRTQUEUE_NUM, rng_input);
1815 1575
1816 verbose("device %u: rng\n", devices.device_num++); 1576 verbose("device %u: rng\n", devices.device_num++);
1817} 1577}
@@ -1827,17 +1587,18 @@ static void __attribute__((noreturn)) restart_guest(void)
1827 for (i = 3; i < FD_SETSIZE; i++) 1587 for (i = 3; i < FD_SETSIZE; i++)
1828 close(i); 1588 close(i);
1829 1589
1830 /* The exec automatically gets rid of the I/O and Waker threads. */ 1590 /* Reset all the devices (kills all threads). */
1591 cleanup_devices();
1592
1831 execv(main_args[0], main_args); 1593 execv(main_args[0], main_args);
1832 err(1, "Could not exec %s", main_args[0]); 1594 err(1, "Could not exec %s", main_args[0]);
1833} 1595}
1834 1596
1835/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves 1597/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves
1836 * its input and output, and finally, lays it to rest. */ 1598 * its input and output, and finally, lays it to rest. */
1837static void __attribute__((noreturn)) run_guest(int lguest_fd) 1599static void __attribute__((noreturn)) run_guest(void)
1838{ 1600{
1839 for (;;) { 1601 for (;;) {
1840 unsigned long args[] = { LHREQ_BREAK, 0 };
1841 unsigned long notify_addr; 1602 unsigned long notify_addr;
1842 int readval; 1603 int readval;
1843 1604
@@ -1848,8 +1609,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
1848 /* One unsigned long means the Guest did HCALL_NOTIFY */ 1609 /* One unsigned long means the Guest did HCALL_NOTIFY */
1849 if (readval == sizeof(notify_addr)) { 1610 if (readval == sizeof(notify_addr)) {
1850 verbose("Notify on address %#lx\n", notify_addr); 1611 verbose("Notify on address %#lx\n", notify_addr);
1851 handle_output(lguest_fd, notify_addr); 1612 handle_output(notify_addr);
1852 continue;
1853 /* ENOENT means the Guest died. Reading tells us why. */ 1613 /* ENOENT means the Guest died. Reading tells us why. */
1854 } else if (errno == ENOENT) { 1614 } else if (errno == ENOENT) {
1855 char reason[1024] = { 0 }; 1615 char reason[1024] = { 0 };
@@ -1858,19 +1618,9 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
1858 /* ERESTART means that we need to reboot the guest */ 1618 /* ERESTART means that we need to reboot the guest */
1859 } else if (errno == ERESTART) { 1619 } else if (errno == ERESTART) {
1860 restart_guest(); 1620 restart_guest();
1861 /* EAGAIN means a signal (timeout). 1621 /* Anything else means a bug or incompatible change. */
1862 * Anything else means a bug or incompatible change. */ 1622 } else
1863 } else if (errno != EAGAIN)
1864 err(1, "Running guest failed"); 1623 err(1, "Running guest failed");
1865
1866 /* Only service input on thread for CPU 0. */
1867 if (cpu_id != 0)
1868 continue;
1869
1870 /* Service input, then unset the BREAK to release the Waker. */
1871 handle_input(lguest_fd);
1872 if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
1873 err(1, "Resetting break");
1874 } 1624 }
1875} 1625}
1876/*L:240 1626/*L:240
@@ -1904,8 +1654,8 @@ int main(int argc, char *argv[])
1904 /* Memory, top-level pagetable, code startpoint and size of the 1654 /* Memory, top-level pagetable, code startpoint and size of the
1905 * (optional) initrd. */ 1655 * (optional) initrd. */
1906 unsigned long mem = 0, start, initrd_size = 0; 1656 unsigned long mem = 0, start, initrd_size = 0;
1907 /* Two temporaries and the /dev/lguest file descriptor. */ 1657 /* Two temporaries. */
1908 int i, c, lguest_fd; 1658 int i, c;
1909 /* The boot information for the Guest. */ 1659 /* The boot information for the Guest. */
1910 struct boot_params *boot; 1660 struct boot_params *boot;
1911 /* If they specify an initrd file to load. */ 1661 /* If they specify an initrd file to load. */
@@ -1913,18 +1663,10 @@ int main(int argc, char *argv[])
1913 1663
1914 /* Save the args: we "reboot" by execing ourselves again. */ 1664 /* Save the args: we "reboot" by execing ourselves again. */
1915 main_args = argv; 1665 main_args = argv;
1916 /* We don't "wait" for the children, so prevent them from becoming
1917 * zombies. */
1918 signal(SIGCHLD, SIG_IGN);
1919 1666
1920 /* First we initialize the device list. Since console and network 1667 /* First we initialize the device list. We keep a pointer to the last
1921 * device receive input from a file descriptor, we keep an fdset 1668 * device, and the next interrupt number to use for devices (1:
1922 * (infds) and the maximum fd number (max_infd) with the head of the 1669 * remember that 0 is used by the timer). */
1923 * list. We also keep a pointer to the last device. Finally, we keep
1924 * the next interrupt number to use for devices (1: remember that 0 is
1925 * used by the timer). */
1926 FD_ZERO(&devices.infds);
1927 devices.max_infd = -1;
1928 devices.lastdev = NULL; 1670 devices.lastdev = NULL;
1929 devices.next_irq = 1; 1671 devices.next_irq = 1;
1930 1672
@@ -1982,9 +1724,6 @@ int main(int argc, char *argv[])
1982 /* We always have a console device */ 1724 /* We always have a console device */
1983 setup_console(); 1725 setup_console();
1984 1726
1985 /* We can timeout waiting for Guest network transmit. */
1986 setup_timeout();
1987
1988 /* Now we load the kernel */ 1727 /* Now we load the kernel */
1989 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); 1728 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
1990 1729
@@ -2023,15 +1762,16 @@ int main(int argc, char *argv[])
2023 1762
2024 /* We tell the kernel to initialize the Guest: this returns the open 1763 /* We tell the kernel to initialize the Guest: this returns the open
2025 * /dev/lguest file descriptor. */ 1764 * /dev/lguest file descriptor. */
2026 lguest_fd = tell_kernel(start); 1765 tell_kernel(start);
1766
1767 /* Ensure that we terminate if a child dies. */
1768 signal(SIGCHLD, kill_launcher);
2027 1769
2028 /* We clone off a thread, which wakes the Launcher whenever one of the 1770 /* If we exit via err(), this kills all the threads, restores tty. */
2029 * input file descriptors needs attention. We call this the Waker, and 1771 atexit(cleanup_devices);
2030 * we'll cover it in a moment. */
2031 setup_waker(lguest_fd);
2032 1772
2033 /* Finally, run the Guest. This doesn't return. */ 1773 /* Finally, run the Guest. This doesn't return. */
2034 run_guest(lguest_fd); 1774 run_guest();
2035} 1775}
2036/*:*/ 1776/*:*/
2037 1777
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
index 28c747362f95..efb3a6a045a2 100644
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -37,7 +37,6 @@ Running Lguest:
37 "Paravirtualized guest support" = Y 37 "Paravirtualized guest support" = Y
38 "Lguest guest support" = Y 38 "Lguest guest support" = Y
39 "High Memory Support" = off/4GB 39 "High Memory Support" = off/4GB
40 "PAE (Physical Address Extension) Support" = N
41 "Alignment value to which kernel should be aligned" = 0x100000 40 "Alignment value to which kernel should be aligned" = 0x100000
42 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and 41 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
43 CONFIG_PHYSICAL_ALIGN=0x100000) 42 CONFIG_PHYSICAL_ALIGN=0x100000)
diff --git a/Documentation/local_ops.txt b/Documentation/local_ops.txt
index 23045b8b50f0..300da4bdfdbd 100644
--- a/Documentation/local_ops.txt
+++ b/Documentation/local_ops.txt
@@ -34,7 +34,7 @@ out of order wrt other memory writes by the owner CPU.
34 34
35It can be done by slightly modifying the standard atomic operations : only 35It can be done by slightly modifying the standard atomic operations : only
36their UP variant must be kept. It typically means removing LOCK prefix (on 36their UP variant must be kept. It typically means removing LOCK prefix (on
37i386 and x86_64) and any SMP sychronization barrier. If the architecture does 37i386 and x86_64) and any SMP synchronization barrier. If the architecture does
38not have a different behavior between SMP and UP, including asm-generic/local.h 38not have a different behavior between SMP and UP, including asm-generic/local.h
39in your architecture's local.h is sufficient. 39in your architecture's local.h is sufficient.
40 40
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index f5b7127f54ac..7f5809eddee6 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -31,6 +31,7 @@ Contents:
31 31
32 - Locking functions. 32 - Locking functions.
33 - Interrupt disabling functions. 33 - Interrupt disabling functions.
34 - Sleep and wake-up functions.
34 - Miscellaneous functions. 35 - Miscellaneous functions.
35 36
36 (*) Inter-CPU locking barrier effects. 37 (*) Inter-CPU locking barrier effects.
@@ -1217,6 +1218,132 @@ barriers are required in such a situation, they must be provided from some
1217other means. 1218other means.
1218 1219
1219 1220
1221SLEEP AND WAKE-UP FUNCTIONS
1222---------------------------
1223
1224Sleeping and waking on an event flagged in global data can be viewed as an
1225interaction between two pieces of data: the task state of the task waiting for
1226the event and the global data used to indicate the event. To make sure that
1227these appear to happen in the right order, the primitives to begin the process
1228of going to sleep, and the primitives to initiate a wake up imply certain
1229barriers.
1230
1231Firstly, the sleeper normally follows something like this sequence of events:
1232
1233 for (;;) {
1234 set_current_state(TASK_UNINTERRUPTIBLE);
1235 if (event_indicated)
1236 break;
1237 schedule();
1238 }
1239
1240A general memory barrier is interpolated automatically by set_current_state()
1241after it has altered the task state:
1242
1243 CPU 1
1244 ===============================
1245 set_current_state();
1246 set_mb();
1247 STORE current->state
1248 <general barrier>
1249 LOAD event_indicated
1250
1251set_current_state() may be wrapped by:
1252
1253 prepare_to_wait();
1254 prepare_to_wait_exclusive();
1255
1256which therefore also imply a general memory barrier after setting the state.
1257The whole sequence above is available in various canned forms, all of which
1258interpolate the memory barrier in the right place:
1259
1260 wait_event();
1261 wait_event_interruptible();
1262 wait_event_interruptible_exclusive();
1263 wait_event_interruptible_timeout();
1264 wait_event_killable();
1265 wait_event_timeout();
1266 wait_on_bit();
1267 wait_on_bit_lock();
1268
1269
1270Secondly, code that performs a wake up normally follows something like this:
1271
1272 event_indicated = 1;
1273 wake_up(&event_wait_queue);
1274
1275or:
1276
1277 event_indicated = 1;
1278 wake_up_process(event_daemon);
1279
1280A write memory barrier is implied by wake_up() and co. if and only if they wake
1281something up. The barrier occurs before the task state is cleared, and so sits
1282between the STORE to indicate the event and the STORE to set TASK_RUNNING:
1283
1284 CPU 1 CPU 2
1285 =============================== ===============================
1286 set_current_state(); STORE event_indicated
1287 set_mb(); wake_up();
1288 STORE current->state <write barrier>
1289 <general barrier> STORE current->state
1290 LOAD event_indicated
1291
1292The available waker functions include:
1293
1294 complete();
1295 wake_up();
1296 wake_up_all();
1297 wake_up_bit();
1298 wake_up_interruptible();
1299 wake_up_interruptible_all();
1300 wake_up_interruptible_nr();
1301 wake_up_interruptible_poll();
1302 wake_up_interruptible_sync();
1303 wake_up_interruptible_sync_poll();
1304 wake_up_locked();
1305 wake_up_locked_poll();
1306 wake_up_nr();
1307 wake_up_poll();
1308 wake_up_process();
1309
1310
1311[!] Note that the memory barriers implied by the sleeper and the waker do _not_
1312order multiple stores before the wake-up with respect to loads of those stored
1313values after the sleeper has called set_current_state(). For instance, if the
1314sleeper does:
1315
1316 set_current_state(TASK_INTERRUPTIBLE);
1317 if (event_indicated)
1318 break;
1319 __set_current_state(TASK_RUNNING);
1320 do_something(my_data);
1321
1322and the waker does:
1323
1324 my_data = value;
1325 event_indicated = 1;
1326 wake_up(&event_wait_queue);
1327
1328there's no guarantee that the change to event_indicated will be perceived by
1329the sleeper as coming after the change to my_data. In such a circumstance, the
1330code on both sides must interpolate its own memory barriers between the
1331separate data accesses. Thus the above sleeper ought to do:
1332
1333 set_current_state(TASK_INTERRUPTIBLE);
1334 if (event_indicated) {
1335 smp_rmb();
1336 do_something(my_data);
1337 }
1338
1339and the waker should do:
1340
1341 my_data = value;
1342 smp_wmb();
1343 event_indicated = 1;
1344 wake_up(&event_wait_queue);
1345
1346
1220MISCELLANEOUS FUNCTIONS 1347MISCELLANEOUS FUNCTIONS
1221----------------------- 1348-----------------------
1222 1349
@@ -1366,7 +1493,7 @@ WHERE ARE MEMORY BARRIERS NEEDED?
1366 1493
1367Under normal operation, memory operation reordering is generally not going to 1494Under normal operation, memory operation reordering is generally not going to
1368be a problem as a single-threaded linear piece of code will still appear to 1495be a problem as a single-threaded linear piece of code will still appear to
1369work correctly, even if it's in an SMP kernel. There are, however, three 1496work correctly, even if it's in an SMP kernel. There are, however, four
1370circumstances in which reordering definitely _could_ be a problem: 1497circumstances in which reordering definitely _could_ be a problem:
1371 1498
1372 (*) Interprocessor interaction. 1499 (*) Interprocessor interaction.
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 4c2ecf537a4a..bbc8a6a36921 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -73,13 +73,13 @@ this phase is triggered automatically. ACPI can notify this event. If not,
73(see Section 4.). 73(see Section 4.).
74 74
75Logical Memory Hotplug phase is to change memory state into 75Logical Memory Hotplug phase is to change memory state into
76avaiable/unavailable for users. Amount of memory from user's view is 76available/unavailable for users. Amount of memory from user's view is
77changed by this phase. The kernel makes all memory in it as free pages 77changed by this phase. The kernel makes all memory in it as free pages
78when a memory range is available. 78when a memory range is available.
79 79
80In this document, this phase is described as online/offline. 80In this document, this phase is described as online/offline.
81 81
82Logical Memory Hotplug phase is triggred by write of sysfs file by system 82Logical Memory Hotplug phase is triggered by write of sysfs file by system
83administrator. For the hot-add case, it must be executed after Physical Hotplug 83administrator. For the hot-add case, it must be executed after Physical Hotplug
84phase by hand. 84phase by hand.
85(However, if you writes udev's hotplug scripts for memory hotplug, these 85(However, if you writes udev's hotplug scripts for memory hotplug, these
@@ -334,7 +334,7 @@ MEMORY_CANCEL_ONLINE
334 Generated if MEMORY_GOING_ONLINE fails. 334 Generated if MEMORY_GOING_ONLINE fails.
335 335
336MEMORY_ONLINE 336MEMORY_ONLINE
337 Generated when memory has succesfully brought online. The callback may 337 Generated when memory has successfully brought online. The callback may
338 allocate pages from the new memory. 338 allocate pages from the new memory.
339 339
340MEMORY_GOING_OFFLINE 340MEMORY_GOING_OFFLINE
@@ -359,7 +359,7 @@ The third argument is passed by pointer of struct memory_notify.
359struct memory_notify { 359struct memory_notify {
360 unsigned long start_pfn; 360 unsigned long start_pfn;
361 unsigned long nr_pages; 361 unsigned long nr_pages;
362 int status_cahnge_nid; 362 int status_change_nid;
363} 363}
364 364
365start_pfn is start_pfn of online/offline memory. 365start_pfn is start_pfn of online/offline memory.
diff --git a/Documentation/mn10300/ABI.txt b/Documentation/mn10300/ABI.txt
index 1fef1f06dfd2..d3507bad428d 100644
--- a/Documentation/mn10300/ABI.txt
+++ b/Documentation/mn10300/ABI.txt
@@ -26,7 +26,7 @@ registers and the stack. If the first argument is a 64-bit value, it will be
26passed in D0:D1. If the first argument is not a 64-bit value, but the second 26passed in D0:D1. If the first argument is not a 64-bit value, but the second
27is, the second will be passed entirely on the stack and D1 will be unused. 27is, the second will be passed entirely on the stack and D1 will be unused.
28 28
29Arguments smaller than 32-bits are not coelesced within a register or a stack 29Arguments smaller than 32-bits are not coalesced within a register or a stack
30word. For example, two byte-sized arguments will always be passed in separate 30word. For example, two byte-sized arguments will always be passed in separate
31registers or word-sized stack slots. 31registers or word-sized stack slots.
32 32
diff --git a/Documentation/mtd/nand_ecc.txt b/Documentation/mtd/nand_ecc.txt
index bdf93b7f0f24..274821b35a7f 100644
--- a/Documentation/mtd/nand_ecc.txt
+++ b/Documentation/mtd/nand_ecc.txt
@@ -50,7 +50,7 @@ byte 255: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp3 rp5 ... rp15
50 cp5 cp5 cp5 cp5 cp4 cp4 cp4 cp4 50 cp5 cp5 cp5 cp5 cp4 cp4 cp4 cp4
51 51
52This figure represents a sector of 256 bytes. 52This figure represents a sector of 256 bytes.
53cp is my abbreviaton for column parity, rp for row parity. 53cp is my abbreviation for column parity, rp for row parity.
54 54
55Let's start to explain column parity. 55Let's start to explain column parity.
56cp0 is the parity that belongs to all bit0, bit2, bit4, bit6. 56cp0 is the parity that belongs to all bit0, bit2, bit4, bit6.
@@ -560,7 +560,7 @@ Measuring this code again showed big gain. When executing the original
560linux code 1 million times, this took about 1 second on my system. 560linux code 1 million times, this took about 1 second on my system.
561(using time to measure the performance). After this iteration I was back 561(using time to measure the performance). After this iteration I was back
562to 0.075 sec. Actually I had to decide to start measuring over 10 562to 0.075 sec. Actually I had to decide to start measuring over 10
563million interations in order not to loose too much accuracy. This one 563million iterations in order not to lose too much accuracy. This one
564definitely seemed to be the jackpot! 564definitely seemed to be the jackpot!
565 565
566There is a little bit more room for improvement though. There are three 566There is a little bit more room for improvement though. There are three
@@ -571,8 +571,8 @@ loop; This eliminates 3 statements per loop. Of course after the loop we
571need to correct by adding: 571need to correct by adding:
572 rp4 ^= rp4_6; 572 rp4 ^= rp4_6;
573 rp6 ^= rp4_6 573 rp6 ^= rp4_6
574Furthermore there are 4 sequential assingments to rp8. This can be 574Furthermore there are 4 sequential assignments to rp8. This can be
575encoded slightly more efficient by saving tmppar before those 4 lines 575encoded slightly more efficiently by saving tmppar before those 4 lines
576and later do rp8 = rp8 ^ tmppar ^ notrp8; 576and later do rp8 = rp8 ^ tmppar ^ notrp8;
577(where notrp8 is the value of rp8 before those 4 lines). 577(where notrp8 is the value of rp8 before those 4 lines).
578Again a use of the commutative property of xor. 578Again a use of the commutative property of xor.
@@ -622,7 +622,7 @@ Not a big change, but every penny counts :-)
622Analysis 7 622Analysis 7
623========== 623==========
624 624
625Acutally this made things worse. Not very much, but I don't want to move 625Actually this made things worse. Not very much, but I don't want to move
626into the wrong direction. Maybe something to investigate later. Could 626into the wrong direction. Maybe something to investigate later. Could
627have to do with caching again. 627have to do with caching again.
628 628
@@ -642,7 +642,7 @@ Analysis 8
642This makes things worse. Let's stick with attempt 6 and continue from there. 642This makes things worse. Let's stick with attempt 6 and continue from there.
643Although it seems that the code within the loop cannot be optimised 643Although it seems that the code within the loop cannot be optimised
644further there is still room to optimize the generation of the ecc codes. 644further there is still room to optimize the generation of the ecc codes.
645We can simply calcualate the total parity. If this is 0 then rp4 = rp5 645We can simply calculate the total parity. If this is 0 then rp4 = rp5
646etc. If the parity is 1, then rp4 = !rp5; 646etc. If the parity is 1, then rp4 = !rp5;
647But if rp4 = rp5 we do not need rp5 etc. We can just write the even bits 647But if rp4 = rp5 we do not need rp5 etc. We can just write the even bits
648in the result byte and then do something like 648in the result byte and then do something like
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 08762750f121..d5181ce9ff62 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -221,7 +221,7 @@ ad_select
221 221
222 - Any slave's 802.3ad association state changes 222 - Any slave's 802.3ad association state changes
223 223
224 - The bond's adminstrative state changes to up 224 - The bond's administrative state changes to up
225 225
226 count or 2 226 count or 2
227 227
@@ -369,7 +369,7 @@ fail_over_mac
369 When this policy is used in conjuction with the mii 369 When this policy is used in conjuction with the mii
370 monitor, devices which assert link up prior to being 370 monitor, devices which assert link up prior to being
371 able to actually transmit and receive are particularly 371 able to actually transmit and receive are particularly
372 susecptible to loss of the gratuitous ARP, and an 372 susceptible to loss of the gratuitous ARP, and an
373 appropriate updelay setting may be required. 373 appropriate updelay setting may be required.
374 374
375 follow or 2 375 follow or 2
@@ -1794,7 +1794,7 @@ target to query.
1794generally referred to as "trunk failover." This is a feature of the 1794generally referred to as "trunk failover." This is a feature of the
1795switch that causes the link state of a particular switch port to be set 1795switch that causes the link state of a particular switch port to be set
1796down (or up) when the state of another switch port goes down (or up). 1796down (or up) when the state of another switch port goes down (or up).
1797It's purpose is to propogate link failures from logically "exterior" ports 1797Its purpose is to propagate link failures from logically "exterior" ports
1798to the logically "interior" ports that bonding is able to monitor via 1798to the logically "interior" ports that bonding is able to monitor via
1799miimon. Availability and configuration for trunk failover varies by 1799miimon. Availability and configuration for trunk failover varies by
1800switch, but this can be a viable alternative to the ARP monitor when using 1800switch, but this can be a viable alternative to the ARP monitor when using
diff --git a/Documentation/networking/can.txt b/Documentation/networking/can.txt
index 2035bc4932f2..cd79735013f9 100644
--- a/Documentation/networking/can.txt
+++ b/Documentation/networking/can.txt
@@ -36,10 +36,15 @@ This file contains
36 6.2 local loopback of sent frames 36 6.2 local loopback of sent frames
37 6.3 CAN controller hardware filters 37 6.3 CAN controller hardware filters
38 6.4 The virtual CAN driver (vcan) 38 6.4 The virtual CAN driver (vcan)
39 6.5 currently supported CAN hardware 39 6.5 The CAN network device driver interface
40 6.6 todo 40 6.5.1 Netlink interface to set/get devices properties
41 6.5.2 Setting the CAN bit-timing
42 6.5.3 Starting and stopping the CAN network device
43 6.6 supported CAN hardware
41 44
42 7 Credits 45 7 Socket CAN resources
46
47 8 Credits
43 48
44============================================================================ 49============================================================================
45 50
@@ -234,6 +239,8 @@ solution for a couple of reasons:
234 the user application using the common CAN filter mechanisms. Inside 239 the user application using the common CAN filter mechanisms. Inside
235 this filter definition the (interested) type of errors may be 240 this filter definition the (interested) type of errors may be
236 selected. The reception of error frames is disabled by default. 241 selected. The reception of error frames is disabled by default.
242 The format of the CAN error frame is briefly described in the Linux
243 header file "include/linux/can/error.h".
237 244
2384. How to use Socket CAN 2454. How to use Socket CAN
239------------------------ 246------------------------
@@ -327,7 +334,7 @@ solution for a couple of reasons:
327 return 1; 334 return 1;
328 } 335 }
329 336
330 /* paraniod check ... */ 337 /* paranoid check ... */
331 if (nbytes < sizeof(struct can_frame)) { 338 if (nbytes < sizeof(struct can_frame)) {
332 fprintf(stderr, "read: incomplete CAN frame\n"); 339 fprintf(stderr, "read: incomplete CAN frame\n");
333 return 1; 340 return 1;
@@ -605,61 +612,213 @@ solution for a couple of reasons:
605 removal of vcan network devices can be managed with the ip(8) tool: 612 removal of vcan network devices can be managed with the ip(8) tool:
606 613
607 - Create a virtual CAN network interface: 614 - Create a virtual CAN network interface:
608 ip link add type vcan 615 $ ip link add type vcan
609 616
610 - Create a virtual CAN network interface with a specific name 'vcan42': 617 - Create a virtual CAN network interface with a specific name 'vcan42':
611 ip link add dev vcan42 type vcan 618 $ ip link add dev vcan42 type vcan
612 619
613 - Remove a (virtual CAN) network interface 'vcan42': 620 - Remove a (virtual CAN) network interface 'vcan42':
614 ip link del vcan42 621 $ ip link del vcan42
615 622
616 The tool 'vcan' from the SocketCAN SVN repository on BerliOS is obsolete. 623 6.5 The CAN network device driver interface
617 624
618 Virtual CAN network device creation in older Kernels: 625 The CAN network device driver interface provides a generic interface
619 In Linux Kernel versions < 2.6.24 the vcan driver creates 4 vcan 626 to setup, configure and monitor CAN network devices. The user can then
620 netdevices at module load time by default. This value can be changed 627 configure the CAN device, like setting the bit-timing parameters, via
621 with the module parameter 'numdev'. E.g. 'modprobe vcan numdev=8' 628 the netlink interface using the program "ip" from the "IPROUTE2"
622 629 utility suite. The following chapter describes briefly how to use it.
623 6.5 currently supported CAN hardware 630 Furthermore, the interface uses a common data structure and exports a
631 set of common functions, which all real CAN network device drivers
632 should use. Please have a look at the SJA1000 or MSCAN driver to
633 understand how to use them. The name of the module is can-dev.ko.
634
635 6.5.1 Netlink interface to set/get devices properties
636
637 The CAN device must be configured via netlink interface. The supported
638 netlink message types are defined and briefly described in
639 "include/linux/can/netlink.h". CAN link support for the program "ip"
640 of the IPROUTE2 utility suite is available and it can be used as shown
641 below:
642
643 - Setting CAN device properties:
644
645 $ ip link set can0 type can help
646 Usage: ip link set DEVICE type can
647 [ bitrate BITRATE [ sample-point SAMPLE-POINT] ] |
648 [ tq TQ prop-seg PROP_SEG phase-seg1 PHASE-SEG1
649 phase-seg2 PHASE-SEG2 [ sjw SJW ] ]
650
651 [ loopback { on | off } ]
652 [ listen-only { on | off } ]
653 [ triple-sampling { on | off } ]
654
655 [ restart-ms TIME-MS ]
656 [ restart ]
657
658 Where: BITRATE := { 1..1000000 }
659 SAMPLE-POINT := { 0.000..0.999 }
660 TQ := { NUMBER }
661 PROP-SEG := { 1..8 }
662 PHASE-SEG1 := { 1..8 }
663 PHASE-SEG2 := { 1..8 }
664 SJW := { 1..4 }
665 RESTART-MS := { 0 | NUMBER }
666
667 - Display CAN device details and statistics:
668
669 $ ip -details -statistics link show can0
670 2: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 16 qdisc pfifo_fast state UP qlen 10
671 link/can
672 can <TRIPLE-SAMPLING> state ERROR-ACTIVE restart-ms 100
673 bitrate 125000 sample_point 0.875
674 tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1
675 sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
676 clock 8000000
677 re-started bus-errors arbit-lost error-warn error-pass bus-off
678 41 17457 0 41 42 41
679 RX: bytes packets errors dropped overrun mcast
680 140859 17608 17457 0 0 0
681 TX: bytes packets errors dropped carrier collsns
682 861 112 0 41 0 0
683
684 More info to the above output:
685
686 "<TRIPLE-SAMPLING>"
687 Shows the list of selected CAN controller modes: LOOPBACK,
688 LISTEN-ONLY, or TRIPLE-SAMPLING.
689
690 "state ERROR-ACTIVE"
691 The current state of the CAN controller: "ERROR-ACTIVE",
692 "ERROR-WARNING", "ERROR-PASSIVE", "BUS-OFF" or "STOPPED"
693
694 "restart-ms 100"
695 Automatic restart delay time. If set to a non-zero value, a
696 restart of the CAN controller will be triggered automatically
697 in case of a bus-off condition after the specified delay time
698 in milliseconds. By default it's off.
699
700 "bitrate 125000 sample_point 0.875"
701 Shows the real bit-rate in bits/sec and the sample-point in the
702 range 0.000..0.999. If the calculation of bit-timing parameters
703 is enabled in the kernel (CONFIG_CAN_CALC_BITTIMING=y), the
704 bit-timing can be defined by setting the "bitrate" argument.
705 Optionally the "sample-point" can be specified. By default it's
706 0.000 assuming CIA-recommended sample-points.
707
708 "tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1"
709 Shows the time quanta in ns, propagation segment, phase buffer
710 segment 1 and 2 and the synchronisation jump width in units of
711 tq. They allow to define the CAN bit-timing in a hardware
712 independent format as proposed by the Bosch CAN 2.0 spec (see
713 chapter 8 of http://www.semiconductors.bosch.de/pdf/can2spec.pdf).
714
715 "sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
716 clock 8000000"
717 Shows the bit-timing constants of the CAN controller, here the
718 "sja1000". The minimum and maximum values of the time segment 1
719 and 2, the synchronisation jump width in units of tq, the
720 bitrate pre-scaler and the CAN system clock frequency in Hz.
721 These constants could be used for user-defined (non-standard)
722 bit-timing calculation algorithms in user-space.
723
724 "re-started bus-errors arbit-lost error-warn error-pass bus-off"
725 Shows the number of restarts, bus and arbitration lost errors,
726 and the state changes to the error-warning, error-passive and
727 bus-off state. RX overrun errors are listed in the "overrun"
728 field of the standard network statistics.
729
730 6.5.2 Setting the CAN bit-timing
731
732 The CAN bit-timing parameters can always be defined in a hardware
733 independent format as proposed in the Bosch CAN 2.0 specification
734 specifying the arguments "tq", "prop-seg", "phase-seg1", "phase-seg2"
735 and "sjw":
736
737 $ ip link set canX type can tq 125 prop-seg 6 \
738 phase-seg1 7 phase-seg2 2 sjw 1
739
740 If the kernel option CONFIG_CAN_CALC_BITTIMING is enabled, CIA
741 recommended CAN bit-timing parameters will be calculated if the bit-
742 rate is specified with the argument "bitrate":
743
744 $ ip link set canX type can bitrate 125000
745
746 Note that this works fine for the most common CAN controllers with
747 standard bit-rates but may *fail* for exotic bit-rates or CAN system
748 clock frequencies. Disabling CONFIG_CAN_CALC_BITTIMING saves some
749 space and allows user-space tools to solely determine and set the
750 bit-timing parameters. The CAN controller specific bit-timing
751 constants can be used for that purpose. They are listed by the
752 following command:
753
754 $ ip -details link show can0
755 ...
756 sja1000: clock 8000000 tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
757
758 6.5.3 Starting and stopping the CAN network device
759
760 A CAN network device is started or stopped as usual with the command
761 "ifconfig canX up/down" or "ip link set canX up/down". Be aware that
762 you *must* define proper bit-timing parameters for real CAN devices
763 before you can start it to avoid error-prone default settings:
764
765 $ ip link set canX up type can bitrate 125000
766
767 A device may enter the "bus-off" state if too many errors occurred on
768 the CAN bus. Then no more messages are received or sent. An automatic
769 bus-off recovery can be enabled by setting the "restart-ms" to a
770 non-zero value, e.g.:
771
772 $ ip link set canX type can restart-ms 100
773
774 Alternatively, the application may realize the "bus-off" condition
775 by monitoring CAN error frames and do a restart when appropriate with
776 the command:
777
778 $ ip link set canX type can restart
779
780 Note that a restart will also create a CAN error frame (see also
781 chapter 3.4).
624 782
625 On the project website http://developer.berlios.de/projects/socketcan 783 6.6 Supported CAN hardware
626 there are different drivers available:
627 784
628 vcan: Virtual CAN interface driver (if no real hardware is available) 785 Please check the "Kconfig" file in "drivers/net/can" to get an actual
629 sja1000: Philips SJA1000 CAN controller (recommended) 786 list of the supported CAN hardware. On the Socket CAN project website
630 i82527: Intel i82527 CAN controller 787 (see chapter 7) there might be further drivers available, also for
631 mscan: Motorola/Freescale CAN controller (e.g. inside SOC MPC5200) 788 older kernel versions.
632 ccan: CCAN controller core (e.g. inside SOC h7202)
633 slcan: For a bunch of CAN adaptors that are attached via a
634 serial line ASCII protocol (for serial / USB adaptors)
635 789
636 Additionally the different CAN adaptors (ISA/PCI/PCMCIA/USB/Parport) 7907. Socket CAN resources
637 from PEAK Systemtechnik support the CAN netdevice driver model 791-----------------------
638 since Linux driver v6.0: http://www.peak-system.com/linux/index.htm
639 792
640 Please check the Mailing Lists on the berlios OSS project website. 793 You can find further resources for Socket CAN like user space tools,
794 support for old kernel versions, more drivers, mailing lists, etc.
795 at the BerliOS OSS project website for Socket CAN:
641 796
642 6.6 todo 797 http://developer.berlios.de/projects/socketcan
643 798
644 The configuration interface for CAN network drivers is still an open 799 If you have questions, bug fixes, etc., don't hesitate to post them to
645 issue that has not been finalized in the socketcan project. Also the 800 the Socketcan-Users mailing list. But please search the archives first.
646 idea of having a library module (candev.ko) that holds functions
647 that are needed by all CAN netdevices is not ready to ship.
648 Your contribution is welcome.
649 801
6507. Credits 8028. Credits
651---------- 803----------
652 804
653 Oliver Hartkopp (PF_CAN core, filters, drivers, bcm) 805 Oliver Hartkopp (PF_CAN core, filters, drivers, bcm, SJA1000 driver)
654 Urs Thuermann (PF_CAN core, kernel integration, socket interfaces, raw, vcan) 806 Urs Thuermann (PF_CAN core, kernel integration, socket interfaces, raw, vcan)
655 Jan Kizka (RT-SocketCAN core, Socket-API reconciliation) 807 Jan Kizka (RT-SocketCAN core, Socket-API reconciliation)
656 Wolfgang Grandegger (RT-SocketCAN core & drivers, Raw Socket-API reviews) 808 Wolfgang Grandegger (RT-SocketCAN core & drivers, Raw Socket-API reviews,
809 CAN device driver interface, MSCAN driver)
657 Robert Schwebel (design reviews, PTXdist integration) 810 Robert Schwebel (design reviews, PTXdist integration)
658 Marc Kleine-Budde (design reviews, Kernel 2.6 cleanups, drivers) 811 Marc Kleine-Budde (design reviews, Kernel 2.6 cleanups, drivers)
659 Benedikt Spranger (reviews) 812 Benedikt Spranger (reviews)
660 Thomas Gleixner (LKML reviews, coding style, posting hints) 813 Thomas Gleixner (LKML reviews, coding style, posting hints)
661 Andrey Volkov (kernel subtree structure, ioctls, mscan driver) 814 Andrey Volkov (kernel subtree structure, ioctls, MSCAN driver)
662 Matthias Brukner (first SJA1000 CAN netdevice implementation Q2/2003) 815 Matthias Brukner (first SJA1000 CAN netdevice implementation Q2/2003)
663 Klaus Hitschler (PEAK driver integration) 816 Klaus Hitschler (PEAK driver integration)
664 Uwe Koppe (CAN netdevices with PF_PACKET approach) 817 Uwe Koppe (CAN netdevices with PF_PACKET approach)
665 Michael Schulze (driver layer loopback requirement, RT CAN drivers review) 818 Michael Schulze (driver layer loopback requirement, RT CAN drivers review)
819 Pavel Pisa (Bit-timing calculation)
820 Sascha Hauer (SJA1000 platform driver)
821 Sebastian Haas (SJA1000 EMS PCI driver)
822 Markus Plessing (SJA1000 EMS PCI driver)
823 Per Dalen (SJA1000 Kvaser PCI driver)
824 Sam Ravnborg (reviews, coding style, kbuild help)
diff --git a/Documentation/networking/dm9000.txt b/Documentation/networking/dm9000.txt
index 65df3dea5561..5552e2e575c5 100644
--- a/Documentation/networking/dm9000.txt
+++ b/Documentation/networking/dm9000.txt
@@ -129,7 +129,7 @@ PHY Link state polling
129---------------------- 129----------------------
130 130
131The driver keeps track of the link state and informs the network core 131The driver keeps track of the link state and informs the network core
132about link (carrier) availablilty. This is managed by several methods 132about link (carrier) availability. This is managed by several methods
133depending on the version of the chip and on which PHY is being used. 133depending on the version of the chip and on which PHY is being used.
134 134
135For the internal PHY, the original (and currently default) method is 135For the internal PHY, the original (and currently default) method is
diff --git a/Documentation/networking/ieee802154.txt b/Documentation/networking/ieee802154.txt
new file mode 100644
index 000000000000..a0280ad2edc9
--- /dev/null
+++ b/Documentation/networking/ieee802154.txt
@@ -0,0 +1,76 @@
1
2 Linux IEEE 802.15.4 implementation
3
4
5Introduction
6============
7
8The Linux-ZigBee project goal is to provide complete implementation
9of IEEE 802.15.4 / ZigBee / 6LoWPAN protocols. IEEE 802.15.4 is a stack
10of protocols for organizing Low-Rate Wireless Personal Area Networks.
11
12Currently only IEEE 802.15.4 layer is implemented. We have chosen
13to use plain Berkeley socket API, the generic Linux networking stack
14to transfer IEEE 802.15.4 messages and a special protocol over genetlink
15for configuration/management.
16
17
18Socket API
19==========
20
21int sd = socket(PF_IEEE802154, SOCK_DGRAM, 0);
22.....
23
24The address family, socket addresses etc. are defined in the
25include/net/ieee802154/af_ieee802154.h header or in the special header
26in our userspace package (see either linux-zigbee sourceforge download page
27or git tree at git://linux-zigbee.git.sourceforge.net/gitroot/linux-zigbee).
28
29One can use SOCK_RAW for passing raw data towards device xmit function. YMMV.
30
31
32MLME - MAC Level Management
33============================
34
35Most of IEEE 802.15.4 MLME interfaces are directly mapped on netlink commands.
36See the include/net/ieee802154/nl802154.h header. Our userspace tools package
37(see above) provides CLI configuration utility for radio interfaces and simple
38coordinator for IEEE 802.15.4 networks as example users of the MLME protocol.
39
40
41Kernel side
42=============
43
44Like with WiFi, there are several types of devices implementing IEEE 802.15.4.
451) 'HardMAC'. The MAC layer is implemented in the device itself, the device
46 exports MLME and data API.
472) 'SoftMAC' or just radio. These types of devices are just radio transceivers
48 possibly with some kinds of acceleration like automatic CRC computation and
49 comparison, automagic ACK handling, address matching, etc.
50
51Those types of devices require different approaches to be hooked into the Linux kernel.
52
53
54HardMAC
55=======
56
57See the header include/net/ieee802154/netdevice.h. You have to implement Linux
58net_device, with .type = ARPHRD_IEEE802154. Data is exchanged with socket family
59code via plain sk_buffs. The control block of sk_buffs will contain additional
60info as described in the struct ieee802154_mac_cb.
61
62To hook the MLME interface you have to populate the ml_priv field of your
63net_device with a pointer to struct ieee802154_mlme_ops instance. All fields are
64required.
65
66We provide an example of simple HardMAC driver at drivers/ieee802154/fakehard.c
67
68
69SoftMAC
70=======
71
72We are going to provide an intermediate layer implementing IEEE 802.15.4 MAC
73in software. This is currently WIP.
74
75See header include/net/ieee802154/mac802154.h and several drivers in
76drivers/ieee802154/
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ec5de02f543f..8be76235fe67 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -168,7 +168,16 @@ tcp_dsack - BOOLEAN
168 Allows TCP to send "duplicate" SACKs. 168 Allows TCP to send "duplicate" SACKs.
169 169
170tcp_ecn - BOOLEAN 170tcp_ecn - BOOLEAN
171 Enable Explicit Congestion Notification in TCP. 171 Enable Explicit Congestion Notification (ECN) in TCP. ECN is only
172 used when both ends of the TCP flow support it. It is useful to
173 avoid losses due to congestion (when the bottleneck router supports
174 ECN).
175 Possible values are:
176 0 disable ECN
177 1 ECN enabled
178 2 Only server-side ECN enabled. If the other end does
179 not support ECN, behavior is like with ECN disabled.
180 Default: 2
172 181
173tcp_fack - BOOLEAN 182tcp_fack - BOOLEAN
174 Enable FACK congestion avoidance and fast retransmission. 183 Enable FACK congestion avoidance and fast retransmission.
@@ -1048,6 +1057,13 @@ disable_ipv6 - BOOLEAN
1048 address. 1057 address.
1049 Default: FALSE (enable IPv6 operation) 1058 Default: FALSE (enable IPv6 operation)
1050 1059
1060 When this value is changed from 1 to 0 (IPv6 is being enabled),
1061 it will dynamically create a link-local address on the given
1062 interface and start Duplicate Address Detection, if necessary.
1063
1064 When this value is changed from 0 to 1 (IPv6 is being disabled),
1065 it will dynamically delete all addresses on the given interface.
1066
1051accept_dad - INTEGER 1067accept_dad - INTEGER
1052 Whether to accept DAD (Duplicate Address Detection). 1068 Whether to accept DAD (Duplicate Address Detection).
1053 0: Disable DAD 1069 0: Disable DAD
@@ -1266,13 +1282,22 @@ sctp_rmem - vector of 3 INTEGERs: min, default, max
1266sctp_wmem - vector of 3 INTEGERs: min, default, max 1282sctp_wmem - vector of 3 INTEGERs: min, default, max
1267 See tcp_wmem for a description. 1283 See tcp_wmem for a description.
1268 1284
1269UNDOCUMENTED:
1270 1285
1271/proc/sys/net/core/* 1286/proc/sys/net/core/*
1272 dev_weight FIXME 1287dev_weight - INTEGER
1288 The maximum number of packets that the kernel can handle on a NAPI
1289 interrupt. It is a per-CPU variable.
1290
1291 Default: 64
1273 1292
1274/proc/sys/net/unix/* 1293/proc/sys/net/unix/*
1275 max_dgram_qlen FIXME 1294max_dgram_qlen - INTEGER
1295 The maximum length of dgram socket receive queue
1296
1297 Default: 10
1298
1299
1300UNDOCUMENTED:
1276 1301
1277/proc/sys/net/irda/* 1302/proc/sys/net/irda/*
1278 fast_poll_increase FIXME 1303 fast_poll_increase FIXME
diff --git a/Documentation/networking/ipv6.txt b/Documentation/networking/ipv6.txt
index 268e5c103dd8..9fd7e21296c8 100644
--- a/Documentation/networking/ipv6.txt
+++ b/Documentation/networking/ipv6.txt
@@ -33,3 +33,40 @@ disable
33 33
34 A reboot is required to enable IPv6. 34 A reboot is required to enable IPv6.
35 35
36autoconf
37
38 Specifies whether to enable IPv6 address autoconfiguration
39 on all interfaces. This might be used when one does not wish
40 for addresses to be automatically generated from prefixes
41 received in Router Advertisements.
42
43 The possible values and their effects are:
44
45 0
46 IPv6 address autoconfiguration is disabled on all interfaces.
47
48 Only the IPv6 loopback address (::1) and link-local addresses
49 will be added to interfaces.
50
51 1
52 IPv6 address autoconfiguration is enabled on all interfaces.
53
54 This is the default value.
55
56disable_ipv6
57
58 Specifies whether to disable IPv6 on all interfaces.
59 This might be used when no IPv6 addresses are desired.
60
61 The possible values and their effects are:
62
63 0
64 IPv6 is enabled on all interfaces.
65
66 This is the default value.
67
68 1
69 IPv6 is disabled on all interfaces.
70
71 No IPv6 addresses will be added to interfaces.
72
diff --git a/Documentation/networking/l2tp.txt b/Documentation/networking/l2tp.txt
index 2451f551c505..63214b280e00 100644
--- a/Documentation/networking/l2tp.txt
+++ b/Documentation/networking/l2tp.txt
@@ -158,7 +158,7 @@ Sample Userspace Code
158 } 158 }
159 return 0; 159 return 0;
160 160
161Miscellanous 161Miscellaneous
162============ 162============
163 163
164The PPPoL2TP driver was developed as part of the OpenL2TP project by 164The PPPoL2TP driver was developed as part of the OpenL2TP project by
diff --git a/Documentation/networking/mac80211-injection.txt b/Documentation/networking/mac80211-injection.txt
index 84906ef3ed6e..b30e81ad5307 100644
--- a/Documentation/networking/mac80211-injection.txt
+++ b/Documentation/networking/mac80211-injection.txt
@@ -12,38 +12,22 @@ following format:
12The radiotap format is discussed in 12The radiotap format is discussed in
13./Documentation/networking/radiotap-headers.txt. 13./Documentation/networking/radiotap-headers.txt.
14 14
15Despite 13 radiotap argument types are currently defined, most only make sense 15Despite many radiotap parameters being currently defined, most only make sense
16to appear on received packets. The following information is parsed from the 16to appear on received packets. The following information is parsed from the
17radiotap headers and used to control injection: 17radiotap headers and used to control injection:
18 18
19 * IEEE80211_RADIOTAP_RATE
20
21 rate in 500kbps units, automatic if invalid or not present
22
23
24 * IEEE80211_RADIOTAP_ANTENNA
25
26 antenna to use, automatic if not present
27
28
29 * IEEE80211_RADIOTAP_DBM_TX_POWER
30
31 transmit power in dBm, automatic if not present
32
33
34 * IEEE80211_RADIOTAP_FLAGS 19 * IEEE80211_RADIOTAP_FLAGS
35 20
36 IEEE80211_RADIOTAP_F_FCS: FCS will be removed and recalculated 21 IEEE80211_RADIOTAP_F_FCS: FCS will be removed and recalculated
37 IEEE80211_RADIOTAP_F_WEP: frame will be encrypted if key available 22 IEEE80211_RADIOTAP_F_WEP: frame will be encrypted if key available
38 IEEE80211_RADIOTAP_F_FRAG: frame will be fragmented if longer than the 23 IEEE80211_RADIOTAP_F_FRAG: frame will be fragmented if longer than the
39 current fragmentation threshold. Note that 24 current fragmentation threshold.
40 this flag is only reliable when software 25
41 fragmentation is enabled)
42 26
43The injection code can also skip all other currently defined radiotap fields 27The injection code can also skip all other currently defined radiotap fields
44facilitating replay of captured radiotap headers directly. 28facilitating replay of captured radiotap headers directly.
45 29
46Here is an example valid radiotap header defining these three parameters 30Here is an example valid radiotap header defining some parameters
47 31
48 0x00, 0x00, // <-- radiotap version 32 0x00, 0x00, // <-- radiotap version
49 0x0b, 0x00, // <- radiotap header length 33 0x0b, 0x00, // <- radiotap header length
@@ -72,8 +56,8 @@ interface), along the following lines:
72... 56...
73 r = pcap_inject(ppcap, u8aSendBuffer, nLength); 57 r = pcap_inject(ppcap, u8aSendBuffer, nLength);
74 58
75You can also find sources for a complete inject test applet here: 59You can also find a link to a complete inject application here:
76 60
77http://penumbra.warmcat.com/_twk/tiki-index.php?page=packetspammer 61http://wireless.kernel.org/en/users/Documentation/packetspammer
78 62
79Andy Green <andy@warmcat.com> 63Andy Green <andy@warmcat.com>
diff --git a/Documentation/networking/netdevices.txt b/Documentation/networking/netdevices.txt
index a2ab6a0b116d..87b3d15f523a 100644
--- a/Documentation/networking/netdevices.txt
+++ b/Documentation/networking/netdevices.txt
@@ -74,7 +74,7 @@ dev->hard_start_xmit:
74 for this and return NETDEV_TX_LOCKED when the spin lock fails. 74 for this and return NETDEV_TX_LOCKED when the spin lock fails.
75 The locking there should also properly protect against 75 The locking there should also properly protect against
76 set_multicast_list. Note that the use of NETIF_F_LLTX is deprecated. 76 set_multicast_list. Note that the use of NETIF_F_LLTX is deprecated.
77 Dont use it for new drivers. 77 Don't use it for new drivers.
78 78
79 Context: Process with BHs disabled or BH (timer), 79 Context: Process with BHs disabled or BH (timer),
80 will be called with interrupts disabled by netconsole. 80 will be called with interrupts disabled by netconsole.
diff --git a/Documentation/networking/operstates.txt b/Documentation/networking/operstates.txt
index c9074f9b78bb..1a77a3cfae54 100644
--- a/Documentation/networking/operstates.txt
+++ b/Documentation/networking/operstates.txt
@@ -38,9 +38,6 @@ ifinfomsg::if_flags & IFF_LOWER_UP:
38ifinfomsg::if_flags & IFF_DORMANT: 38ifinfomsg::if_flags & IFF_DORMANT:
39 Driver has signaled netif_dormant_on() 39 Driver has signaled netif_dormant_on()
40 40
41These interface flags can also be queried without netlink using the
42SIOCGIFFLAGS ioctl.
43
44TLV IFLA_OPERSTATE 41TLV IFLA_OPERSTATE
45 42
46contains RFC2863 state of the interface in numeric representation: 43contains RFC2863 state of the interface in numeric representation:
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index 07c53d596035..a22fd85e3796 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -4,16 +4,18 @@
4 4
5This file documents the CONFIG_PACKET_MMAP option available with the PACKET 5This file documents the CONFIG_PACKET_MMAP option available with the PACKET
6socket interface on 2.4 and 2.6 kernels. This type of sockets is used for 6socket interface on 2.4 and 2.6 kernels. This type of sockets is used for
7capture network traffic with utilities like tcpdump or any other that uses 7capture network traffic with utilities like tcpdump or any other that needs
8the libpcap library. 8raw access to network interface.
9
10You can find the latest version of this document at
11 9
10You can find the latest version of this document at:
12 http://pusa.uv.es/~ulisses/packet_mmap/ 11 http://pusa.uv.es/~ulisses/packet_mmap/
13 12
14Please send me your comments to 13Howto can be found at:
14 http://wiki.gnu-log.net (packet_mmap)
15 15
16Please send your comments to
16 Ulisses Alonso Camaró <uaca@i.hate.spam.alumni.uv.es> 17 Ulisses Alonso Camaró <uaca@i.hate.spam.alumni.uv.es>
18 Johann Baudy <johann.baudy@gnu-log.net>
17 19
18------------------------------------------------------------------------------- 20-------------------------------------------------------------------------------
19+ Why use PACKET_MMAP 21+ Why use PACKET_MMAP
@@ -25,19 +27,24 @@ to capture each packet, it requires two if you want to get packet's
25timestamp (like libpcap always does). 27timestamp (like libpcap always does).
26 28
27In the other hand PACKET_MMAP is very efficient. PACKET_MMAP provides a size 29In the other hand PACKET_MMAP is very efficient. PACKET_MMAP provides a size
28configurable circular buffer mapped in user space. This way reading packets just 30configurable circular buffer mapped in user space that can be used to either
29needs to wait for them, most of the time there is no need to issue a single 31send or receive packets. This way reading packets just needs to wait for them,
30system call. By using a shared buffer between the kernel and the user 32most of the time there is no need to issue a single system call. Concerning
31also has the benefit of minimizing packet copies. 33transmission, multiple packets can be sent through one system call to get the
32 34highest bandwidth.
33It's fine to use PACKET_MMAP to improve the performance of the capture process, 35By using a shared buffer between the kernel and the user also has the benefit
34but it isn't everything. At least, if you are capturing at high speeds (this 36of minimizing packet copies.
35is relative to the cpu speed), you should check if the device driver of your 37
36network interface card supports some sort of interrupt load mitigation or 38It's fine to use PACKET_MMAP to improve the performance of the capture and
37(even better) if it supports NAPI, also make sure it is enabled. 39transmission process, but it isn't everything. At least, if you are capturing
40at high speeds (this is relative to the cpu speed), you should check if the
41device driver of your network interface card supports some sort of interrupt
42load mitigation or (even better) if it supports NAPI, also make sure it is
43enabled. For transmission, check the MTU (Maximum Transmission Unit) used and
44supported by devices of your network.
38 45
39-------------------------------------------------------------------------------- 46--------------------------------------------------------------------------------
40+ How to use CONFIG_PACKET_MMAP 47+ How to use CONFIG_PACKET_MMAP to improve capture process
41-------------------------------------------------------------------------------- 48--------------------------------------------------------------------------------
42 49
43From the user standpoint, you should use the higher level libpcap library, which 50From the user standpoint, you should use the higher level libpcap library, which
@@ -57,7 +64,7 @@ the low level details or want to improve libpcap by including PACKET_MMAP
57support. 64support.
58 65
59-------------------------------------------------------------------------------- 66--------------------------------------------------------------------------------
60+ How to use CONFIG_PACKET_MMAP directly 67+ How to use CONFIG_PACKET_MMAP directly to improve capture process
61-------------------------------------------------------------------------------- 68--------------------------------------------------------------------------------
62 69
63From the system calls stand point, the use of PACKET_MMAP involves 70From the system calls stand point, the use of PACKET_MMAP involves
@@ -66,6 +73,7 @@ the following process:
66 73
67[setup] socket() -------> creation of the capture socket 74[setup] socket() -------> creation of the capture socket
68 setsockopt() ---> allocation of the circular buffer (ring) 75 setsockopt() ---> allocation of the circular buffer (ring)
76 option: PACKET_RX_RING
69 mmap() ---------> mapping of the allocated buffer to the 77 mmap() ---------> mapping of the allocated buffer to the
70 user process 78 user process
71 79
@@ -97,13 +105,75 @@ also the mapping of the circular buffer in the user process and
97the use of this buffer. 105the use of this buffer.
98 106
99-------------------------------------------------------------------------------- 107--------------------------------------------------------------------------------
108+ How to use CONFIG_PACKET_MMAP directly to improve transmission process
109--------------------------------------------------------------------------------
110Transmission process is similar to capture as shown below.
111
112[setup] socket() -------> creation of the transmission socket
113 setsockopt() ---> allocation of the circular buffer (ring)
114 option: PACKET_TX_RING
115 bind() ---------> bind transmission socket with a network interface
116 mmap() ---------> mapping of the allocated buffer to the
117 user process
118
119[transmission] poll() ---------> wait for free packets (optional)
120 send() ---------> send all packets that are set as ready in
121 the ring
122 The flag MSG_DONTWAIT can be used to return
123 before end of transfer.
124
125[shutdown] close() --------> destruction of the transmission socket and
126 deallocation of all associated resources.
127
128Binding the socket to your network interface is mandatory (with zero copy) to
129know the header size of frames used in the circular buffer.
130
131As with capture, each frame contains two parts:
132
133 --------------------
134| struct tpacket_hdr | Header. It contains the status of
135| | this frame
136|--------------------|
137| data buffer |
138. . Data that will be sent over the network interface.
139. .
140 --------------------
141
142 bind() associates the socket to your network interface thanks to
143 sll_ifindex parameter of struct sockaddr_ll.
144
145 Initialization example:
146
147 struct sockaddr_ll my_addr;
148 struct ifreq s_ifr;
149 ...
150
151 strncpy (s_ifr.ifr_name, "eth0", sizeof(s_ifr.ifr_name));
152
153 /* get interface index of eth0 */
154 ioctl(this->socket, SIOCGIFINDEX, &s_ifr);
155
156 /* fill sockaddr_ll struct to prepare binding */
157 my_addr.sll_family = AF_PACKET;
158 my_addr.sll_protocol = ETH_P_ALL;
159 my_addr.sll_ifindex = s_ifr.ifr_ifindex;
160
161 /* bind socket to eth0 */
162 bind(this->socket, (struct sockaddr *)&my_addr, sizeof(struct sockaddr_ll));
163
164 A complete tutorial is available at: http://wiki.gnu-log.net/
165
166--------------------------------------------------------------------------------
100+ PACKET_MMAP settings 167+ PACKET_MMAP settings
101-------------------------------------------------------------------------------- 168--------------------------------------------------------------------------------
102 169
103 170
104To setup PACKET_MMAP from user level code is done with a call like 171To setup PACKET_MMAP from user level code is done with a call like
105 172
173 - Capture process
106 setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req)) 174 setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req))
175 - Transmission process
176 setsockopt(fd, SOL_PACKET, PACKET_TX_RING, (void *) &req, sizeof(req))
107 177
108The most significant argument in the previous call is the req parameter, 178The most significant argument in the previous call is the req parameter,
109this parameter must to have the following structure: 179this parameter must to have the following structure:
@@ -117,11 +187,11 @@ this parameter must to have the following structure:
117 }; 187 };
118 188
119This structure is defined in /usr/include/linux/if_packet.h and establishes a 189This structure is defined in /usr/include/linux/if_packet.h and establishes a
120circular buffer (ring) of unswappable memory mapped in the capture process. 190circular buffer (ring) of unswappable memory.
121Being mapped in the capture process allows reading the captured frames and 191Being mapped in the capture process allows reading the captured frames and
122related meta-information like timestamps without requiring a system call. 192related meta-information like timestamps without requiring a system call.
123 193
124Captured frames are grouped in blocks. Each block is a physically contiguous 194Frames are grouped in blocks. Each block is a physically contiguous
125region of memory and holds tp_block_size/tp_frame_size frames. The total number 195region of memory and holds tp_block_size/tp_frame_size frames. The total number
126of blocks is tp_block_nr. Note that tp_frame_nr is a redundant parameter because 196of blocks is tp_block_nr. Note that tp_frame_nr is a redundant parameter because
127 197
@@ -336,6 +406,7 @@ struct tpacket_hdr). If this field is 0 means that the frame is ready
336to be used for the kernel, If not, there is a frame the user can read 406to be used for the kernel, If not, there is a frame the user can read
337and the following flags apply: 407and the following flags apply:
338 408
409+++ Capture process:
339 from include/linux/if_packet.h 410 from include/linux/if_packet.h
340 411
341 #define TP_STATUS_COPY 2 412 #define TP_STATUS_COPY 2
@@ -391,6 +462,37 @@ packets are in the ring:
391It doesn't incur in a race condition to first check the status value and 462It doesn't incur in a race condition to first check the status value and
392then poll for frames. 463then poll for frames.
393 464
465
466++ Transmission process
467Those defines are also used for transmission:
468
469 #define TP_STATUS_AVAILABLE 0 // Frame is available
470 #define TP_STATUS_SEND_REQUEST 1 // Frame will be sent on next send()
471 #define TP_STATUS_SENDING 2 // Frame is currently in transmission
472 #define TP_STATUS_WRONG_FORMAT 4 // Frame format is not correct
473
474First, the kernel initializes all frames to TP_STATUS_AVAILABLE. To send a
475packet, the user fills a data buffer of an available frame, sets tp_len to
476current data buffer size and sets its status field to TP_STATUS_SEND_REQUEST.
477This can be done on multiple frames. Once the user is ready to transmit, it
478calls send(). Then all buffers with status equal to TP_STATUS_SEND_REQUEST are
479forwarded to the network device. The kernel updates each status of sent
480frames with TP_STATUS_SENDING until the end of transfer.
481At the end of each transfer, buffer status returns to TP_STATUS_AVAILABLE.
482
483 header->tp_len = in_i_size;
484 header->tp_status = TP_STATUS_SEND_REQUEST;
485 retval = send(this->socket, NULL, 0, 0);
486
487The user can also use poll() to check if a buffer is available:
488(status == TP_STATUS_SENDING)
489
490 struct pollfd pfd;
491 pfd.fd = fd;
492 pfd.revents = 0;
493 pfd.events = POLLOUT;
494 retval = poll(&pfd, 1, timeout);
495
394-------------------------------------------------------------------------------- 496--------------------------------------------------------------------------------
395+ THANKS 497+ THANKS
396-------------------------------------------------------------------------------- 498--------------------------------------------------------------------------------
diff --git a/Documentation/networking/phonet.txt b/Documentation/networking/phonet.txt
index 6a07e45d4a93..6e8ce09f9c73 100644
--- a/Documentation/networking/phonet.txt
+++ b/Documentation/networking/phonet.txt
@@ -36,7 +36,7 @@ Phonet packets have a common header as follows:
36On Linux, the link-layer header includes the pn_media byte (see below). 36On Linux, the link-layer header includes the pn_media byte (see below).
37The next 7 bytes are part of the network-layer header. 37The next 7 bytes are part of the network-layer header.
38 38
39The device ID is split: the 6 higher-order bits consitute the device 39The device ID is split: the 6 higher-order bits constitute the device
40address, while the 2 lower-order bits are used for multiplexing, as are 40address, while the 2 lower-order bits are used for multiplexing, as are
41the 8-bit object identifiers. As such, Phonet can be considered as a 41the 8-bit object identifiers. As such, Phonet can be considered as a
42network layer with 6 bits of address space and 10 bits for transport 42network layer with 6 bits of address space and 10 bits for transport
diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt
index dcf31648414a..eaa1a25946c1 100644
--- a/Documentation/networking/regulatory.txt
+++ b/Documentation/networking/regulatory.txt
@@ -89,7 +89,7 @@ added to this document when its support is enabled.
89Device drivers who provide their own built regulatory domain 89Device drivers who provide their own built regulatory domain
90do not need a callback as the channels registered by them are 90do not need a callback as the channels registered by them are
91the only ones that will be allowed and therefore *additional* 91the only ones that will be allowed and therefore *additional*
92cannels cannot be enabled. 92channels cannot be enabled.
93 93
94Example code - drivers hinting an alpha2: 94Example code - drivers hinting an alpha2:
95------------------------------------------ 95------------------------------------------
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index 421e7d00ffd0..c9abbd86bc18 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -75,9 +75,6 @@ may need to apply in domain-specific ways to their devices:
75struct bus_type { 75struct bus_type {
76 ... 76 ...
77 int (*suspend)(struct device *dev, pm_message_t state); 77 int (*suspend)(struct device *dev, pm_message_t state);
78 int (*suspend_late)(struct device *dev, pm_message_t state);
79
80 int (*resume_early)(struct device *dev);
81 int (*resume)(struct device *dev); 78 int (*resume)(struct device *dev);
82}; 79};
83 80
@@ -226,20 +223,7 @@ The phases are seen by driver notifications issued in this order:
226 223
227 This call should handle parts of device suspend logic that require 224 This call should handle parts of device suspend logic that require
228 sleeping. It probably does work to quiesce the device which hasn't 225 sleeping. It probably does work to quiesce the device which hasn't
229 been abstracted into class.suspend() or bus.suspend_late(). 226 been abstracted into class.suspend().
230
231 3 bus.suspend_late(dev, message) is called with IRQs disabled, and
232 with only one CPU active. Until the bus.resume_early() phase
233 completes (see later), IRQs are not enabled again. This method
234 won't be exposed by all busses; for message based busses like USB,
235 I2C, or SPI, device interactions normally require IRQs. This bus
236 call may be morphed into a driver call with bus-specific parameters.
237
238 This call might save low level hardware state that might otherwise
239 be lost in the upcoming low power state, and actually put the
240 device into a low power state ... so that in some cases the device
241 may stay partly usable until this late. This "late" call may also
242 help when coping with hardware that behaves badly.
243 227
244The pm_message_t parameter is currently used to refine those semantics 228The pm_message_t parameter is currently used to refine those semantics
245(described later). 229(described later).
@@ -351,19 +335,11 @@ devices processing each phase's calls before the next phase begins.
351 335
352The phases are seen by driver notifications issued in this order: 336The phases are seen by driver notifications issued in this order:
353 337
354 1 bus.resume_early(dev) is called with IRQs disabled, and with 338 1 bus.resume(dev) reverses the effects of bus.suspend(). This may
355 only one CPU active. As with bus.suspend_late(), this method 339 be morphed into a device driver call with bus-specific parameters;
356 won't be supported on busses that require IRQs in order to 340 implementations may sleep.
357 interact with devices.
358
359 This reverses the effects of bus.suspend_late().
360
361 2 bus.resume(dev) is called next. This may be morphed into a device
362 driver call with bus-specific parameters; implementations may sleep.
363
364 This reverses the effects of bus.suspend().
365 341
366 3 class.resume(dev) is called for devices associated with a class 342 2 class.resume(dev) is called for devices associated with a class
367 that has such a method. Implementations may sleep. 343 that has such a method. Implementations may sleep.
368 344
369 This reverses the effects of class.suspend(), and would usually 345 This reverses the effects of class.suspend(), and would usually
diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt
index 82b7a43aadba..5f83fd24ea84 100644
--- a/Documentation/power/regulator/consumer.txt
+++ b/Documentation/power/regulator/consumer.txt
@@ -178,5 +178,5 @@ Consumers can uregister interest by calling :-
178int regulator_unregister_notifier(struct regulator *regulator, 178int regulator_unregister_notifier(struct regulator *regulator,
179 struct notifier_block *nb); 179 struct notifier_block *nb);
180 180
181Regulators use the kernel notifier framework to send event to thier interested 181Regulators use the kernel notifier framework to send event to their interested
182consumers. 182consumers.
diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt
index bdcb332bd7fb..0cded696ca01 100644
--- a/Documentation/power/regulator/overview.txt
+++ b/Documentation/power/regulator/overview.txt
@@ -119,7 +119,7 @@ Some terms used in this document:-
119 battery power, USB power) 119 battery power, USB power)
120 120
121 Regulator Domains: is the new current limit within the 121 Regulator Domains: is the new current limit within the
122 regulator operating parameters for input/ouput voltage. 122 regulator operating parameters for input/output voltage.
123 123
124 If the regulator request passes all the constraint tests 124 If the regulator request passes all the constraint tests
125 then the new regulator value is applied. 125 then the new regulator value is applied.
diff --git a/Documentation/power/s2ram.txt b/Documentation/power/s2ram.txt
index 2ebdc6091ce1..514b94fc931e 100644
--- a/Documentation/power/s2ram.txt
+++ b/Documentation/power/s2ram.txt
@@ -63,7 +63,7 @@ hardware during resume operations where a value can be set that will
63survive a reboot. 63survive a reboot.
64 64
65Consequence is that after a resume (even if it is successful) your system 65Consequence is that after a resume (even if it is successful) your system
66clock will have a value corresponding to the magic mumber instead of the 66clock will have a value corresponding to the magic number instead of the
67correct date/time! It is therefore advisable to use a program like ntp-date 67correct date/time! It is therefore advisable to use a program like ntp-date
68or rdate to reset the correct date/time from an external time source when 68or rdate to reset the correct date/time from an external time source when
69using this trace option. 69using this trace option.
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
index 7b99636564c8..b967cd9137d6 100644
--- a/Documentation/power/userland-swsusp.txt
+++ b/Documentation/power/userland-swsusp.txt
@@ -109,7 +109,7 @@ unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are
109still frozen when the device is being closed). 109still frozen when the device is being closed).
110 110
111Currently it is assumed that the userland utilities reading/writing the 111Currently it is assumed that the userland utilities reading/writing the
112snapshot image from/to the kernel will use a swap parition, called the resume 112snapshot image from/to the kernel will use a swap partition, called the resume
113partition, or a swap file as storage space (if a swap file is used, the resume 113partition, or a swap file as storage space (if a swap file is used, the resume
114partition is the partition that holds this file). However, this is not really 114partition is the partition that holds this file). However, this is not really
115required, as they can use, for example, a special (blank) suspend partition or 115required, as they can use, for example, a special (blank) suspend partition or
diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt
index d16b7a1c3793..8d999d862d0e 100644
--- a/Documentation/powerpc/booting-without-of.txt
+++ b/Documentation/powerpc/booting-without-of.txt
@@ -1356,7 +1356,7 @@ platforms are moved over to use the flattened-device-tree model.
1356 - phy-map : 1 cell, optional, bitmap of addresses to probe the PHY 1356 - phy-map : 1 cell, optional, bitmap of addresses to probe the PHY
1357 for, used if phy-address is absent. bit 0x00000001 is 1357 for, used if phy-address is absent. bit 0x00000001 is
1358 MDIO address 0. 1358 MDIO address 0.
1359 For Axon it can be absent, thouugh my current driver 1359 For Axon it can be absent, though my current driver
1360 doesn't handle phy-address yet so for now, keep 1360 doesn't handle phy-address yet so for now, keep
1361 0x00ffffff in it. 1361 0x00ffffff in it.
1362 - rx-fifo-size-gige : 1 cell, Rx fifo size in bytes for 1000 Mb/sec 1362 - rx-fifo-size-gige : 1 cell, Rx fifo size in bytes for 1000 Mb/sec
@@ -1438,7 +1438,7 @@ platforms are moved over to use the flattened-device-tree model.
1438 1438
1439 The Xilinx EDK toolchain ships with a set of IP cores (devices) for use 1439 The Xilinx EDK toolchain ships with a set of IP cores (devices) for use
1440 in Xilinx Spartan and Virtex FPGAs. The devices cover the whole range 1440 in Xilinx Spartan and Virtex FPGAs. The devices cover the whole range
1441 of standard device types (network, serial, etc.) and miscellanious 1441 of standard device types (network, serial, etc.) and miscellaneous
1442 devices (gpio, LCD, spi, etc). Also, since these devices are 1442 devices (gpio, LCD, spi, etc). Also, since these devices are
1443 implemented within the fpga fabric every instance of the device can be 1443 implemented within the fpga fabric every instance of the device can be
1444 synthesised with different options that change the behaviour. 1444 synthesised with different options that change the behaviour.
diff --git a/Documentation/powerpc/dts-bindings/can/sja1000.txt b/Documentation/powerpc/dts-bindings/can/sja1000.txt
new file mode 100644
index 000000000000..d6d209ded937
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/can/sja1000.txt
@@ -0,0 +1,53 @@
1Memory mapped SJA1000 CAN controller from NXP (formerly Philips)
2
3Required properties:
4
5- compatible : should be "nxp,sja1000".
6
7- reg : should specify the chip select, address offset and size required
8 to map the registers of the SJA1000. The size is usually 0x80.
9
10- interrupts: property with a value describing the interrupt source
11 (number and sensitivity) required for the SJA1000.
12
13Optional properties:
14
15- nxp,external-clock-frequency : Frequency of the external oscillator
16 clock in Hz. Note that the internal clock frequency used by the
17 SJA1000 is half of that value. If not specified, a default value
18 of 16000000 (16 MHz) is used.
19
20- nxp,tx-output-mode : operation mode of the TX output control logic:
21 <0x0> : bi-phase output mode
22 <0x1> : normal output mode (default)
23 <0x2> : test output mode
24 <0x3> : clock output mode
25
26- nxp,tx-output-config : TX output pin configuration:
27 <0x01> : TX0 invert
28 <0x02> : TX0 pull-down (default)
29 <0x04> : TX0 pull-up
30 <0x06> : TX0 push-pull
31 <0x08> : TX1 invert
32 <0x10> : TX1 pull-down
33 <0x20> : TX1 pull-up
34 <0x30> : TX1 push-pull
35
36- nxp,clock-out-frequency : clock frequency in Hz on the CLKOUT pin.
37 If not specified or if the specified value is 0, the CLKOUT pin
38 will be disabled.
39
40- nxp,no-comparator-bypass : Allows disabling the CAN input comparator.
41
42For further information, please have a look at the SJA1000 data sheet.
43
44Examples:
45
46can@3,100 {
47 compatible = "nxp,sja1000";
48 reg = <3 0x100 0x80>;
49 interrupts = <2 0>;
50 interrupt-parent = <&mpic>;
51 nxp,external-clock-frequency = <16000000>;
52};
53
diff --git a/Documentation/powerpc/dts-bindings/ecm.txt b/Documentation/powerpc/dts-bindings/ecm.txt
new file mode 100644
index 000000000000..f514f29c67d6
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/ecm.txt
@@ -0,0 +1,64 @@
1=====================================================================
2E500 LAW & Coherency Module Device Tree Binding
3Copyright (C) 2009 Freescale Semiconductor Inc.
4=====================================================================
5
6Local Access Window (LAW) Node
7
8The LAW node represents the region of CCSR space where local access
9windows are configured. For ECM based devices this is the first 4k
10of CCSR space that includes CCSRBAR, ALTCBAR, ALTCAR, BPTR, and some
11number of local access windows as specified by fsl,num-laws.
12
13PROPERTIES
14
15 - compatible
16 Usage: required
17 Value type: <string>
18 Definition: Must include "fsl,ecm-law"
19
20 - reg
21 Usage: required
22 Value type: <prop-encoded-array>
23 Definition: A standard property. The value specifies the
24 physical address offset and length of the CCSR space
25 registers.
26
27 - fsl,num-laws
28 Usage: required
29 Value type: <u32>
30 Definition: The value specifies the number of local access
31 windows for this device.
32
33=====================================================================
34
35E500 Coherency Module Node
36
37The E500 Coherency Module node represents the region of CCSR space where ECM config
38and error reporting registers exist, this is the second 4k (0x1000)
39of CCSR space.
40
41PROPERTIES
42
43 - compatible
44 Usage: required
45 Value type: <string>
46 Definition: Must include "fsl,CHIP-ecm", "fsl,ecm" where
47 CHIP is the processor (mpc8572, mpc8544, etc.)
48
49 - reg
50 Usage: required
51 Value type: <prop-encoded-array>
52 Definition: A standard property. The value specifies the
53 physical address offset and length of the CCSR space
54 registers.
55
56 - interrupts
57 Usage: required
58 Value type: <prop-encoded-array>
59
60 - interrupt-parent
61 Usage: required
62 Value type: <phandle>
63
64=====================================================================
diff --git a/Documentation/powerpc/dts-bindings/fsl/board.txt b/Documentation/powerpc/dts-bindings/fsl/board.txt
index 6c974d28eeb4..e8b5bc24d0ac 100644
--- a/Documentation/powerpc/dts-bindings/fsl/board.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/board.txt
@@ -38,7 +38,7 @@ Required properities:
38- reg : Should contain the address and the length of the GPIO bank 38- reg : Should contain the address and the length of the GPIO bank
39 register. 39 register.
40- #gpio-cells : Should be two. The first cell is the pin number and the 40- #gpio-cells : Should be two. The first cell is the pin number and the
41 second cell is used to specify optional paramters (currently unused). 41 second cell is used to specify optional parameters (currently unused).
42- gpio-controller : Marks the port as GPIO controller. 42- gpio-controller : Marks the port as GPIO controller.
43 43
44Example: 44Example:
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm.txt
index 088fc471e03a..160c752484b4 100644
--- a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm.txt
@@ -19,7 +19,7 @@ Example:
19 reg = <119c0 30>; 19 reg = <119c0 30>;
20 } 20 }
21 21
22* Properties common to mulitple CPM/QE devices 22* Properties common to multiple CPM/QE devices
23 23
24- fsl,cpm-command : This value is ORed with the opcode and command flag 24- fsl,cpm-command : This value is ORed with the opcode and command flag
25 to specify the device on which a CPM command operates. 25 to specify the device on which a CPM command operates.
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/gpio.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/gpio.txt
index 1815dfede1bc..349f79fd7076 100644
--- a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/gpio.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/gpio.txt
@@ -11,7 +11,7 @@ Required properties:
11 "fsl,cpm1-pario-bank-c", "fsl,cpm1-pario-bank-d", 11 "fsl,cpm1-pario-bank-c", "fsl,cpm1-pario-bank-d",
12 "fsl,cpm1-pario-bank-e", "fsl,cpm2-pario-bank" 12 "fsl,cpm1-pario-bank-e", "fsl,cpm2-pario-bank"
13- #gpio-cells : Should be two. The first cell is the pin number and the 13- #gpio-cells : Should be two. The first cell is the pin number and the
14 second cell is used to specify optional paramters (currently unused). 14 second cell is used to specify optional parameters (currently unused).
15- gpio-controller : Marks the port as GPIO controller. 15- gpio-controller : Marks the port as GPIO controller.
16 16
17Example of three SOC GPIO banks defined as gpio-controller nodes: 17Example of three SOC GPIO banks defined as gpio-controller nodes:
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt
index 78790d58dc2c..6e37be1eeb2d 100644
--- a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt
@@ -17,6 +17,9 @@ Required properties:
17- model : precise model of the QE, Can be "QE", "CPM", or "CPM2" 17- model : precise model of the QE, Can be "QE", "CPM", or "CPM2"
18- reg : offset and length of the device registers. 18- reg : offset and length of the device registers.
19- bus-frequency : the clock frequency for QUICC Engine. 19- bus-frequency : the clock frequency for QUICC Engine.
20- fsl,qe-num-riscs: define how many RISC engines the QE has.
21- fsl,qe-num-snums: define how many serial numbers (SNUM) the QE can use for the
22 threads.
20 23
21Recommended properties 24Recommended properties
22- brg-frequency : the internal clock source frequency for baud-rate 25- brg-frequency : the internal clock source frequency for baud-rate
diff --git a/Documentation/powerpc/dts-bindings/fsl/esdhc.txt b/Documentation/powerpc/dts-bindings/fsl/esdhc.txt
index 600846557763..5093ddf900da 100644
--- a/Documentation/powerpc/dts-bindings/fsl/esdhc.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/esdhc.txt
@@ -5,8 +5,7 @@ for MMC, SD, and SDIO types of memory cards.
5 5
6Required properties: 6Required properties:
7 - compatible : should be 7 - compatible : should be
8 "fsl,<chip>-esdhc", "fsl,mpc8379-esdhc" for MPC83xx processors. 8 "fsl,<chip>-esdhc", "fsl,esdhc"
9 "fsl,<chip>-esdhc", "fsl,mpc8536-esdhc" for MPC85xx processors.
10 - reg : should contain eSDHC registers location and length. 9 - reg : should contain eSDHC registers location and length.
11 - interrupts : should contain eSDHC interrupt. 10 - interrupts : should contain eSDHC interrupt.
12 - interrupt-parent : interrupt source phandle. 11 - interrupt-parent : interrupt source phandle.
@@ -15,7 +14,7 @@ Required properties:
15Example: 14Example:
16 15
17sdhci@2e000 { 16sdhci@2e000 {
18 compatible = "fsl,mpc8378-esdhc", "fsl,mpc8379-esdhc"; 17 compatible = "fsl,mpc8378-esdhc", "fsl,esdhc";
19 reg = <0x2e000 0x1000>; 18 reg = <0x2e000 0x1000>;
20 interrupts = <42 0x8>; 19 interrupts = <42 0x8>;
21 interrupt-parent = <&ipic>; 20 interrupt-parent = <&ipic>;
diff --git a/Documentation/powerpc/dts-bindings/fsl/mcm.txt b/Documentation/powerpc/dts-bindings/fsl/mcm.txt
new file mode 100644
index 000000000000..4ceda9b3b413
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/mcm.txt
@@ -0,0 +1,64 @@
1=====================================================================
2MPX LAW & Coherency Module Device Tree Binding
3Copyright (C) 2009 Freescale Semiconductor Inc.
4=====================================================================
5
6Local Access Window (LAW) Node
7
8The LAW node represents the region of CCSR space where local access
9windows are configured. For MCM based devices this is the first 4k
10of CCSR space that includes CCSRBAR, ALTCBAR, ALTCAR, BPTR, and some
11number of local access windows as specified by fsl,num-laws.
12
13PROPERTIES
14
15 - compatible
16 Usage: required
17 Value type: <string>
18 Definition: Must include "fsl,mcm-law"
19
20 - reg
21 Usage: required
22 Value type: <prop-encoded-array>
23 Definition: A standard property. The value specifies the
24 physical address offset and length of the CCSR space
25 registers.
26
27 - fsl,num-laws
28 Usage: required
29 Value type: <u32>
30 Definition: The value specifies the number of local access
31 windows for this device.
32
33=====================================================================
34
35MPX Coherency Module Node
36
37The MPX Coherency Module node represents the region of CCSR space where MCM config
38and error reporting registers exist, this is the second 4k (0x1000)
39of CCSR space.
40
41PROPERTIES
42
43 - compatible
44 Usage: required
45 Value type: <string>
46 Definition: Must include "fsl,CHIP-mcm", "fsl,mcm" where
47 CHIP is the processor (mpc8641, mpc8610, etc.)
48
49 - reg
50 Usage: required
51 Value type: <prop-encoded-array>
52 Definition: A standard property. The value specifies the
53 physical address offset and length of the CCSR space
54 registers.
55
56 - interrupts
57 Usage: required
58 Value type: <prop-encoded-array>
59
60 - interrupt-parent
61 Usage: required
62 Value type: <phandle>
63
64=====================================================================
diff --git a/Documentation/powerpc/dts-bindings/fsl/msi-pic.txt b/Documentation/powerpc/dts-bindings/fsl/msi-pic.txt
index b26b91992c55..bcc30bac6831 100644
--- a/Documentation/powerpc/dts-bindings/fsl/msi-pic.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/msi-pic.txt
@@ -1,6 +1,6 @@
1* Freescale MSI interrupt controller 1* Freescale MSI interrupt controller
2 2
3Reguired properities: 3Required properties:
4- compatible : compatible list, contains 2 entries, 4- compatible : compatible list, contains 2 entries,
5 first is "fsl,CHIP-msi", where CHIP is the processor(mpc8610, mpc8572, 5 first is "fsl,CHIP-msi", where CHIP is the processor(mpc8610, mpc8572,
6 etc.) and the second is "fsl,mpic-msi" or "fsl,ipic-msi" depending on 6 etc.) and the second is "fsl,mpic-msi" or "fsl,ipic-msi" depending on
diff --git a/Documentation/powerpc/dts-bindings/fsl/pmc.txt b/Documentation/powerpc/dts-bindings/fsl/pmc.txt
index 02f6f43ee1b7..07256b7ffcaa 100644
--- a/Documentation/powerpc/dts-bindings/fsl/pmc.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/pmc.txt
@@ -15,8 +15,8 @@ Properties:
15 compatible; all statements below that apply to "fsl,mpc8548-pmc" also 15 compatible; all statements below that apply to "fsl,mpc8548-pmc" also
16 apply to "fsl,mpc8641d-pmc". 16 apply to "fsl,mpc8641d-pmc".
17 17
18 Compatibility does not include bit assigments in SCCR/PMCDR/DEVDISR; these 18 Compatibility does not include bit assignments in SCCR/PMCDR/DEVDISR; these
19 bit assigments are indicated via the sleep specifier in each device's 19 bit assignments are indicated via the sleep specifier in each device's
20 sleep property. 20 sleep property.
21 21
22- reg: For devices compatible with "fsl,mpc8349-pmc", the first resource 22- reg: For devices compatible with "fsl,mpc8349-pmc", the first resource
diff --git a/Documentation/powerpc/qe_firmware.txt b/Documentation/powerpc/qe_firmware.txt
index 06da4d4b44f9..2031ddb33d09 100644
--- a/Documentation/powerpc/qe_firmware.txt
+++ b/Documentation/powerpc/qe_firmware.txt
@@ -225,7 +225,7 @@ For example, to match the 8323, revision 1.0:
225 soc.major = 1 225 soc.major = 1
226 soc.minor = 0 226 soc.minor = 0
227 227
228'padding' is neccessary for structure alignment. This field ensures that the 228'padding' is necessary for structure alignment. This field ensures that the
229'extended_modes' field is aligned on a 64-bit boundary. 229'extended_modes' field is aligned on a 64-bit boundary.
230 230
231'extended_modes' is a bitfield that defines special functionality which has an 231'extended_modes' is a bitfield that defines special functionality which has an
diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt
index 7224459b469e..aae8355d3166 100644
--- a/Documentation/rbtree.txt
+++ b/Documentation/rbtree.txt
@@ -131,8 +131,8 @@ Example:
131 } 131 }
132 132
133 /* Add new node and rebalance tree. */ 133 /* Add new node and rebalance tree. */
134 rb_link_node(data->node, parent, new); 134 rb_link_node(&data->node, parent, new);
135 rb_insert_color(data->node, root); 135 rb_insert_color(&data->node, root);
136 136
137 return TRUE; 137 return TRUE;
138 } 138 }
@@ -146,10 +146,10 @@ To remove an existing node from a tree, call:
146 146
147Example: 147Example:
148 148
149 struct mytype *data = mysearch(mytree, "walrus"); 149 struct mytype *data = mysearch(&mytree, "walrus");
150 150
151 if (data) { 151 if (data) {
152 rb_erase(data->node, mytree); 152 rb_erase(&data->node, &mytree);
153 myfree(data); 153 myfree(data);
154 } 154 }
155 155
@@ -188,5 +188,5 @@ Example:
188 188
189 struct rb_node *node; 189 struct rb_node *node;
190 for (node = rb_first(&mytree); node; node = rb_next(node)) 190 for (node = rb_first(&mytree); node; node = rb_next(node))
191 printk("key=%s\n", rb_entry(node, int, keystring)); 191 printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring);
192 192
diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt
index 4d3ee317a4a3..1b74b5f30af4 100644
--- a/Documentation/rfkill.txt
+++ b/Documentation/rfkill.txt
@@ -1,575 +1,136 @@
1rfkill - RF switch subsystem support 1rfkill - RF kill switch support
2==================================== 2===============================
3 3
41 Introduction 41. Introduction
52 Implementation details 52. Implementation details
63 Kernel driver guidelines 63. Kernel driver guidelines
73.1 wireless device drivers 74. Kernel API
83.2 platform/switch drivers 85. Userspace support
93.3 input device drivers
104 Kernel API
115 Userspace support
12 9
13 10
141. Introduction: 111. Introduction
15 12
16The rfkill switch subsystem exists to add a generic interface to circuitry that 13The rfkill subsystem provides a generic interface for disabling any radio
17can enable or disable the signal output of a wireless *transmitter* of any 14transmitter in the system. When a transmitter is blocked, it shall not
18type. By far, the most common use is to disable radio-frequency transmitters. 15radiate any power.
19 16
20Note that disabling the signal output means that the the transmitter is to be 17The subsystem also provides the ability to react to button presses and
21made to not emit any energy when "blocked". rfkill is not about blocking data 18disable all transmitters of a certain type (or all). This is intended for
22transmissions, it is about blocking energy emission. 19situations where transmitters need to be turned off, for example on
20aircraft.
23 21
24The rfkill subsystem offers support for keys and switches often found on
25laptops to enable wireless devices like WiFi and Bluetooth, so that these keys
26and switches actually perform an action in all wireless devices of a given type
27attached to the system.
28 22
29The buttons to enable and disable the wireless transmitters are important in
30situations where the user is for example using his laptop on a location where
31radio-frequency transmitters _must_ be disabled (e.g. airplanes).
32 23
33Because of this requirement, userspace support for the keys should not be made 242. Implementation details
34mandatory. Because userspace might want to perform some additional smarter
35tasks when the key is pressed, rfkill provides userspace the possibility to
36take over the task to handle the key events.
37
38===============================================================================
392: Implementation details
40 25
41The rfkill subsystem is composed of various components: the rfkill class, the 26The rfkill subsystem is composed of various components: the rfkill class, the
42rfkill-input module (an input layer handler), and some specific input layer 27rfkill-input module (an input layer handler), and some specific input layer
43events. 28events.
44 29
45The rfkill class provides kernel drivers with an interface that allows them to 30The rfkill class is provided for kernel drivers to register their radio
46know when they should enable or disable a wireless network device transmitter. 31transmitter with the kernel, provide methods for turning it on and off and,
47This is enabled by the CONFIG_RFKILL Kconfig option. 32optionally, letting the system know about hardware-disabled states that may
48 33be implemented on the device. This code is enabled with the CONFIG_RFKILL
49The rfkill class support makes sure userspace will be notified of all state 34Kconfig option, which drivers can "select".
50changes on rfkill devices through uevents. It provides a notification chain
51for interested parties in the kernel to also get notified of rfkill state
52changes in other drivers. It creates several sysfs entries which can be used
53by userspace. See section "Userspace support".
54
55The rfkill-input module provides the kernel with the ability to implement a
56basic response when the user presses a key or button (or toggles a switch)
57related to rfkill functionality. It is an in-kernel implementation of default
58policy of reacting to rfkill-related input events and neither mandatory nor
59required for wireless drivers to operate. It is enabled by the
60CONFIG_RFKILL_INPUT Kconfig option.
61
62rfkill-input is a rfkill-related events input layer handler. This handler will
63listen to all rfkill key events and will change the rfkill state of the
64wireless devices accordingly. With this option enabled userspace could either
65do nothing or simply perform monitoring tasks.
66
67The rfkill-input module also provides EPO (emergency power-off) functionality
68for all wireless transmitters. This function cannot be overridden, and it is
69always active. rfkill EPO is related to *_RFKILL_ALL input layer events.
70
71
72Important terms for the rfkill subsystem:
73
74In order to avoid confusion, we avoid the term "switch" in rfkill when it is
75referring to an electronic control circuit that enables or disables a
76transmitter. We reserve it for the physical device a human manipulates
77(which is an input device, by the way):
78
79rfkill switch:
80
81 A physical device a human manipulates. Its state can be perceived by
82 the kernel either directly (through a GPIO pin, ACPI GPE) or by its
83 effect on a rfkill line of a wireless device.
84
85rfkill controller:
86
87 A hardware circuit that controls the state of a rfkill line, which a
88 kernel driver can interact with *to modify* that state (i.e. it has
89 either write-only or read/write access).
90
91rfkill line:
92
93 An input channel (hardware or software) of a wireless device, which
94 causes a wireless transmitter to stop emitting energy (BLOCK) when it
95 is active. Point of view is extremely important here: rfkill lines are
96 always seen from the PoV of a wireless device (and its driver).
97
98soft rfkill line/software rfkill line:
99
100 A rfkill line the wireless device driver can directly change the state
101 of. Related to rfkill_state RFKILL_STATE_SOFT_BLOCKED.
102
103hard rfkill line/hardware rfkill line:
104
105 A rfkill line that works fully in hardware or firmware, and that cannot
106 be overridden by the kernel driver. The hardware device or the
107 firmware just exports its status to the driver, but it is read-only.
108 Related to rfkill_state RFKILL_STATE_HARD_BLOCKED.
109
110The enum rfkill_state describes the rfkill state of a transmitter:
111
112When a rfkill line or rfkill controller is in the RFKILL_STATE_UNBLOCKED state,
113the wireless transmitter (radio TX circuit for example) is *enabled*. When the
114it is in the RFKILL_STATE_SOFT_BLOCKED or RFKILL_STATE_HARD_BLOCKED, the
115wireless transmitter is to be *blocked* from operating.
116
117RFKILL_STATE_SOFT_BLOCKED indicates that a call to toggle_radio() can change
118that state. RFKILL_STATE_HARD_BLOCKED indicates that a call to toggle_radio()
119will not be able to change the state and will return with a suitable error if
120attempts are made to set the state to RFKILL_STATE_UNBLOCKED.
121
122RFKILL_STATE_HARD_BLOCKED is used by drivers to signal that the device is
123locked in the BLOCKED state by a hardwire rfkill line (typically an input pin
124that, when active, forces the transmitter to be disabled) which the driver
125CANNOT override.
126
127Full rfkill functionality requires two different subsystems to cooperate: the
128input layer and the rfkill class. The input layer issues *commands* to the
129entire system requesting that devices registered to the rfkill class change
130state. The way this interaction happens is not complex, but it is not obvious
131either:
132
133Kernel Input layer:
134
135 * Generates KEY_WWAN, KEY_WLAN, KEY_BLUETOOTH, SW_RFKILL_ALL, and
136 other such events when the user presses certain keys, buttons, or
137 toggles certain physical switches.
138
139 THE INPUT LAYER IS NEVER USED TO PROPAGATE STATUS, NOTIFICATIONS OR THE
140 KIND OF STUFF AN ON-SCREEN-DISPLAY APPLICATION WOULD REPORT. It is
141 used to issue *commands* for the system to change behaviour, and these
142 commands may or may not be carried out by some kernel driver or
143 userspace application. It follows that doing user feedback based only
144 on input events is broken, as there is no guarantee that an input event
145 will be acted upon.
146
147 Most wireless communication device drivers implementing rfkill
148 functionality MUST NOT generate these events, and have no reason to
149 register themselves with the input layer. Doing otherwise is a common
150 misconception. There is an API to propagate rfkill status change
151 information, and it is NOT the input layer.
152
153rfkill class:
154
155 * Calls a hook in a driver to effectively change the wireless
156 transmitter state;
157 * Keeps track of the wireless transmitter state (with help from
158 the driver);
159 * Generates userspace notifications (uevents) and a call to a
160 notification chain (kernel) when there is a wireless transmitter
161 state change;
162 * Connects a wireless communications driver with the common rfkill
163 control system, which, for example, allows actions such as
164 "switch all bluetooth devices offline" to be carried out by
165 userspace or by rfkill-input.
166
167 THE RFKILL CLASS NEVER ISSUES INPUT EVENTS. THE RFKILL CLASS DOES
168 NOT LISTEN TO INPUT EVENTS. NO DRIVER USING THE RFKILL CLASS SHALL
169 EVER LISTEN TO, OR ACT ON RFKILL INPUT EVENTS. Doing otherwise is
170 a layering violation.
171
172 Most wireless data communication drivers in the kernel have just to
173 implement the rfkill class API to work properly. Interfacing to the
174 input layer is not often required (and is very often a *bug*) on
175 wireless drivers.
176
177 Platform drivers often have to attach to the input layer to *issue*
178 (but never to listen to) rfkill events for rfkill switches, and also to
179 the rfkill class to export a control interface for the platform rfkill
180 controllers to the rfkill subsystem. This does NOT mean the rfkill
181 switch is attached to a rfkill class (doing so is almost always wrong).
182 It just means the same kernel module is the driver for different
183 devices (rfkill switches and rfkill controllers).
184
185
186Userspace input handlers (uevents) or kernel input handlers (rfkill-input):
187
188 * Implements the policy of what should happen when one of the input
189 layer events related to rfkill operation is received.
190 * Uses the sysfs interface (userspace) or private rfkill API calls
191 to tell the devices registered with the rfkill class to change
192 their state (i.e. translates the input layer event into real
193 action).
194
195 * rfkill-input implements EPO by handling EV_SW SW_RFKILL_ALL 0
196 (power off all transmitters) in a special way: it ignores any
197 overrides and local state cache and forces all transmitters to the
198 RFKILL_STATE_SOFT_BLOCKED state (including those which are already
199 supposed to be BLOCKED).
200 * rfkill EPO will remain active until rfkill-input receives an
201 EV_SW SW_RFKILL_ALL 1 event. While the EPO is active, transmitters
202 are locked in the blocked state (rfkill will refuse to unblock them).
203 * rfkill-input implements different policies that the user can
204 select for handling EV_SW SW_RFKILL_ALL 1. It will unlock rfkill,
205 and either do nothing (leave transmitters blocked, but now unlocked),
206 restore the transmitters to their state before the EPO, or unblock
207 them all.
208
209Userspace uevent handler or kernel platform-specific drivers hooked to the
210rfkill notifier chain:
211
212 * Taps into the rfkill notifier chain or to KOBJ_CHANGE uevents,
213 in order to know when a device that is registered with the rfkill
214 class changes state;
215 * Issues feedback notifications to the user;
216 * In the rare platforms where this is required, synthesizes an input
217 event to command all *OTHER* rfkill devices to also change their
218 statues when a specific rfkill device changes state.
219
220
221===============================================================================
2223: Kernel driver guidelines
223
224Remember: point-of-view is everything for a driver that connects to the rfkill
225subsystem. All the details below must be measured/perceived from the point of
226view of the specific driver being modified.
227
228The first thing one needs to know is whether his driver should be talking to
229the rfkill class or to the input layer. In rare cases (platform drivers), it
230could happen that you need to do both, as platform drivers often handle a
231variety of devices in the same driver.
232
233Do not mistake input devices for rfkill controllers. The only type of "rfkill
234switch" device that is to be registered with the rfkill class are those
235directly controlling the circuits that cause a wireless transmitter to stop
236working (or the software equivalent of them), i.e. what we call a rfkill
237controller. Every other kind of "rfkill switch" is just an input device and
238MUST NOT be registered with the rfkill class.
239
240A driver should register a device with the rfkill class when ALL of the
241following conditions are met (they define a rfkill controller):
242
2431. The device is/controls a data communications wireless transmitter;
244
2452. The kernel can interact with the hardware/firmware to CHANGE the wireless
246 transmitter state (block/unblock TX operation);
247
2483. The transmitter can be made to not emit any energy when "blocked":
249 rfkill is not about blocking data transmissions, it is about blocking
250 energy emission;
251
252A driver should register a device with the input subsystem to issue
253rfkill-related events (KEY_WLAN, KEY_BLUETOOTH, KEY_WWAN, KEY_WIMAX,
254SW_RFKILL_ALL, etc) when ALL of the folowing conditions are met:
255
2561. It is directly related to some physical device the user interacts with, to
257 command the O.S./firmware/hardware to enable/disable a data communications
258 wireless transmitter.
259
260 Examples of the physical device are: buttons, keys and switches the user
261 will press/touch/slide/switch to enable or disable the wireless
262 communication device.
263
2642. It is NOT slaved to another device, i.e. there is no other device that
265 issues rfkill-related input events in preference to this one.
266
267 Please refer to the corner cases and examples section for more details.
268
269When in doubt, do not issue input events. For drivers that should generate
270input events in some platforms, but not in others (e.g. b43), the best solution
271is to NEVER generate input events in the first place. That work should be
272deferred to a platform-specific kernel module (which will know when to generate
273events through the rfkill notifier chain) or to userspace. This avoids the
274usual maintenance problems with DMI whitelisting.
275
276
277Corner cases and examples:
278====================================
279
2801. If the device is an input device that, because of hardware or firmware,
281causes wireless transmitters to be blocked regardless of the kernel's will, it
282is still just an input device, and NOT to be registered with the rfkill class.
283
2842. If the wireless transmitter switch control is read-only, it is an input
285device and not to be registered with the rfkill class (and maybe not to be made
286an input layer event source either, see below).
287
2883. If there is some other device driver *closer* to the actual hardware the
289user interacted with (the button/switch/key) to issue an input event, THAT is
290the device driver that should be issuing input events.
291
292E.g:
293 [RFKILL slider switch] -- [GPIO hardware] -- [WLAN card rf-kill input]
294 (platform driver) (wireless card driver)
295
296The user is closer to the RFKILL slide switch plaform driver, so the driver
297which must issue input events is the platform driver looking at the GPIO
298hardware, and NEVER the wireless card driver (which is just a slave). It is
299very likely that there are other leaves than just the WLAN card rf-kill input
300(e.g. a bluetooth card, etc)...
301
302On the other hand, some embedded devices do this:
303
304 [RFKILL slider switch] -- [WLAN card rf-kill input]
305 (wireless card driver)
306
307In this situation, the wireless card driver *could* register itself as an input
308device and issue rf-kill related input events... but in order to AVOID the need
309for DMI whitelisting, the wireless card driver does NOT do it. Userspace (HAL)
310or a platform driver (that exists only on these embedded devices) will do the
311dirty job of issuing the input events.
312
313
314COMMON MISTAKES in kernel drivers, related to rfkill:
315====================================
316
3171. NEVER confuse input device keys and buttons with input device switches.
318
319 1a. Switches are always set or reset. They report the current state
320 (on position or off position).
321
322 1b. Keys and buttons are either in the pressed or not-pressed state, and
323 that's it. A "button" that latches down when you press it, and
324 unlatches when you press it again is in fact a switch as far as input
325 devices go.
326
327Add the SW_* events you need for switches, do NOT try to emulate a button using
328KEY_* events just because there is no such SW_* event yet. Do NOT try to use,
329for example, KEY_BLUETOOTH when you should be using SW_BLUETOOTH instead.
330
3312. Input device switches (sources of EV_SW events) DO store their current state
332(so you *must* initialize it by issuing a gratuitous input layer event on
333driver start-up and also when resuming from sleep), and that state CAN be
334queried from userspace through IOCTLs. There is no sysfs interface for this,
335but that doesn't mean you should break things trying to hook it to the rfkill
336class to get a sysfs interface :-)
337
3383. Do not issue *_RFKILL_ALL events by default, unless you are sure it is the
339correct event for your switch/button. These events are emergency power-off
340events when they are trying to turn the transmitters off. An example of an
341input device which SHOULD generate *_RFKILL_ALL events is the wireless-kill
342switch in a laptop which is NOT a hotkey, but a real sliding/rocker switch.
343An example of an input device which SHOULD NOT generate *_RFKILL_ALL events by
344default, is any sort of hot key that is type-specific (e.g. the one for WLAN).
345
346
3473.1 Guidelines for wireless device drivers
348------------------------------------------
349
350(in this text, rfkill->foo means the foo field of struct rfkill).
351
3521. Each independent transmitter in a wireless device (usually there is only one
353transmitter per device) should have a SINGLE rfkill class attached to it.
354
3552. If the device does not have any sort of hardware assistance to allow the
356driver to rfkill the device, the driver should emulate it by taking all actions
357required to silence the transmitter.
358
3593. If it is impossible to silence the transmitter (i.e. it still emits energy,
360even if it is just in brief pulses, when there is no data to transmit and there
361is no hardware support to turn it off) do NOT lie to the users. Do not attach
362it to a rfkill class. The rfkill subsystem does not deal with data
363transmission, it deals with energy emission. If the transmitter is emitting
364energy, it is not blocked in rfkill terms.
365
3664. It doesn't matter if the device has multiple rfkill input lines affecting
367the same transmitter, their combined state is to be exported as a single state
368per transmitter (see rule 1).
369
370This rule exists because users of the rfkill subsystem expect to get (and set,
371when possible) the overall transmitter rfkill state, not of a particular rfkill
372line.
373
3745. The wireless device driver MUST NOT leave the transmitter enabled during
375suspend and hibernation unless:
376 35
377 5.1. The transmitter has to be enabled for some sort of functionality 36The rfkill class code also notifies userspace of state changes; this is
378 like wake-on-wireless-packet or autonomous packed forwarding in a mesh 37achieved via uevents. It also provides some sysfs files for userspace to
379 network, and that functionality is enabled for this suspend/hibernation 38check the status of radio transmitters. See the "Userspace support" section
380 cycle. 39below.
381 40
382AND
383 41
384 5.2. The device was not on a user-requested BLOCKED state before 42The rfkill-input code implements a basic response to rfkill buttons -- it
385 the suspend (i.e. the driver must NOT unblock a device, not even 43implements turning on/off all devices of a certain class (or all).
386 to support wake-on-wireless-packet or remain in the mesh).
387 44
388In other words, there is absolutely no allowed scenario where a driver can 45When the device is hard-blocked (either by a call to rfkill_set_hw_state()
389automatically take action to unblock a rfkill controller (obviously, this deals 46or from query_hw_block) set_block() will be invoked but drivers can well
390with scenarios where soft-blocking or both soft and hard blocking is happening. 47ignore the method call since they can use the return value of the function
391Scenarios where hardware rfkill lines are the only ones blocking the 48rfkill_set_hw_state() to sync the software state instead of keeping track
392transmitter are outside of this rule, since the wireless device driver does not 49of calls to set_block().
393control its input hardware rfkill lines in the first place).
394 50
3956. During resume, rfkill will try to restore its previous state.
396 51
3977. After a rfkill class is suspended, it will *not* call rfkill->toggle_radio 52The entire functionality is spread over more than one subsystem:
398until it is resumed.
399 53
54 * The kernel input layer generates KEY_WWAN, KEY_WLAN etc. and
55 SW_RFKILL_ALL -- when the user presses a button. Drivers for radio
56 transmitters generally do not register to the input layer, unless the
57 device really provides an input device (i.e. a button that has no
58 effect other than generating a button press event)
400 59
401Example of a WLAN wireless driver connected to the rfkill subsystem: 60 * The rfkill-input code hooks up to these events and switches the soft-block
402-------------------------------------------------------------------- 61 of the various radio transmitters, depending on the button type.
403 62
404A certain WLAN card has one input pin that causes it to block the transmitter 63 * The rfkill drivers turn off/on their transmitters as requested.
405and makes the status of that input pin available (only for reading!) to the
406kernel driver. This is a hard rfkill input line (it cannot be overridden by
407the kernel driver).
408 64
409The card also has one PCI register that, if manipulated by the driver, causes 65 * The rfkill class will generate userspace notifications (uevents) to tell
410it to block the transmitter. This is a soft rfkill input line. 66 userspace what the current state is.
411 67
412It has also a thermal protection circuitry that shuts down its transmitter if
413the card overheats, and makes the status of that protection available (only for
414reading!) to the kernel driver. This is also a hard rfkill input line.
415 68
416If either one of these rfkill lines are active, the transmitter is blocked by
417the hardware and forced offline.
418 69
419The driver should allocate and attach to its struct device *ONE* instance of 703. Kernel driver guidelines
420the rfkill class (there is only one transmitter).
421 71
422It can implement the get_state() hook, and return RFKILL_STATE_HARD_BLOCKED if
423either one of its two hard rfkill input lines are active. If the two hard
424rfkill lines are inactive, it must return RFKILL_STATE_SOFT_BLOCKED if its soft
425rfkill input line is active. Only if none of the rfkill input lines are
426active, will it return RFKILL_STATE_UNBLOCKED.
427 72
428Since the device has a hardware rfkill line, it IS subject to state changes 73Drivers for radio transmitters normally implement only the rfkill class.
429external to rfkill. Therefore, the driver must make sure that it calls 74These drivers may not unblock the transmitter based on their own decisions; they
430rfkill_force_state() to keep the status always up-to-date, and it must do a 75should act on information provided by the rfkill class only.
431rfkill_force_state() on resume from sleep.
432 76
433Every time the driver gets a notification from the card that one of its rfkill 77Platform drivers might implement input devices if the rfkill button is just
434lines changed state (polling might be needed on badly designed cards that don't 78that, a button. If that button influences the hardware then you need to
435generate interrupts for such events), it recomputes the rfkill state as per 79implement an rfkill class instead. This also applies if the platform provides
436above, and calls rfkill_force_state() to update it. 80a way to turn on/off the transmitter(s).
437 81
438The driver should implement the toggle_radio() hook, that: 82During suspend/hibernation, transmitters should only be left enabled when
83wake-on-WLAN or similar functionality requires it and the device wasn't
84blocked before suspend/hibernate. Note that it may be necessary to update
85the rfkill subsystem's idea of what the current state is at resume time if
86the state may have changed over suspend.
439 87
4401. Returns an error if one of the hardware rfkill lines are active, and the
441caller asked for RFKILL_STATE_UNBLOCKED.
442 88
4432. Activates the soft rfkill line if the caller asked for state
444RFKILL_STATE_SOFT_BLOCKED. It should do this even if one of the hard rfkill
445lines are active, effectively double-blocking the transmitter.
446 89
4473. Deactivates the soft rfkill line if none of the hardware rfkill lines are 904. Kernel API
448active and the caller asked for RFKILL_STATE_UNBLOCKED.
449
450===============================================================================
4514: Kernel API
452 91
453To build a driver with rfkill subsystem support, the driver should depend on 92To build a driver with rfkill subsystem support, the driver should depend on
454(or select) the Kconfig symbol RFKILL; it should _not_ depend on RKFILL_INPUT. 93(or select) the Kconfig symbol RFKILL.
455 94
456The hardware the driver talks to may be write-only (where the current state 95The hardware the driver talks to may be write-only (where the current state
457of the hardware is unknown), or read-write (where the hardware can be queried 96of the hardware is unknown), or read-write (where the hardware can be queried
458about its current state). 97about its current state).
459 98
460The rfkill class will call the get_state hook of a device every time it needs 99Calling rfkill_set_hw_state() when a state change happens is required from
461to know the *real* current state of the hardware. This can happen often, but 100rfkill drivers that control devices that can be hard-blocked unless they also
462it does not do any polling, so it is not enough on hardware that is subject 101assign the poll_hw_block() callback (then the rfkill core will poll the
463to state changes outside of the rfkill subsystem. 102device). Don't rely on polling unless you cannot get the event in any other way.
464
465Therefore, calling rfkill_force_state() when a state change happens is
466mandatory when the device has a hardware rfkill line, or when something else
467like the firmware could cause its state to be changed without going through the
468rfkill class.
469
470Some hardware provides events when its status changes. In these cases, it is
471best for the driver to not provide a get_state hook, and instead register the
472rfkill class *already* with the correct status, and keep it updated using
473rfkill_force_state() when it gets an event from the hardware.
474
475rfkill_force_state() must be used on the device resume handlers to update the
476rfkill status, should there be any chance of the device status changing during
477the sleep.
478
479There is no provision for a statically-allocated rfkill struct. You must
480use rfkill_allocate() to allocate one.
481
482You should:
483 - rfkill_allocate()
484 - modify rfkill fields (flags, name)
485 - modify state to the current hardware state (THIS IS THE ONLY TIME
486 YOU CAN ACCESS state DIRECTLY)
487 - rfkill_register()
488
489The only way to set a device to the RFKILL_STATE_HARD_BLOCKED state is through
490a suitable return of get_state() or through rfkill_force_state().
491 103
492When a device is in the RFKILL_STATE_HARD_BLOCKED state, the only way to switch
493it to a different state is through a suitable return of get_state() or through
494rfkill_force_state().
495 104
496If toggle_radio() is called to set a device to state RFKILL_STATE_SOFT_BLOCKED
497when that device is already at the RFKILL_STATE_HARD_BLOCKED state, it should
498not return an error. Instead, it should try to double-block the transmitter,
499so that its state will change from RFKILL_STATE_HARD_BLOCKED to
500RFKILL_STATE_SOFT_BLOCKED should the hardware blocking cease.
501
502Please refer to the source for more documentation.
503
504===============================================================================
5055: Userspace support
506
507rfkill devices issue uevents (with an action of "change"), with the following
508environment variables set:
509
510RFKILL_NAME
511RFKILL_STATE
512RFKILL_TYPE
513 105
514The ABI for these variables is defined by the sysfs attributes. It is best 1065. Userspace support
515to take a quick look at the source to make sure of the possible values.
516 107
517It is expected that HAL will trap those, and bridge them to DBUS, etc. These 108The following sysfs entries exist for every rfkill device:
518events CAN and SHOULD be used to give feedback to the user about the rfkill
519status of the system.
520
521Input devices may issue events that are related to rfkill. These are the
522various KEY_* events and SW_* events supported by rfkill-input.c.
523
524******IMPORTANT******
525When rfkill-input is ACTIVE, userspace is NOT TO CHANGE THE STATE OF AN RFKILL
526SWITCH IN RESPONSE TO AN INPUT EVENT also handled by rfkill-input, unless it
527has set to true the user_claim attribute for that particular switch. This rule
528is *absolute*; do NOT violate it.
529******IMPORTANT******
530
531Userspace must not assume it is the only source of control for rfkill switches.
532Their state CAN and WILL change due to firmware actions, direct user actions,
533and the rfkill-input EPO override for *_RFKILL_ALL.
534
535When rfkill-input is not active, userspace must initiate a rfkill status
536change by writing to the "state" attribute in order for anything to happen.
537
538Take particular care to implement EV_SW SW_RFKILL_ALL properly. When that
539switch is set to OFF, *every* rfkill device *MUST* be immediately put into the
540RFKILL_STATE_SOFT_BLOCKED state, no questions asked.
541
542The following sysfs entries will be created:
543 109
544 name: Name assigned by driver to this key (interface or driver name). 110 name: Name assigned by driver to this key (interface or driver name).
545 type: Name of the key type ("wlan", "bluetooth", etc). 111 type: Name of the key type ("wlan", "bluetooth", etc).
546 state: Current state of the transmitter 112 state: Current state of the transmitter
547 0: RFKILL_STATE_SOFT_BLOCKED 113 0: RFKILL_STATE_SOFT_BLOCKED
548 transmitter is forced off, but one can override it 114 transmitter is turned off by software
549 by a write to the state attribute;
550 1: RFKILL_STATE_UNBLOCKED 115 1: RFKILL_STATE_UNBLOCKED
551 transmiter is NOT forced off, and may operate if 116 transmitter is (potentially) active
552 all other conditions for such operation are met
553 (such as interface is up and configured, etc);
554 2: RFKILL_STATE_HARD_BLOCKED 117 2: RFKILL_STATE_HARD_BLOCKED
555 transmitter is forced off by something outside of 118 transmitter is forced off by something outside of
556 the driver's control. One cannot set a device to 119 the driver's control.
557 this state through writes to the state attribute; 120 claim: 0: Kernel handles events (currently always reads that value)
558 claim: 1: Userspace handles events, 0: Kernel handles events 121
559 122rfkill devices also issue uevents (with an action of "change"), with the
560Both the "state" and "claim" entries are also writable. For the "state" entry 123following environment variables set:
561this means that when 1 or 0 is written, the device rfkill state (if not yet in 124
562the requested state), will be will be toggled accordingly. 125RFKILL_NAME
563 126RFKILL_STATE
564For the "claim" entry writing 1 to it means that the kernel no longer handles 127RFKILL_TYPE
565key events even though RFKILL_INPUT input was enabled. When "claim" has been 128
566set to 0, userspace should make sure that it listens for the input events or 129The contents of these variables correspond to the "name", "state" and
567check the sysfs "state" entry regularly to correctly perform the required tasks 130"type" sysfs files explained above.
568when the rkfill key is pressed. 131
569 132An alternative userspace interface exists as a misc device /dev/rfkill,
570A note about input devices and EV_SW events: 133which allows userspace to obtain and set the state of rfkill devices and
571 134sets of devices. It also notifies userspace about device addition and
572In order to know the current state of an input device switch (like 135removal. The API is a simple read/write API that is defined in
573SW_RFKILL_ALL), you will need to use an IOCTL. That information is not 136linux/rfkill.h.
574available through sysfs in a generic way at this time, and it is not available
575through the rfkill class AT ALL.
diff --git a/Documentation/s390/Debugging390.txt b/Documentation/s390/Debugging390.txt
index 10711d9f0788..1eb576a023bd 100644
--- a/Documentation/s390/Debugging390.txt
+++ b/Documentation/s390/Debugging390.txt
@@ -1984,7 +1984,7 @@ break *$pc
1984 1984
1985break *0x400618 1985break *0x400618
1986 1986
1987heres a really useful one for large programs 1987Here's a really useful one for large programs
1988rbr 1988rbr
1989Set a breakpoint for all functions matching REGEXP 1989Set a breakpoint for all functions matching REGEXP
1990e.g. 1990e.g.
@@ -2211,7 +2211,7 @@ Breakpoint 2 at 0x4d87a4: file top.c, line 2609.
2211#5 0x51692c in readline_internal () at readline.c:521 2211#5 0x51692c in readline_internal () at readline.c:521
2212#6 0x5164fe in readline (prompt=0x7ffff810 "\177ÿøx\177ÿ÷Ø\177ÿøxÀ") 2212#6 0x5164fe in readline (prompt=0x7ffff810 "\177ÿøx\177ÿ÷Ø\177ÿøxÀ")
2213 at readline.c:349 2213 at readline.c:349
2214#7 0x4d7a8a in command_line_input (prrompt=0x564420 "(gdb) ", repeat=1, 2214#7 0x4d7a8a in command_line_input (prompt=0x564420 "(gdb) ", repeat=1,
2215 annotation_suffix=0x4d6b44 "prompt") at top.c:2091 2215 annotation_suffix=0x4d6b44 "prompt") at top.c:2091
2216#8 0x4d6cf0 in command_loop () at top.c:1345 2216#8 0x4d6cf0 in command_loop () at top.c:1345
2217#9 0x4e25bc in main (argc=1, argv=0x7ffffdf4) at main.c:635 2217#9 0x4e25bc in main (argc=1, argv=0x7ffffdf4) at main.c:635
diff --git a/Documentation/scheduler/sched-nice-design.txt b/Documentation/scheduler/sched-nice-design.txt
index e2bae5a577e3..3ac1e46d5365 100644
--- a/Documentation/scheduler/sched-nice-design.txt
+++ b/Documentation/scheduler/sched-nice-design.txt
@@ -55,7 +55,7 @@ To sum it up: we always wanted to make nice levels more consistent, but
55within the constraints of HZ and jiffies and their nasty design level 55within the constraints of HZ and jiffies and their nasty design level
56coupling to timeslices and granularity it was not really viable. 56coupling to timeslices and granularity it was not really viable.
57 57
58The second (less frequent but still periodically occuring) complaint 58The second (less frequent but still periodically occurring) complaint
59about Linux's nice level support was its asymmetry around the origo 59about Linux's nice level support was its asymmetry around the origo
60(which you can see demonstrated in the picture above), or more 60(which you can see demonstrated in the picture above), or more
61accurately: the fact that nice level behavior depended on the _absolute_ 61accurately: the fact that nice level behavior depended on the _absolute_
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 5ba4d3fc625a..1df7f9cdab05 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -4,6 +4,7 @@
4CONTENTS 4CONTENTS
5======== 5========
6 6
70. WARNING
71. Overview 81. Overview
8 1.1 The problem 9 1.1 The problem
9 1.2 The solution 10 1.2 The solution
@@ -14,6 +15,23 @@ CONTENTS
143. Future plans 153. Future plans
15 16
16 17
180. WARNING
19==========
20
21 Fiddling with these settings can result in an unstable system, the knobs are
 22 root only and assume root knows what he is doing.
23
24Most notable:
25
26 * very small values in sched_rt_period_us can result in an unstable
27 system when the period is smaller than either the available hrtimer
28 resolution, or the time it takes to handle the budget refresh itself.
29
30 * very small values in sched_rt_runtime_us can result in an unstable
31 system when the runtime is so small the system has difficulty making
32 forward progress (NOTE: the migration thread and kstopmachine both
33 are real-time processes).
34
171. Overview 351. Overview
18=========== 36===========
19 37
@@ -169,7 +187,7 @@ get their allocated time.
169 187
170Implementing SCHED_EDF might take a while to complete. Priority Inheritance is 188Implementing SCHED_EDF might take a while to complete. Priority Inheritance is
171the biggest challenge as the current linux PI infrastructure is geared towards 189the biggest challenge as the current linux PI infrastructure is geared towards
172the limited static priority levels 0-139. With deadline scheduling you need to 190the limited static priority levels 0-99. With deadline scheduling you need to
173do deadline inheritance (since priority is inversely proportional to the 191do deadline inheritance (since priority is inversely proportional to the
174deadline delta (deadline - now)). 192deadline delta (deadline - now)).
175 193
diff --git a/Documentation/scsi/aic79xx.txt b/Documentation/scsi/aic79xx.txt
index 683ccae00ad4..c014eccaf19f 100644
--- a/Documentation/scsi/aic79xx.txt
+++ b/Documentation/scsi/aic79xx.txt
@@ -194,7 +194,7 @@ The following information is available in this file:
194 - Packetized SCSI Protocol at 160MB/s and 320MB/s 194 - Packetized SCSI Protocol at 160MB/s and 320MB/s
195 - Quick Arbitration Selection (QAS) 195 - Quick Arbitration Selection (QAS)
196 - Retained Training Information (Rev B. ASIC only) 196 - Retained Training Information (Rev B. ASIC only)
197 - Interrupt Coalessing 197 - Interrupt Coalescing
198 - Initiator Mode (target mode not currently 198 - Initiator Mode (target mode not currently
199 supported) 199 supported)
200 - Support for the PCI-X standard up to 133MHz 200 - Support for the PCI-X standard up to 133MHz
diff --git a/Documentation/scsi/ncr53c8xx.txt b/Documentation/scsi/ncr53c8xx.txt
index 230e30846ef2..08e2b4d04aab 100644
--- a/Documentation/scsi/ncr53c8xx.txt
+++ b/Documentation/scsi/ncr53c8xx.txt
@@ -206,7 +206,7 @@ of MOVE MEMORY instructions.
206The 896 and the 895A allow handling of the phase mismatch context from 206The 896 and the 895A allow handling of the phase mismatch context from
207SCRIPTS (avoids the phase mismatch interrupt that stops the SCSI processor 207SCRIPTS (avoids the phase mismatch interrupt that stops the SCSI processor
208until the C code has saved the context of the transfer). 208until the C code has saved the context of the transfer).
209Implementing this without using LOAD/STORE instructions would be painfull 209Implementing this without using LOAD/STORE instructions would be painful
210and I didn't even want to try it. 210and I didn't even want to try it.
211 211
212The 896 chip supports 64 bit PCI transactions and addressing, while the 212The 896 chip supports 64 bit PCI transactions and addressing, while the
@@ -240,7 +240,7 @@ characteristics. This feature may also reduce average command latency.
240In order to really gain advantage of this feature, devices must have 240In order to really gain advantage of this feature, devices must have
241a reasonable cache size (No miracle is to be expected for a low-end 241a reasonable cache size (No miracle is to be expected for a low-end
242hard disk with 128 KB or less). 242hard disk with 128 KB or less).
243Some kown SCSI devices do not properly support tagged command queuing. 243Some known SCSI devices do not properly support tagged command queuing.
244Generally, firmware revisions that fix this kind of problems are available 244Generally, firmware revisions that fix this kind of problems are available
245at respective vendor web/ftp sites. 245at respective vendor web/ftp sites.
246All I can say is that the hard disks I use on my machines behave well with 246All I can say is that the hard disks I use on my machines behave well with
diff --git a/Documentation/scsi/sym53c8xx_2.txt b/Documentation/scsi/sym53c8xx_2.txt
index 49ea5c58c6bc..eb9a7b905b64 100644
--- a/Documentation/scsi/sym53c8xx_2.txt
+++ b/Documentation/scsi/sym53c8xx_2.txt
@@ -206,7 +206,7 @@ characteristics. This feature may also reduce average command latency.
206In order to really gain advantage of this feature, devices must have 206In order to really gain advantage of this feature, devices must have
207a reasonable cache size (No miracle is to be expected for a low-end 207a reasonable cache size (No miracle is to be expected for a low-end
208hard disk with 128 KB or less). 208hard disk with 128 KB or less).
209Some kown old SCSI devices do not properly support tagged command queuing. 209Some known old SCSI devices do not properly support tagged command queuing.
210Generally, firmware revisions that fix this kind of problems are available 210Generally, firmware revisions that fix this kind of problems are available
211at respective vendor web/ftp sites. 211at respective vendor web/ftp sites.
212All I can say is that I never have had problem with tagged queuing using 212All I can say is that I never have had problem with tagged queuing using
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
index 012858d2b119..4252697a95d6 100644
--- a/Documentation/sound/alsa/ALSA-Configuration.txt
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -460,6 +460,25 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
460 460
461 The power-management is supported. 461 The power-management is supported.
462 462
463 Module snd-ctxfi
464 ----------------
465
466 Module for Creative Sound Blaster X-Fi boards (20k1 / 20k2 chips)
467 * Creative Sound Blaster X-Fi Titanium Fatal1ty Champion Series
468 * Creative Sound Blaster X-Fi Titanium Fatal1ty Professional Series
469 * Creative Sound Blaster X-Fi Titanium Professional Audio
470 * Creative Sound Blaster X-Fi Titanium
471 * Creative Sound Blaster X-Fi Elite Pro
472 * Creative Sound Blaster X-Fi Platinum
473 * Creative Sound Blaster X-Fi Fatal1ty
474 * Creative Sound Blaster X-Fi XtremeGamer
475 * Creative Sound Blaster X-Fi XtremeMusic
476
477 reference_rate - reference sample rate, 44100 or 48000 (default)
478 multiple - multiple to ref. sample rate, 1 or 2 (default)
479
480 This module supports multiple cards.
481
463 Module snd-darla20 482 Module snd-darla20
464 ------------------ 483 ------------------
465 484
@@ -754,7 +773,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
754 single_cmd - Use single immediate commands to communicate with 773 single_cmd - Use single immediate commands to communicate with
755 codecs (for debugging only) 774 codecs (for debugging only)
756 enable_msi - Enable Message Signaled Interrupt (MSI) (default = off) 775 enable_msi - Enable Message Signaled Interrupt (MSI) (default = off)
757 power_save - Automatic power-saving timtout (in seconds, 0 = 776 power_save - Automatic power-saving timeout (in seconds, 0 =
758 disable) 777 disable)
759 power_save_controller - Reset HD-audio controller in power-saving mode 778 power_save_controller - Reset HD-audio controller in power-saving mode
760 (default = on) 779 (default = on)
@@ -925,6 +944,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
925 * Onkyo SE-90PCI 944 * Onkyo SE-90PCI
926 * Onkyo SE-200PCI 945 * Onkyo SE-200PCI
927 * ESI Juli@ 946 * ESI Juli@
947 * ESI Maya44
928 * Hercules Fortissimo IV 948 * Hercules Fortissimo IV
929 * EGO-SYS WaveTerminal 192M 949 * EGO-SYS WaveTerminal 192M
930 950
@@ -933,7 +953,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
933 prodigy71xt, prodigy71hifi, prodigyhd2, prodigy192, 953 prodigy71xt, prodigy71hifi, prodigyhd2, prodigy192,
934 juli, aureon51, aureon71, universe, ap192, k8x800, 954 juli, aureon51, aureon71, universe, ap192, k8x800,
935 phase22, phase28, ms300, av710, se200pci, se90pci, 955 phase22, phase28, ms300, av710, se200pci, se90pci,
936 fortissimo4, sn25p, WT192M 956 fortissimo4, sn25p, WT192M, maya44
937 957
938 This module supports multiple cards and autoprobe. 958 This module supports multiple cards and autoprobe.
939 959
@@ -1093,6 +1113,13 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
1093 This module supports multiple cards. 1113 This module supports multiple cards.
1094 The driver requires the firmware loader support on kernel. 1114 The driver requires the firmware loader support on kernel.
1095 1115
1116 Module snd-lx6464es
1117 -------------------
1118
1119 Module for Digigram LX6464ES boards
1120
1121 This module supports multiple cards.
1122
1096 Module snd-maestro3 1123 Module snd-maestro3
1097 ------------------- 1124 -------------------
1098 1125
@@ -1543,13 +1570,15 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
1543 Module snd-sc6000 1570 Module snd-sc6000
1544 ----------------- 1571 -----------------
1545 1572
1546 Module for Gallant SC-6000 soundcard. 1573 Module for Gallant SC-6000 soundcard and later models: SC-6600
1574 and SC-7000.
1547 1575
1548 port - Port # (0x220 or 0x240) 1576 port - Port # (0x220 or 0x240)
1549 mss_port - MSS Port # (0x530 or 0xe80) 1577 mss_port - MSS Port # (0x530 or 0xe80)
1550 irq - IRQ # (5,7,9,10,11) 1578 irq - IRQ # (5,7,9,10,11)
1551 mpu_irq - MPU-401 IRQ # (5,7,9,10) ,0 - no MPU-401 irq 1579 mpu_irq - MPU-401 IRQ # (5,7,9,10) ,0 - no MPU-401 irq
1552 dma - DMA # (1,3,0) 1580 dma - DMA # (1,3,0)
1581 joystick - Enable gameport - 0 = disable (default), 1 = enable
1553 1582
1554 This module supports multiple cards. 1583 This module supports multiple cards.
1555 1584
@@ -1859,7 +1888,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
1859 ------------------- 1888 -------------------
1860 1889
1861 Module for sound cards based on the Asus AV100/AV200 chips, 1890 Module for sound cards based on the Asus AV100/AV200 chips,
1862 i.e., Xonar D1, DX, D2, D2X, HDAV1.3 (Deluxe), and Essence STX. 1891 i.e., Xonar D1, DX, D2, D2X, HDAV1.3 (Deluxe), Essence ST
1892 (Deluxe) and Essence STX.
1863 1893
1864 This module supports autoprobe and multiple cards. 1894 This module supports autoprobe and multiple cards.
1865 1895
diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt
index 8eec05bc079e..de8e10a94103 100644
--- a/Documentation/sound/alsa/HD-Audio-Models.txt
+++ b/Documentation/sound/alsa/HD-Audio-Models.txt
@@ -36,6 +36,7 @@ ALC260
36 acer Acer TravelMate 36 acer Acer TravelMate
37 will Will laptops (PB V7900) 37 will Will laptops (PB V7900)
38 replacer Replacer 672V 38 replacer Replacer 672V
39 favorit100 Maxdata Favorit 100XS
39 basic fixed pin assignment (old default model) 40 basic fixed pin assignment (old default model)
40 test for testing/debugging purpose, almost all controls can 41 test for testing/debugging purpose, almost all controls can
41 be adjusted. Appearing only when compiled with 42 be adjusted. Appearing only when compiled with
@@ -85,10 +86,11 @@ ALC269
85 eeepc-p703 ASUS Eeepc P703 P900A 86 eeepc-p703 ASUS Eeepc P703 P900A
86 eeepc-p901 ASUS Eeepc P901 S101 87 eeepc-p901 ASUS Eeepc P901 S101
87 fujitsu FSC Amilo 88 fujitsu FSC Amilo
89 lifebook Fujitsu Lifebook S6420
88 auto auto-config reading BIOS (default) 90 auto auto-config reading BIOS (default)
89 91
90ALC662/663 92ALC662/663/272
91========== 93==============
92 3stack-dig 3-stack (2-channel) with SPDIF 94 3stack-dig 3-stack (2-channel) with SPDIF
93 3stack-6ch 3-stack (6-channel) 95 3stack-6ch 3-stack (6-channel)
94 3stack-6ch-dig 3-stack (6-channel) with SPDIF 96 3stack-6ch-dig 3-stack (6-channel) with SPDIF
@@ -107,6 +109,9 @@ ALC662/663
107 asus-mode4 ASUS 109 asus-mode4 ASUS
108 asus-mode5 ASUS 110 asus-mode5 ASUS
109 asus-mode6 ASUS 111 asus-mode6 ASUS
112 dell Dell with ALC272
113 dell-zm1 Dell ZM1 with ALC272
114 samsung-nc10 Samsung NC10 mini notebook
110 auto auto-config reading BIOS (default) 115 auto auto-config reading BIOS (default)
111 116
112ALC882/885 117ALC882/885
@@ -118,6 +123,7 @@ ALC882/885
118 asus-a7j ASUS A7J 123 asus-a7j ASUS A7J
119 asus-a7m ASUS A7M 124 asus-a7m ASUS A7M
120 macpro MacPro support 125 macpro MacPro support
126 mb5 Macbook 5,1
121 mbp3 Macbook Pro rev3 127 mbp3 Macbook Pro rev3
122 imac24 iMac 24'' with jack detection 128 imac24 iMac 24'' with jack detection
123 w2jc ASUS W2JC 129 w2jc ASUS W2JC
@@ -133,10 +139,12 @@ ALC883/888
133 acer Acer laptops (Travelmate 3012WTMi, Aspire 5600, etc) 139 acer Acer laptops (Travelmate 3012WTMi, Aspire 5600, etc)
134 acer-aspire Acer Aspire 9810 140 acer-aspire Acer Aspire 9810
135 acer-aspire-4930g Acer Aspire 4930G 141 acer-aspire-4930g Acer Aspire 4930G
142 acer-aspire-8930g Acer Aspire 8930G
136 medion Medion Laptops 143 medion Medion Laptops
137 medion-md2 Medion MD2 144 medion-md2 Medion MD2
138 targa-dig Targa/MSI 145 targa-dig Targa/MSI
139 targa-2ch-dig Targs/MSI with 2-channel 146 targa-2ch-dig Targa/MSI with 2-channel
147 targa-8ch-dig Targa/MSI with 8-channel (MSI GX620)
140 laptop-eapd 3-jack with SPDIF I/O and EAPD (Clevo M540JE, M550JE) 148 laptop-eapd 3-jack with SPDIF I/O and EAPD (Clevo M540JE, M550JE)
141 lenovo-101e Lenovo 101E 149 lenovo-101e Lenovo 101E
142 lenovo-nb0763 Lenovo NB0763 150 lenovo-nb0763 Lenovo NB0763
@@ -150,6 +158,9 @@ ALC883/888
150 fujitsu-pi2515 Fujitsu AMILO Pi2515 158 fujitsu-pi2515 Fujitsu AMILO Pi2515
151 fujitsu-xa3530 Fujitsu AMILO XA3530 159 fujitsu-xa3530 Fujitsu AMILO XA3530
152 3stack-6ch-intel Intel DG33* boards 160 3stack-6ch-intel Intel DG33* boards
161 asus-p5q ASUS P5Q-EM boards
162 mb31 MacBook 3,1
163 sony-vaio-tt Sony VAIO TT
153 auto auto-config reading BIOS (default) 164 auto auto-config reading BIOS (default)
154 165
155ALC861/660 166ALC861/660
@@ -334,6 +345,7 @@ STAC9227/9228/9229/927x
334 ref-no-jd Reference board without HP/Mic jack detection 345 ref-no-jd Reference board without HP/Mic jack detection
335 3stack D965 3stack 346 3stack D965 3stack
336 5stack D965 5stack + SPDIF 347 5stack D965 5stack + SPDIF
348 5stack-no-fp D965 5stack without front panel
337 dell-3stack Dell Dimension E520 349 dell-3stack Dell Dimension E520
338 dell-bios Fixes with Dell BIOS setup 350 dell-bios Fixes with Dell BIOS setup
339 auto BIOS setup (default) 351 auto BIOS setup (default)
@@ -347,6 +359,7 @@ STAC92HD71B*
347 hp-m4 HP mini 1000 359 hp-m4 HP mini 1000
348 hp-dv5 HP dv series 360 hp-dv5 HP dv series
349 hp-hdx HP HDX series 361 hp-hdx HP HDX series
362 hp-dv4-1222nr HP dv4-1222nr (with LED support)
350 auto BIOS setup (default) 363 auto BIOS setup (default)
351 364
352STAC92HD73* 365STAC92HD73*
diff --git a/Documentation/sound/alsa/HD-Audio.txt b/Documentation/sound/alsa/HD-Audio.txt
index 88b7433d2f11..71ac995b1915 100644
--- a/Documentation/sound/alsa/HD-Audio.txt
+++ b/Documentation/sound/alsa/HD-Audio.txt
@@ -16,7 +16,7 @@ methods for the HD-audio hardware.
16The HD-audio component consists of two parts: the controller chip and 16The HD-audio component consists of two parts: the controller chip and
17the codec chips on the HD-audio bus. Linux provides a single driver 17the codec chips on the HD-audio bus. Linux provides a single driver
18for all controllers, snd-hda-intel. Although the driver name contains 18for all controllers, snd-hda-intel. Although the driver name contains
19a word of a well-known harware vendor, it's not specific to it but for 19a word of a well-known hardware vendor, it's not specific to it but for
20all controller chips by other companies. Since the HD-audio 20all controller chips by other companies. Since the HD-audio
21controllers are supposed to be compatible, the single snd-hda-driver 21controllers are supposed to be compatible, the single snd-hda-driver
22should work in most cases. But, not surprisingly, there are known 22should work in most cases. But, not surprisingly, there are known
diff --git a/Documentation/sound/alsa/Procfile.txt b/Documentation/sound/alsa/Procfile.txt
index bba2dbb79d81..381908d8ca42 100644
--- a/Documentation/sound/alsa/Procfile.txt
+++ b/Documentation/sound/alsa/Procfile.txt
@@ -88,21 +88,34 @@ card*/pcm*/info
88 substreams, etc. 88 substreams, etc.
89 89
90card*/pcm*/xrun_debug 90card*/pcm*/xrun_debug
91 This file appears when CONFIG_SND_DEBUG=y. 91 This file appears when CONFIG_SND_DEBUG=y and
92 This shows the status of xrun (= buffer overrun/underrun) debug of 92 CONFIG_PCM_XRUN_DEBUG=y.
93 ALSA PCM middle layer, as an integer from 0 to 2. The value 93 This shows the status of xrun (= buffer overrun/underrun) and
94 can be changed by writing to this file, such as 94 invalid PCM position debug/check of ALSA PCM middle layer.
95 95 It takes an integer value, can be changed by writing to this
96 # echo 2 > /proc/asound/card0/pcm0p/xrun_debug 96 file, such as
97 97
98 When this value is greater than 0, the driver will show the 98 # echo 5 > /proc/asound/card0/pcm0p/xrun_debug
99 messages to kernel log when an xrun is detected. The debug 99
100 message is shown also when the invalid H/W pointer is detected 100 The value consists of the following bit flags:
101 at the update of periods (usually called from the interrupt 101 bit 0 = Enable XRUN/jiffies debug messages
102 bit 1 = Show stack trace at XRUN / jiffies check
103 bit 2 = Enable additional jiffies check
104
105 When the bit 0 is set, the driver will show the messages to
106 kernel log when an xrun is detected. The debug message is
107 shown also when the invalid H/W pointer is detected at the
108 update of periods (usually called from the interrupt
102 handler). 109 handler).
103 110
104 When this value is greater than 1, the driver will show the 111 When the bit 1 is set, the driver will show the stack trace
105 stack trace additionally. This may help the debugging. 112 additionally. This may help the debugging.
113
114 Since 2.6.30, this option can enable the hwptr check using
115 jiffies. This detects spontaneous invalid pointer callback
 116 values, but can lead to too many corrections for a (mostly
117 buggy) hardware that doesn't give smooth pointer updates.
118 This feature is enabled via the bit 2.
106 119
107card*/pcm*/sub*/info 120card*/pcm*/sub*/info
108 The general information of this PCM sub-stream. 121 The general information of this PCM sub-stream.
diff --git a/Documentation/sound/alsa/README.maya44 b/Documentation/sound/alsa/README.maya44
new file mode 100644
index 000000000000..0e41576fa13e
--- /dev/null
+++ b/Documentation/sound/alsa/README.maya44
@@ -0,0 +1,163 @@
1NOTE: The following is the original document of Rainer's patch that the
2current maya44 code is based on. Some contents might be obsolete, but I
3keep it here as reference -- tiwai
4
5----------------------------------------------------------------
6
7STATE OF DEVELOPMENT:
8
9This driver is being developed on the initiative of Piotr Makowski (oponek@gmail.com) and financed by Lars Bergmann.
10Development is carried out by Rainer Zimmermann (mail@lightshed.de).
11
12ESI provided a sample Maya44 card for the development work.
13
14However, unfortunately it has turned out difficult to get detailed programming information, so I (Rainer Zimmermann) had to find out some card-specific information by experiment and conjecture. Some information (in particular, several GPIO bits) is still missing.
15
16This is the first testing version of the Maya44 driver released to the alsa-devel mailing list (Feb 5, 2008).
17
18
19The following functions work, as tested by Rainer Zimmermann and Piotr Makowski:
20
21- playback and capture at all sampling rates
22- input/output level
23- crossmixing
24- line/mic switch
25- phantom power switch
26- analogue monitor a.k.a bypass
27
28
29The following functions *should* work, but are not fully tested:
30
31- Channel 3+4 analogue - S/PDIF input switching
32- S/PDIF output
33- all inputs/outputs on the M/IO/DIO extension card
34- internal/external clock selection
35
36
37*In particular, we would appreciate testing of these functions by anyone who has access to an M/IO/DIO extension card.*
38
39
40Things that do not seem to work:
41
42- The level meters ("multi track") in 'alsamixer' do not seem to react to signals in (if this is a bug, it would probably be in the existing ICE1724 code).
43
44- Ardour 2.1 seems to work only via JACK, not using ALSA directly or via OSS. This still needs to be tracked down.
45
46
47DRIVER DETAILS:
48
49the following files were added:
50
51pci/ice1724/maya44.c - Maya44 specific code
52pci/ice1724/maya44.h
53pci/ice1724/ice1724.patch
54pci/ice1724/ice1724.h.patch - PROPOSED patch to ice1724.h (see SAMPLING RATES)
55i2c/other/wm8776.c - low-level access routines for Wolfson WM8776 codecs
56include/wm8776.h
57
58
59Note that the wm8776.c code is meant to be card-independent and does not actually register the codec with the ALSA infrastructure.
60This is done in maya44.c, mainly because some of the WM8776 controls are used in Maya44-specific ways, and should be named appropriately.
61
62
63the following files were created in pci/ice1724, simply #including the corresponding file from the alsa-kernel tree:
64
65wtm.h
66vt1720_mobo.h
67revo.h
68prodigy192.h
69pontis.h
70phase.h
71maya44.h
72juli.h
73aureon.h
74amp.h
75envy24ht.h
76se.h
77prodigy_hifi.h
78
79
80*I hope this is the correct way to do things.*
81
82
83SAMPLING RATES:
84
85The Maya44 card (or more exactly, the Wolfson WM8776 codecs) allows a maximum sampling rate of 192 kHz for playback and 96 kHz for capture.
86
87As the ICE1724 chip only allows one global sampling rate, this is handled as follows:
88
89* setting the sampling rate on any open PCM device on the maya44 card will always set the *global* sampling rate for all playback and capture channels.
90
91* In the current state of the driver, setting rates of up to 192 kHz is permitted even for capture devices.
92
93*AVOID CAPTURING AT RATES ABOVE 96kHz*, even though it may appear to work. The codec cannot actually capture at such rates, meaning poor quality.
94
95
96I propose some additional code for limiting the sampling rate when setting on a capture pcm device. However because of the global sampling rate, this logic would be somewhat problematic.
97
98The proposed code (currently deactivated) is in ice1712.h.patch, ice1724.c and maya44.c (in pci/ice1712).
99
100
101SOUND DEVICES:
102
103PCM devices correspond to inputs/outputs as follows (assuming Maya44 is card #0):
104
105hw:0,0 input - stereo, analog input 1+2
106hw:0,0 output - stereo, analog output 1+2
107hw:0,1 input - stereo, analog input 3+4 OR S/PDIF input
108hw:0,1 output - stereo, analog output 3+4 (and SPDIF out)
109
110
111NAMING OF MIXER CONTROLS:
112
113(for more information about the signal flow, please refer to the block diagram on p.24 of the ESI Maya44 manual, or in the ESI windows software).
114
115
116PCM: (digital) output level for channel 1+2
117PCM 1: same for channel 3+4
118
119Mic Phantom+48V: switch for +48V phantom power for electrostatic microphones on input 1/2.
120 Make sure this is not turned on while any other source is connected to input 1/2.
121 It might damage the source and/or the maya44 card.
122
123Mic/Line input: if switch is on, input jack 1/2 is microphone input (mono), otherwise line input (stereo).
124
125Bypass: analogue bypass from ADC input to output for channel 1+2. Same as "Monitor" in the windows driver.
126Bypass 1: same for channel 3+4.
127
128Crossmix: cross-mixer from channels 1+2 to channels 3+4
129Crossmix 1: cross-mixer from channels 3+4 to channels 1+2
130
131IEC958 Output: switch for S/PDIF output.
132 This is not supported by the ESI windows driver.
133 S/PDIF should output the same signal as channel 3+4. [untested!]
134
135
136Digital output selectors:
137
138 These switches allow a direct digital routing from the ADCs to the DACs.
139 Each switch determines where the digital input data to one of the DACs comes from.
140 They are not supported by the ESI windows driver.
141 For normal operation, they should all be set to "PCM out".
142
143H/W: Output source channel 1
144H/W 1: Output source channel 2
145H/W 2: Output source channel 3
146H/W 3: Output source channel 4
147
148H/W 4 ... H/W 9: unknown function, left in to enable testing.
149 Possibly some of these control S/PDIF output(s).
150 If these turn out to be unused, they will go away in later driver versions.
151
152Selectable values for each of the digital output selectors are:
153 "PCM out" -> DAC output of the corresponding channel (default setting)
154 "Input 1"...
155 "Input 4" -> direct routing from ADC output of the selected input channel
156
157
158--------
159
160Feb 14, 2008
161Rainer Zimmermann
162mail@lightshed.de
163
diff --git a/Documentation/sound/alsa/hda_codec.txt b/Documentation/sound/alsa/hda_codec.txt
index 34e87ec1379c..de8efbc7e4bd 100644
--- a/Documentation/sound/alsa/hda_codec.txt
+++ b/Documentation/sound/alsa/hda_codec.txt
@@ -114,7 +114,7 @@ For writing a sequence of verbs, use snd_hda_sequence_write().
114 114
115There are variants of cached read/write, snd_hda_codec_write_cache(), 115There are variants of cached read/write, snd_hda_codec_write_cache(),
116snd_hda_sequence_write_cache(). These are used for recording the 116snd_hda_sequence_write_cache(). These are used for recording the
117register states for the power-mangement resume. When no PM is needed, 117register states for the power-management resume. When no PM is needed,
118these are equivalent with non-cached version. 118these are equivalent with non-cached version.
119 119
120To retrieve the number of sub nodes connected to the given node, use 120To retrieve the number of sub nodes connected to the given node, use
diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt
index 9e6763264a2e..9ac842be9b4f 100644
--- a/Documentation/sound/alsa/soc/dapm.txt
+++ b/Documentation/sound/alsa/soc/dapm.txt
@@ -62,6 +62,7 @@ Audio DAPM widgets fall into a number of types:-
62 o Mic - Mic (and optional Jack) 62 o Mic - Mic (and optional Jack)
63 o Line - Line Input/Output (and optional Jack) 63 o Line - Line Input/Output (and optional Jack)
64 o Speaker - Speaker 64 o Speaker - Speaker
65 o Supply - Power or clock supply widget used by other widgets.
65 o Pre - Special PRE widget (exec before all others) 66 o Pre - Special PRE widget (exec before all others)
66 o Post - Special POST widget (exec after all others) 67 o Post - Special POST widget (exec after all others)
67 68
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index f11ca7979fa6..322a00bb99d9 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -32,6 +32,7 @@ show up in /proc/sys/kernel:
32- kstack_depth_to_print [ X86 only ] 32- kstack_depth_to_print [ X86 only ]
33- l2cr [ PPC only ] 33- l2cr [ PPC only ]
34- modprobe ==> Documentation/debugging-modules.txt 34- modprobe ==> Documentation/debugging-modules.txt
35- modules_disabled
35- msgmax 36- msgmax
36- msgmnb 37- msgmnb
37- msgmni 38- msgmni
@@ -184,6 +185,16 @@ kernel stack.
184 185
185============================================================== 186==============================================================
186 187
188modules_disabled:
189
190A toggle value indicating if modules are allowed to be loaded
191in an otherwise modular kernel. This toggle defaults to off
192(0), but can be set true (1). Once true, modules can be
193neither loaded nor unloaded, and the toggle cannot be set back
194to false.
195
196==============================================================
197
187osrelease, ostype & version: 198osrelease, ostype & version:
188 199
189# cat osrelease 200# cat osrelease
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index c302ddf629a0..c4de6359d440 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -233,8 +233,8 @@ These protections are added to score to judge whether this zone should be used
233for page allocation or should be reclaimed. 233for page allocation or should be reclaimed.
234 234
235In this example, if normal pages (index=2) are required to this DMA zone and 235In this example, if normal pages (index=2) are required to this DMA zone and
236pages_high is used for watermark, the kernel judges this zone should not be 236watermark[WMARK_HIGH] is used for watermark, the kernel judges this zone should
237used because pages_free(1355) is smaller than watermark + protection[2] 237not be used because pages_free(1355) is smaller than watermark + protection[2]
238(4 + 2004 = 2008). If this protection value is 0, this zone would be used for 238(4 + 2004 = 2008). If this protection value is 0, this zone would be used for
239normal page requirement. If requirement is DMA zone(index=0), protection[0] 239normal page requirement. If requirement is DMA zone(index=0), protection[0]
240(=0) is used. 240(=0) is used.
@@ -280,9 +280,10 @@ The default value is 65536.
280min_free_kbytes: 280min_free_kbytes:
281 281
282This is used to force the Linux VM to keep a minimum number 282This is used to force the Linux VM to keep a minimum number
283of kilobytes free. The VM uses this number to compute a pages_min 283of kilobytes free. The VM uses this number to compute a
284value for each lowmem zone in the system. Each lowmem zone gets 284watermark[WMARK_MIN] value for each lowmem zone in the system.
285a number of reserved free pages based proportionally on its size. 285Each lowmem zone gets a number of reserved free pages based
286proportionally on its size.
286 287
287Some minimal amount of memory is needed to satisfy PF_MEMALLOC 288Some minimal amount of memory is needed to satisfy PF_MEMALLOC
288allocations; if you set this to lower than 1024KB, your system will 289allocations; if you set this to lower than 1024KB, your system will
@@ -314,10 +315,14 @@ min_unmapped_ratio:
314 315
315This is available only on NUMA kernels. 316This is available only on NUMA kernels.
316 317
317A percentage of the total pages in each zone. Zone reclaim will only 318This is a percentage of the total pages in each zone. Zone reclaim will
318occur if more than this percentage of pages are file backed and unmapped. 319only occur if more than this percentage of pages are in a state that
319This is to insure that a minimal amount of local pages is still available for 320zone_reclaim_mode allows to be reclaimed.
320file I/O even if the node is overallocated. 321
322If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared
323against all file-backed unmapped pages including swapcache pages and tmpfs
324files. Otherwise, only unmapped pages backed by normal files but not tmpfs
325files and similar are considered.
321 326
322The default is 1 percent. 327The default is 1 percent.
323 328
@@ -358,7 +363,7 @@ nr_pdflush_threads
358The current number of pdflush threads. This value is read-only. 363The current number of pdflush threads. This value is read-only.
359The value changes according to the number of dirty pages in the system. 364The value changes according to the number of dirty pages in the system.
360 365
361When neccessary, additional pdflush threads are created, one per second, up to 366When necessary, additional pdflush threads are created, one per second, up to
362nr_pdflush_threads_max. 367nr_pdflush_threads_max.
363 368
364============================================================== 369==============================================================
@@ -565,7 +570,7 @@ swappiness
565 570
566This control is used to define how aggressive the kernel will swap 571This control is used to define how aggressive the kernel will swap
567memory pages. Higher values will increase aggressiveness, lower values 572memory pages. Higher values will increase aggressiveness, lower values
568descrease the amount of swap. 573decrease the amount of swap.
569 574
570The default value is 60. 575The default value is 60.
571 576
diff --git a/Documentation/timers/hpet.txt b/Documentation/timers/hpet.txt
index e7c09abcfab4..04763a325520 100644
--- a/Documentation/timers/hpet.txt
+++ b/Documentation/timers/hpet.txt
@@ -7,7 +7,7 @@ by Intel and Microsoft which can be found at
7 7
8Each HPET has one fixed-rate counter (at 10+ MHz, hence "High Precision") 8Each HPET has one fixed-rate counter (at 10+ MHz, hence "High Precision")
9and up to 32 comparators. Normally three or more comparators are provided, 9and up to 32 comparators. Normally three or more comparators are provided,
10each of which can generate oneshot interupts and at least one of which has 10each of which can generate oneshot interrupts and at least one of which has
11additional hardware to support periodic interrupts. The comparators are 11additional hardware to support periodic interrupts. The comparators are
12also called "timers", which can be misleading since usually timers are 12also called "timers", which can be misleading since usually timers are
13independent of each other ... these share a counter, complicating resets. 13independent of each other ... these share a counter, complicating resets.
diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt
index 20d368c59814..9bd00fc2e823 100644
--- a/Documentation/timers/timer_stats.txt
+++ b/Documentation/timers/timer_stats.txt
@@ -62,7 +62,7 @@ Timerstats sample period: 3.888770 s
62 62
63The first column is the number of events, the second column the pid, the third 63The first column is the number of events, the second column the pid, the third
64column is the name of the process. The fourth column shows the function which 64column is the name of the process. The fourth column shows the function which
65initialized the timer and in parantheses the callback function which was 65initialized the timer and in parentheses the callback function which was
66executed on expiry. 66executed on expiry.
67 67
68 Thomas, Ingo 68 Thomas, Ingo
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
new file mode 100644
index 000000000000..f157d7594ea7
--- /dev/null
+++ b/Documentation/trace/events.txt
@@ -0,0 +1,90 @@
1 Event Tracing
2
3 Documentation written by Theodore Ts'o
4 Updated by Li Zefan
5
61. Introduction
7===============
8
9Tracepoints (see Documentation/trace/tracepoints.txt) can be used
10without creating custom kernel modules to register probe functions
11using the event tracing infrastructure.
12
13Not all tracepoints can be traced using the event tracing system;
14the kernel developer must provide code snippets which define how the
15tracing information is saved into the tracing buffer, and how the
16tracing information should be printed.
17
182. Using Event Tracing
19======================
20
212.1 Via the 'set_event' interface
22---------------------------------
23
24The events which are available for tracing can be found in the file
25/debug/tracing/available_events.
26
27To enable a particular event, such as 'sched_wakeup', simply echo it
28to /debug/tracing/set_event. For example:
29
30 # echo sched_wakeup >> /debug/tracing/set_event
31
32[ Note: '>>' is necessary, otherwise it will first disable
33 all the events. ]
34
35To disable an event, echo the event name to the set_event file prefixed
36with an exclamation point:
37
38 # echo '!sched_wakeup' >> /debug/tracing/set_event
39
40To disable all events, echo an empty line to the set_event file:
41
42 # echo > /debug/tracing/set_event
43
44To enable all events, echo '*:*' or '*:' to the set_event file:
45
46 # echo *:* > /debug/tracing/set_event
47
48The events are organized into subsystems, such as ext4, irq, sched,
49etc., and a full event name looks like this: <subsystem>:<event>. The
50subsystem name is optional, but it is displayed in the available_events
51file. All of the events in a subsystem can be specified via the syntax
52"<subsystem>:*"; for example, to enable all irq events, you can use the
53command:
54
55 # echo 'irq:*' > /debug/tracing/set_event
56
572.2 Via the 'enable' toggle
58---------------------------
59
60The events available are also listed in the /debug/tracing/events/ hierarchy
61of directories.
62
63To enable event 'sched_wakeup':
64
65 # echo 1 > /debug/tracing/events/sched/sched_wakeup/enable
66
67To disable it:
68
69 # echo 0 > /debug/tracing/events/sched/sched_wakeup/enable
70
71To enable all events in sched subsystem:
72
73 # echo 1 > /debug/tracing/events/sched/enable
74
75To enable all events:
76
77 # echo 1 > /debug/tracing/events/enable
78
79When reading one of these enable files, there are four results:
80
81 0 - all events this file affects are disabled
82 1 - all events this file affects are enabled
83 X - there is a mixture of events enabled and disabled
84 ? - this file does not affect any event
85
863. Defining an event-enabled tracepoint
87=======================================
88
89See the example provided in samples/trace_events
90
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index fd9a3e693813..a39b3c749de5 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -7,7 +7,6 @@ Copyright 2008 Red Hat Inc.
7 (dual licensed under the GPL v2) 7 (dual licensed under the GPL v2)
8Reviewers: Elias Oltmanns, Randy Dunlap, Andrew Morton, 8Reviewers: Elias Oltmanns, Randy Dunlap, Andrew Morton,
9 John Kacur, and David Teigland. 9 John Kacur, and David Teigland.
10
11Written for: 2.6.28-rc2 10Written for: 2.6.28-rc2
12 11
13Introduction 12Introduction
@@ -33,13 +32,26 @@ The File System
33Ftrace uses the debugfs file system to hold the control files as 32Ftrace uses the debugfs file system to hold the control files as
34well as the files to display output. 33well as the files to display output.
35 34
36To mount the debugfs system: 35When debugfs is configured into the kernel (which selecting any ftrace
36option will do) the directory /sys/kernel/debug will be created. To mount
37this directory, you can add to your /etc/fstab file:
38
39 debugfs /sys/kernel/debug debugfs defaults 0 0
40
41Or you can mount it at run time with:
42
43 mount -t debugfs nodev /sys/kernel/debug
37 44
38 # mkdir /debug 45For quicker access to that directory you may want to make a soft link to
39 # mount -t debugfs nodev /debug 46it:
40 47
41( Note: it is more common to mount at /sys/kernel/debug, but for 48 ln -s /sys/kernel/debug /debug
42 simplicity this document will use /debug) 49
50Any selected ftrace option will also create a directory called tracing
51within the debugfs. The rest of the document will assume that you are in
52the ftrace directory (cd /sys/kernel/debug/tracing) and will only concentrate
53on the files within that directory and not distract from the content with
54the extended "/sys/kernel/debug/tracing" path name.
43 55
44That's it! (assuming that you have ftrace configured into your kernel) 56That's it! (assuming that you have ftrace configured into your kernel)
45 57
@@ -179,7 +191,7 @@ Here is the list of current tracers that may be configured.
179 191
180 Function call tracer to trace all kernel functions. 192 Function call tracer to trace all kernel functions.
181 193
182 "function_graph_tracer" 194 "function_graph"
183 195
184 Similar to the function tracer except that the 196 Similar to the function tracer except that the
185 function tracer probes the functions on their entry 197 function tracer probes the functions on their entry
@@ -389,18 +401,18 @@ trace_options
389The trace_options file is used to control what gets printed in 401The trace_options file is used to control what gets printed in
390the trace output. To see what is available, simply cat the file: 402the trace output. To see what is available, simply cat the file:
391 403
392 cat /debug/tracing/trace_options 404 cat trace_options
393 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ 405 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
394 noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj 406 noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj
395 407
396To disable one of the options, echo in the option prepended with 408To disable one of the options, echo in the option prepended with
397"no". 409"no".
398 410
399 echo noprint-parent > /debug/tracing/trace_options 411 echo noprint-parent > trace_options
400 412
401To enable an option, leave off the "no". 413To enable an option, leave off the "no".
402 414
403 echo sym-offset > /debug/tracing/trace_options 415 echo sym-offset > trace_options
404 416
405Here are the available options: 417Here are the available options:
406 418
@@ -476,11 +488,11 @@ sched_switch
476This tracer simply records schedule switches. Here is an example 488This tracer simply records schedule switches. Here is an example
477of how to use it. 489of how to use it.
478 490
479 # echo sched_switch > /debug/tracing/current_tracer 491 # echo sched_switch > current_tracer
480 # echo 1 > /debug/tracing/tracing_enabled 492 # echo 1 > tracing_enabled
481 # sleep 1 493 # sleep 1
482 # echo 0 > /debug/tracing/tracing_enabled 494 # echo 0 > tracing_enabled
483 # cat /debug/tracing/trace 495 # cat trace
484 496
485# tracer: sched_switch 497# tracer: sched_switch
486# 498#
@@ -518,9 +530,18 @@ priority with zero (0) being the highest priority and the nice
518values starting at 100 (nice -20). Below is a quick chart to map 530values starting at 100 (nice -20). Below is a quick chart to map
519the kernel priority to user land priorities. 531the kernel priority to user land priorities.
520 532
521 Kernel priority: 0 to 99 ==> user RT priority 99 to 0 533 Kernel Space User Space
522 Kernel priority: 100 to 139 ==> user nice -20 to 19 534 ===============================================================
523 Kernel priority: 140 ==> idle task priority 535 0(high) to 98(low) user RT priority 99(high) to 1(low)
536 with SCHED_RR or SCHED_FIFO
537 ---------------------------------------------------------------
538 99 sched_priority is not used in scheduling
539 decisions(it must be specified as 0)
540 ---------------------------------------------------------------
541 100(high) to 139(low) user nice -20(high) to 19(low)
542 ---------------------------------------------------------------
543 140 idle task priority
544 ---------------------------------------------------------------
524 545
525The task states are: 546The task states are:
526 547
@@ -574,13 +595,13 @@ new trace is saved.
574To reset the maximum, echo 0 into tracing_max_latency. Here is 595To reset the maximum, echo 0 into tracing_max_latency. Here is
575an example: 596an example:
576 597
577 # echo irqsoff > /debug/tracing/current_tracer 598 # echo irqsoff > current_tracer
578 # echo 0 > /debug/tracing/tracing_max_latency 599 # echo 0 > tracing_max_latency
579 # echo 1 > /debug/tracing/tracing_enabled 600 # echo 1 > tracing_enabled
580 # ls -ltr 601 # ls -ltr
581 [...] 602 [...]
582 # echo 0 > /debug/tracing/tracing_enabled 603 # echo 0 > tracing_enabled
583 # cat /debug/tracing/latency_trace 604 # cat latency_trace
584# tracer: irqsoff 605# tracer: irqsoff
585# 606#
586irqsoff latency trace v1.1.5 on 2.6.26 607irqsoff latency trace v1.1.5 on 2.6.26
@@ -681,13 +702,13 @@ Like the irqsoff tracer, it records the maximum latency for
681which preemption was disabled. The control of preemptoff tracer 702which preemption was disabled. The control of preemptoff tracer
682is much like the irqsoff tracer. 703is much like the irqsoff tracer.
683 704
684 # echo preemptoff > /debug/tracing/current_tracer 705 # echo preemptoff > current_tracer
685 # echo 0 > /debug/tracing/tracing_max_latency 706 # echo 0 > tracing_max_latency
686 # echo 1 > /debug/tracing/tracing_enabled 707 # echo 1 > tracing_enabled
687 # ls -ltr 708 # ls -ltr
688 [...] 709 [...]
689 # echo 0 > /debug/tracing/tracing_enabled 710 # echo 0 > tracing_enabled
690 # cat /debug/tracing/latency_trace 711 # cat latency_trace
691# tracer: preemptoff 712# tracer: preemptoff
692# 713#
693preemptoff latency trace v1.1.5 on 2.6.26-rc8 714preemptoff latency trace v1.1.5 on 2.6.26-rc8
@@ -828,13 +849,13 @@ tracer.
828Again, using this trace is much like the irqsoff and preemptoff 849Again, using this trace is much like the irqsoff and preemptoff
829tracers. 850tracers.
830 851
831 # echo preemptirqsoff > /debug/tracing/current_tracer 852 # echo preemptirqsoff > current_tracer
832 # echo 0 > /debug/tracing/tracing_max_latency 853 # echo 0 > tracing_max_latency
833 # echo 1 > /debug/tracing/tracing_enabled 854 # echo 1 > tracing_enabled
834 # ls -ltr 855 # ls -ltr
835 [...] 856 [...]
836 # echo 0 > /debug/tracing/tracing_enabled 857 # echo 0 > tracing_enabled
837 # cat /debug/tracing/latency_trace 858 # cat latency_trace
838# tracer: preemptirqsoff 859# tracer: preemptirqsoff
839# 860#
840preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 861preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
@@ -990,12 +1011,12 @@ slightly differently than we did with the previous tracers.
990Instead of performing an 'ls', we will run 'sleep 1' under 1011Instead of performing an 'ls', we will run 'sleep 1' under
991'chrt' which changes the priority of the task. 1012'chrt' which changes the priority of the task.
992 1013
993 # echo wakeup > /debug/tracing/current_tracer 1014 # echo wakeup > current_tracer
994 # echo 0 > /debug/tracing/tracing_max_latency 1015 # echo 0 > tracing_max_latency
995 # echo 1 > /debug/tracing/tracing_enabled 1016 # echo 1 > tracing_enabled
996 # chrt -f 5 sleep 1 1017 # chrt -f 5 sleep 1
997 # echo 0 > /debug/tracing/tracing_enabled 1018 # echo 0 > tracing_enabled
998 # cat /debug/tracing/latency_trace 1019 # cat latency_trace
999# tracer: wakeup 1020# tracer: wakeup
1000# 1021#
1001wakeup latency trace v1.1.5 on 2.6.26-rc8 1022wakeup latency trace v1.1.5 on 2.6.26-rc8
@@ -1105,11 +1126,11 @@ can be done from the debug file system. Make sure the
1105ftrace_enabled is set; otherwise this tracer is a nop. 1126ftrace_enabled is set; otherwise this tracer is a nop.
1106 1127
1107 # sysctl kernel.ftrace_enabled=1 1128 # sysctl kernel.ftrace_enabled=1
1108 # echo function > /debug/tracing/current_tracer 1129 # echo function > current_tracer
1109 # echo 1 > /debug/tracing/tracing_enabled 1130 # echo 1 > tracing_enabled
1110 # usleep 1 1131 # usleep 1
1111 # echo 0 > /debug/tracing/tracing_enabled 1132 # echo 0 > tracing_enabled
1112 # cat /debug/tracing/trace 1133 # cat trace
1113# tracer: function 1134# tracer: function
1114# 1135#
1115# TASK-PID CPU# TIMESTAMP FUNCTION 1136# TASK-PID CPU# TIMESTAMP FUNCTION
@@ -1146,7 +1167,7 @@ int trace_fd;
1146[...] 1167[...]
1147int main(int argc, char *argv[]) { 1168int main(int argc, char *argv[]) {
1148 [...] 1169 [...]
1149 trace_fd = open("/debug/tracing/tracing_enabled", O_WRONLY); 1170 trace_fd = open(tracing_file("tracing_enabled"), O_WRONLY);
1150 [...] 1171 [...]
1151 if (condition_hit()) { 1172 if (condition_hit()) {
1152 write(trace_fd, "0", 1); 1173 write(trace_fd, "0", 1);
@@ -1154,26 +1175,20 @@ int main(int argc, char *argv[]) {
1154 [...] 1175 [...]
1155} 1176}
1156 1177
1157Note: Here we hard coded the path name. The debugfs mount is not
1158guaranteed to be at /debug (and is more commonly at
1159/sys/kernel/debug). For simple one time traces, the above is
1160sufficent. For anything else, a search through /proc/mounts may
1161be needed to find where the debugfs file-system is mounted.
1162
1163 1178
1164Single thread tracing 1179Single thread tracing
1165--------------------- 1180---------------------
1166 1181
1167By writing into /debug/tracing/set_ftrace_pid you can trace a 1182By writing into set_ftrace_pid you can trace a
1168single thread. For example: 1183single thread. For example:
1169 1184
1170# cat /debug/tracing/set_ftrace_pid 1185# cat set_ftrace_pid
1171no pid 1186no pid
1172# echo 3111 > /debug/tracing/set_ftrace_pid 1187# echo 3111 > set_ftrace_pid
1173# cat /debug/tracing/set_ftrace_pid 1188# cat set_ftrace_pid
11743111 11893111
1175# echo function > /debug/tracing/current_tracer 1190# echo function > current_tracer
1176# cat /debug/tracing/trace | head 1191# cat trace | head
1177 # tracer: function 1192 # tracer: function
1178 # 1193 #
1179 # TASK-PID CPU# TIMESTAMP FUNCTION 1194 # TASK-PID CPU# TIMESTAMP FUNCTION
@@ -1184,8 +1199,8 @@ no pid
1184 yum-updatesd-3111 [003] 1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel 1199 yum-updatesd-3111 [003] 1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel
1185 yum-updatesd-3111 [003] 1637.254685: fget_light <-do_sys_poll 1200 yum-updatesd-3111 [003] 1637.254685: fget_light <-do_sys_poll
1186 yum-updatesd-3111 [003] 1637.254686: pipe_poll <-do_sys_poll 1201 yum-updatesd-3111 [003] 1637.254686: pipe_poll <-do_sys_poll
1187# echo -1 > /debug/tracing/set_ftrace_pid 1202# echo -1 > set_ftrace_pid
1188# cat /debug/tracing/trace |head 1203# cat trace |head
1189 # tracer: function 1204 # tracer: function
1190 # 1205 #
1191 # TASK-PID CPU# TIMESTAMP FUNCTION 1206 # TASK-PID CPU# TIMESTAMP FUNCTION
@@ -1207,6 +1222,51 @@ something like this simple program:
1207#include <fcntl.h> 1222#include <fcntl.h>
1208#include <unistd.h> 1223#include <unistd.h>
1209 1224
1225#define _STR(x) #x
1226#define STR(x) _STR(x)
1227#define MAX_PATH 256
1228
1229const char *find_debugfs(void)
1230{
1231 static char debugfs[MAX_PATH+1];
1232 static int debugfs_found;
1233 char type[100];
1234 FILE *fp;
1235
1236 if (debugfs_found)
1237 return debugfs;
1238
1239 if ((fp = fopen("/proc/mounts","r")) == NULL) {
1240 perror("/proc/mounts");
1241 return NULL;
1242 }
1243
1244 while (fscanf(fp, "%*s %"
1245 STR(MAX_PATH)
1246 "s %99s %*s %*d %*d\n",
1247 debugfs, type) == 2) {
1248 if (strcmp(type, "debugfs") == 0)
1249 break;
1250 }
1251 fclose(fp);
1252
1253 if (strcmp(type, "debugfs") != 0) {
1254 fprintf(stderr, "debugfs not mounted");
1255 return NULL;
1256 }
1257
1258 debugfs_found = 1;
1259
1260 return debugfs;
1261}
1262
1263const char *tracing_file(const char *file_name)
1264{
1265 static char trace_file[MAX_PATH+1];
1266 snprintf(trace_file, MAX_PATH, "%s/%s", find_debugfs(), file_name);
1267 return trace_file;
1268}
1269
1210int main (int argc, char **argv) 1270int main (int argc, char **argv)
1211{ 1271{
1212 if (argc < 1) 1272 if (argc < 1)
@@ -1217,12 +1277,12 @@ int main (int argc, char **argv)
1217 char line[64]; 1277 char line[64];
1218 int s; 1278 int s;
1219 1279
1220 ffd = open("/debug/tracing/current_tracer", O_WRONLY); 1280 ffd = open(tracing_file("current_tracer"), O_WRONLY);
1221 if (ffd < 0) 1281 if (ffd < 0)
1222 exit(-1); 1282 exit(-1);
1223 write(ffd, "nop", 3); 1283 write(ffd, "nop", 3);
1224 1284
1225 fd = open("/debug/tracing/set_ftrace_pid", O_WRONLY); 1285 fd = open(tracing_file("set_ftrace_pid"), O_WRONLY);
1226 s = sprintf(line, "%d\n", getpid()); 1286 s = sprintf(line, "%d\n", getpid());
1227 write(fd, line, s); 1287 write(fd, line, s);
1228 1288
@@ -1374,22 +1434,22 @@ want, depending on your needs.
1374 tracing_cpu_mask file) or you might sometimes see unordered 1434 tracing_cpu_mask file) or you might sometimes see unordered
1375 function calls while cpu tracing switch. 1435 function calls while cpu tracing switch.
1376 1436
1377 hide: echo nofuncgraph-cpu > /debug/tracing/trace_options 1437 hide: echo nofuncgraph-cpu > trace_options
1378 show: echo funcgraph-cpu > /debug/tracing/trace_options 1438 show: echo funcgraph-cpu > trace_options
1379 1439
1380- The duration (function's time of execution) is displayed on 1440- The duration (function's time of execution) is displayed on
1381 the closing bracket line of a function or on the same line 1441 the closing bracket line of a function or on the same line
1382 as the current function in case of a leaf one. It is default 1442 as the current function in case of a leaf one. It is default
1383 enabled. 1443 enabled.
1384 1444
1385 hide: echo nofuncgraph-duration > /debug/tracing/trace_options 1445 hide: echo nofuncgraph-duration > trace_options
1386 show: echo funcgraph-duration > /debug/tracing/trace_options 1446 show: echo funcgraph-duration > trace_options
1387 1447
1388- The overhead field precedes the duration field in case of 1448- The overhead field precedes the duration field in case of
1389 reached duration thresholds. 1449 reached duration thresholds.
1390 1450
1391 hide: echo nofuncgraph-overhead > /debug/tracing/trace_options 1451 hide: echo nofuncgraph-overhead > trace_options
1392 show: echo funcgraph-overhead > /debug/tracing/trace_options 1452 show: echo funcgraph-overhead > trace_options
1393 depends on: funcgraph-duration 1453 depends on: funcgraph-duration
1394 1454
1395 ie: 1455 ie:
@@ -1418,8 +1478,8 @@ want, depending on your needs.
1418- The task/pid field displays the thread cmdline and pid which 1478- The task/pid field displays the thread cmdline and pid which
1419 executed the function. It is default disabled. 1479 executed the function. It is default disabled.
1420 1480
1421 hide: echo nofuncgraph-proc > /debug/tracing/trace_options 1481 hide: echo nofuncgraph-proc > trace_options
1422 show: echo funcgraph-proc > /debug/tracing/trace_options 1482 show: echo funcgraph-proc > trace_options
1423 1483
1424 ie: 1484 ie:
1425 1485
@@ -1442,8 +1502,8 @@ want, depending on your needs.
1442 system clock since it started. A snapshot of this time is 1502 system clock since it started. A snapshot of this time is
1443 given on each entry/exit of functions 1503 given on each entry/exit of functions
1444 1504
1445 hide: echo nofuncgraph-abstime > /debug/tracing/trace_options 1505 hide: echo nofuncgraph-abstime > trace_options
1446 show: echo funcgraph-abstime > /debug/tracing/trace_options 1506 show: echo funcgraph-abstime > trace_options
1447 1507
1448 ie: 1508 ie:
1449 1509
@@ -1540,7 +1600,7 @@ listed in:
1540 1600
1541 available_filter_functions 1601 available_filter_functions
1542 1602
1543 # cat /debug/tracing/available_filter_functions 1603 # cat available_filter_functions
1544put_prev_task_idle 1604put_prev_task_idle
1545kmem_cache_create 1605kmem_cache_create
1546pick_next_task_rt 1606pick_next_task_rt
@@ -1552,12 +1612,12 @@ mutex_lock
1552If I am only interested in sys_nanosleep and hrtimer_interrupt: 1612If I am only interested in sys_nanosleep and hrtimer_interrupt:
1553 1613
1554 # echo sys_nanosleep hrtimer_interrupt \ 1614 # echo sys_nanosleep hrtimer_interrupt \
1555 > /debug/tracing/set_ftrace_filter 1615 > set_ftrace_filter
1556 # echo ftrace > /debug/tracing/current_tracer 1616 # echo ftrace > current_tracer
1557 # echo 1 > /debug/tracing/tracing_enabled 1617 # echo 1 > tracing_enabled
1558 # usleep 1 1618 # usleep 1
1559 # echo 0 > /debug/tracing/tracing_enabled 1619 # echo 0 > tracing_enabled
1560 # cat /debug/tracing/trace 1620 # cat trace
1561# tracer: ftrace 1621# tracer: ftrace
1562# 1622#
1563# TASK-PID CPU# TIMESTAMP FUNCTION 1623# TASK-PID CPU# TIMESTAMP FUNCTION
@@ -1568,7 +1628,7 @@ If I am only interested in sys_nanosleep and hrtimer_interrupt:
1568 1628
1569To see which functions are being traced, you can cat the file: 1629To see which functions are being traced, you can cat the file:
1570 1630
1571 # cat /debug/tracing/set_ftrace_filter 1631 # cat set_ftrace_filter
1572hrtimer_interrupt 1632hrtimer_interrupt
1573sys_nanosleep 1633sys_nanosleep
1574 1634
@@ -1588,7 +1648,7 @@ Note: It is better to use quotes to enclose the wild cards,
1588 otherwise the shell may expand the parameters into names 1648 otherwise the shell may expand the parameters into names
1589 of files in the local directory. 1649 of files in the local directory.
1590 1650
1591 # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter 1651 # echo 'hrtimer_*' > set_ftrace_filter
1592 1652
1593Produces: 1653Produces:
1594 1654
@@ -1609,7 +1669,7 @@ Produces:
1609 1669
1610Notice that we lost the sys_nanosleep. 1670Notice that we lost the sys_nanosleep.
1611 1671
1612 # cat /debug/tracing/set_ftrace_filter 1672 # cat set_ftrace_filter
1613hrtimer_run_queues 1673hrtimer_run_queues
1614hrtimer_run_pending 1674hrtimer_run_pending
1615hrtimer_init 1675hrtimer_init
@@ -1635,17 +1695,17 @@ To append to the filters, use '>>'
1635To clear out a filter so that all functions will be recorded 1695To clear out a filter so that all functions will be recorded
1636again: 1696again:
1637 1697
1638 # echo > /debug/tracing/set_ftrace_filter 1698 # echo > set_ftrace_filter
1639 # cat /debug/tracing/set_ftrace_filter 1699 # cat set_ftrace_filter
1640 # 1700 #
1641 1701
1642Again, now we want to append. 1702Again, now we want to append.
1643 1703
1644 # echo sys_nanosleep > /debug/tracing/set_ftrace_filter 1704 # echo sys_nanosleep > set_ftrace_filter
1645 # cat /debug/tracing/set_ftrace_filter 1705 # cat set_ftrace_filter
1646sys_nanosleep 1706sys_nanosleep
1647 # echo 'hrtimer_*' >> /debug/tracing/set_ftrace_filter 1707 # echo 'hrtimer_*' >> set_ftrace_filter
1648 # cat /debug/tracing/set_ftrace_filter 1708 # cat set_ftrace_filter
1649hrtimer_run_queues 1709hrtimer_run_queues
1650hrtimer_run_pending 1710hrtimer_run_pending
1651hrtimer_init 1711hrtimer_init
@@ -1668,7 +1728,7 @@ hrtimer_init_sleeper
1668The set_ftrace_notrace prevents those functions from being 1728The set_ftrace_notrace prevents those functions from being
1669traced. 1729traced.
1670 1730
1671 # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace 1731 # echo '*preempt*' '*lock*' > set_ftrace_notrace
1672 1732
1673Produces: 1733Produces:
1674 1734
@@ -1758,13 +1818,13 @@ the effect on the tracing is different. Every read from
1758trace_pipe is consumed. This means that subsequent reads will be 1818trace_pipe is consumed. This means that subsequent reads will be
1759different. The trace is live. 1819different. The trace is live.
1760 1820
1761 # echo function > /debug/tracing/current_tracer 1821 # echo function > current_tracer
1762 # cat /debug/tracing/trace_pipe > /tmp/trace.out & 1822 # cat trace_pipe > /tmp/trace.out &
1763[1] 4153 1823[1] 4153
1764 # echo 1 > /debug/tracing/tracing_enabled 1824 # echo 1 > tracing_enabled
1765 # usleep 1 1825 # usleep 1
1766 # echo 0 > /debug/tracing/tracing_enabled 1826 # echo 0 > tracing_enabled
1767 # cat /debug/tracing/trace 1827 # cat trace
1768# tracer: function 1828# tracer: function
1769# 1829#
1770# TASK-PID CPU# TIMESTAMP FUNCTION 1830# TASK-PID CPU# TIMESTAMP FUNCTION
@@ -1800,7 +1860,7 @@ number listed is the number of entries that can be recorded per
1800CPU. To know the full size, multiply the number of possible CPUS 1860CPU. To know the full size, multiply the number of possible CPUS
1801with the number of entries. 1861with the number of entries.
1802 1862
1803 # cat /debug/tracing/buffer_size_kb 1863 # cat buffer_size_kb
18041408 (units kilobytes) 18641408 (units kilobytes)
1805 1865
1806Note, to modify this, you must have tracing completely disabled. 1866Note, to modify this, you must have tracing completely disabled.
@@ -1808,21 +1868,21 @@ To do that, echo "nop" into the current_tracer. If the
1808current_tracer is not set to "nop", an EINVAL error will be 1868current_tracer is not set to "nop", an EINVAL error will be
1809returned. 1869returned.
1810 1870
1811 # echo nop > /debug/tracing/current_tracer 1871 # echo nop > current_tracer
1812 # echo 10000 > /debug/tracing/buffer_size_kb 1872 # echo 10000 > buffer_size_kb
1813 # cat /debug/tracing/buffer_size_kb 1873 # cat buffer_size_kb
181410000 (units kilobytes) 187410000 (units kilobytes)
1815 1875
1816The number of pages which will be allocated is limited to a 1876The number of pages which will be allocated is limited to a
1817percentage of available memory. Allocating too much will produce 1877percentage of available memory. Allocating too much will produce
1818an error. 1878an error.
1819 1879
1820 # echo 1000000000000 > /debug/tracing/buffer_size_kb 1880 # echo 1000000000000 > buffer_size_kb
1821-bash: echo: write error: Cannot allocate memory 1881-bash: echo: write error: Cannot allocate memory
1822 # cat /debug/tracing/buffer_size_kb 1882 # cat buffer_size_kb
182385 188385
1824 1884
1825----------- 1885-----------
1826 1886
1827More details can be found in the source code, in the 1887More details can be found in the source code, in the
1828kernel/tracing/*.c files. 1888kernel/trace/*.c files.
diff --git a/Documentation/trace/kmemtrace.txt b/Documentation/trace/kmemtrace.txt
index a956d9b7f943..6308735e58ca 100644
--- a/Documentation/trace/kmemtrace.txt
+++ b/Documentation/trace/kmemtrace.txt
@@ -64,7 +64,7 @@ III. Quick usage guide
64CONFIG_KMEMTRACE). 64CONFIG_KMEMTRACE).
65 65
662) Get the userspace tool and build it: 662) Get the userspace tool and build it:
67$ git-clone git://repo.or.cz/kmemtrace-user.git # current repository 67$ git clone git://repo.or.cz/kmemtrace-user.git # current repository
68$ cd kmemtrace-user/ 68$ cd kmemtrace-user/
69$ ./autogen.sh 69$ ./autogen.sh
70$ ./configure 70$ ./configure
diff --git a/Documentation/trace/mmiotrace.txt b/Documentation/trace/mmiotrace.txt
index 5731c67abc55..162effbfbdec 100644
--- a/Documentation/trace/mmiotrace.txt
+++ b/Documentation/trace/mmiotrace.txt
@@ -32,41 +32,41 @@ is no way to automatically detect if you are losing events due to CPUs racing.
32Usage Quick Reference 32Usage Quick Reference
33--------------------- 33---------------------
34 34
35$ mount -t debugfs debugfs /debug 35$ mount -t debugfs debugfs /sys/kernel/debug
36$ echo mmiotrace > /debug/tracing/current_tracer 36$ echo mmiotrace > /sys/kernel/debug/tracing/current_tracer
37$ cat /debug/tracing/trace_pipe > mydump.txt & 37$ cat /sys/kernel/debug/tracing/trace_pipe > mydump.txt &
38Start X or whatever. 38Start X or whatever.
39$ echo "X is up" > /debug/tracing/trace_marker 39$ echo "X is up" > /sys/kernel/debug/tracing/trace_marker
40$ echo nop > /debug/tracing/current_tracer 40$ echo nop > /sys/kernel/debug/tracing/current_tracer
41Check for lost events. 41Check for lost events.
42 42
43 43
44Usage 44Usage
45----- 45-----
46 46
47Make sure debugfs is mounted to /debug. If not, (requires root privileges) 47Make sure debugfs is mounted to /sys/kernel/debug. If not, (requires root privileges)
48$ mount -t debugfs debugfs /debug 48$ mount -t debugfs debugfs /sys/kernel/debug
49 49
50Check that the driver you are about to trace is not loaded. 50Check that the driver you are about to trace is not loaded.
51 51
52Activate mmiotrace (requires root privileges): 52Activate mmiotrace (requires root privileges):
53$ echo mmiotrace > /debug/tracing/current_tracer 53$ echo mmiotrace > /sys/kernel/debug/tracing/current_tracer
54 54
55Start storing the trace: 55Start storing the trace:
56$ cat /debug/tracing/trace_pipe > mydump.txt & 56$ cat /sys/kernel/debug/tracing/trace_pipe > mydump.txt &
57The 'cat' process should stay running (sleeping) in the background. 57The 'cat' process should stay running (sleeping) in the background.
58 58
59Load the driver you want to trace and use it. Mmiotrace will only catch MMIO 59Load the driver you want to trace and use it. Mmiotrace will only catch MMIO
60accesses to areas that are ioremapped while mmiotrace is active. 60accesses to areas that are ioremapped while mmiotrace is active.
61 61
62During tracing you can place comments (markers) into the trace by 62During tracing you can place comments (markers) into the trace by
63$ echo "X is up" > /debug/tracing/trace_marker 63$ echo "X is up" > /sys/kernel/debug/tracing/trace_marker
64This makes it easier to see which part of the (huge) trace corresponds to 64This makes it easier to see which part of the (huge) trace corresponds to
65which action. It is recommended to place descriptive markers about what you 65which action. It is recommended to place descriptive markers about what you
66do. 66do.
67 67
68Shut down mmiotrace (requires root privileges): 68Shut down mmiotrace (requires root privileges):
69$ echo nop > /debug/tracing/current_tracer 69$ echo nop > /sys/kernel/debug/tracing/current_tracer
70The 'cat' process exits. If it does not, kill it by issuing 'fg' command and 70The 'cat' process exits. If it does not, kill it by issuing 'fg' command and
71pressing ctrl+c. 71pressing ctrl+c.
72 72
@@ -78,10 +78,10 @@ to view your kernel log and look for "mmiotrace has lost events" warning. If
78events were lost, the trace is incomplete. You should enlarge the buffers and 78events were lost, the trace is incomplete. You should enlarge the buffers and
79try again. Buffers are enlarged by first seeing how large the current buffers 79try again. Buffers are enlarged by first seeing how large the current buffers
80are: 80are:
81$ cat /debug/tracing/buffer_size_kb 81$ cat /sys/kernel/debug/tracing/buffer_size_kb
82gives you a number. Approximately double this number and write it back, for 82gives you a number. Approximately double this number and write it back, for
83instance: 83instance:
84$ echo 128000 > /debug/tracing/buffer_size_kb 84$ echo 128000 > /sys/kernel/debug/tracing/buffer_size_kb
85Then start again from the top. 85Then start again from the top.
86 86
87If you are doing a trace for a driver project, e.g. Nouveau, you should also 87If you are doing a trace for a driver project, e.g. Nouveau, you should also
diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt
new file mode 100644
index 000000000000..cd805e16dc27
--- /dev/null
+++ b/Documentation/trace/power.txt
@@ -0,0 +1,17 @@
1The power tracer collects detailed information about C-state and P-state
2transitions, instead of just looking at the high-level "average"
3information.
4
5There is a helper script found in scripts/tracing/power.pl in the kernel
6sources which can be used to parse this information and create a
7Scalable Vector Graphics (SVG) picture from the trace data.
8
9To use this tracer:
10
11 echo 0 > /sys/kernel/debug/tracing/tracing_enabled
12 echo power > /sys/kernel/debug/tracing/current_tracer
13 echo 1 > /sys/kernel/debug/tracing/tracing_enabled
14 sleep 1
15 echo 0 > /sys/kernel/debug/tracing/tracing_enabled
16 cat /sys/kernel/debug/tracing/trace | \
17 perl scripts/tracing/power.pl > out.svg
diff --git a/Documentation/usb/WUSB-Design-overview.txt b/Documentation/usb/WUSB-Design-overview.txt
index 4c3d62c7843a..c480e9c32dbd 100644
--- a/Documentation/usb/WUSB-Design-overview.txt
+++ b/Documentation/usb/WUSB-Design-overview.txt
@@ -84,7 +84,7 @@ The different logical parts of this driver are:
84 84
85 *UWB*: the Ultra-Wide-Band stack -- manages the radio and 85 *UWB*: the Ultra-Wide-Band stack -- manages the radio and
86 associated spectrum to allow for devices sharing it. Allows to 86 associated spectrum to allow for devices sharing it. Allows to
87 control bandwidth assingment, beaconing, scanning, etc 87 control bandwidth assignment, beaconing, scanning, etc
88 88
89 * 89 *
90 90
@@ -184,7 +184,7 @@ and sends the replies and notifications back to the API
184[/uwb_rc_neh_grok()/]. Notifications are handled to the UWB daemon, that 184[/uwb_rc_neh_grok()/]. Notifications are handled to the UWB daemon, that
185is chartered, among other things, to keep the tab of how the UWB radio 185is chartered, among other things, to keep the tab of how the UWB radio
186neighborhood looks, creating and destroying devices as they show up or 186neighborhood looks, creating and destroying devices as they show up or
187dissapear. 187disappear.
188 188
189Command execution is very simple: a command block is sent and a event 189Command execution is very simple: a command block is sent and a event
190block or reply is expected back. For sending/receiving command/events, a 190block or reply is expected back. For sending/receiving command/events, a
@@ -333,7 +333,7 @@ read descriptors and move our data.
333 333
334*Device life cycle and keep alives* 334*Device life cycle and keep alives*
335 335
336Everytime there is a succesful transfer to/from a device, we update a 336Every time there is a successful transfer to/from a device, we update a
337per-device activity timestamp. If not, every now and then we check and 337per-device activity timestamp. If not, every now and then we check and
338if the activity timestamp gets old, we ping the device by sending it a 338if the activity timestamp gets old, we ping the device by sending it a
339Keep Alive IE; it responds with a /DN_Alive/ pong during the DNTS (this 339Keep Alive IE; it responds with a /DN_Alive/ pong during the DNTS (this
@@ -411,7 +411,7 @@ context (wa_xfer) and submit it. When the xfer is done, our callback is
411called and we assign the status bits and release the xfer resources. 411called and we assign the status bits and release the xfer resources.
412 412
413In dequeue() we are basically cancelling/aborting the transfer. We issue 413In dequeue() we are basically cancelling/aborting the transfer. We issue
414a xfer abort request to the HC, cancell all the URBs we had submitted 414a xfer abort request to the HC, cancel all the URBs we had submitted
415and not yet done and when all that is done, the xfer callback will be 415and not yet done and when all that is done, the xfer callback will be
416called--this will call the URB callback. 416called--this will call the URB callback.
417 417
diff --git a/Documentation/usb/anchors.txt b/Documentation/usb/anchors.txt
index 6f24f566955a..fe6a99a32bbd 100644
--- a/Documentation/usb/anchors.txt
+++ b/Documentation/usb/anchors.txt
@@ -27,7 +27,7 @@ Association and disassociation of URBs with anchors
27 27
28An association of URBs to an anchor is made by an explicit 28An association of URBs to an anchor is made by an explicit
29call to usb_anchor_urb(). The association is maintained until 29call to usb_anchor_urb(). The association is maintained until
30an URB is finished by (successfull) completion. Thus disassociation 30an URB is finished by (successful) completion. Thus disassociation
31is automatic. A function is provided to forcibly finish (kill) 31is automatic. A function is provided to forcibly finish (kill)
32all URBs associated with an anchor. 32all URBs associated with an anchor.
33Furthermore, disassociation can be made with usb_unanchor_urb() 33Furthermore, disassociation can be made with usb_unanchor_urb()
@@ -76,4 +76,4 @@ usb_get_from_anchor()
76Returns the oldest anchored URB of an anchor. The URB is unanchored 76Returns the oldest anchored URB of an anchor. The URB is unanchored
77and returned with a reference. As you may mix URBs to several 77and returned with a reference. As you may mix URBs to several
78destinations in one anchor you have no guarantee the chronologically 78destinations in one anchor you have no guarantee the chronologically
79first submitted URB is returned. \ No newline at end of file 79first submitted URB is returned.
diff --git a/Documentation/usb/callbacks.txt b/Documentation/usb/callbacks.txt
index 7c812411945b..bfb36b34b79e 100644
--- a/Documentation/usb/callbacks.txt
+++ b/Documentation/usb/callbacks.txt
@@ -65,7 +65,7 @@ Accept or decline an interface. If you accept the device return 0,
65otherwise -ENODEV or -ENXIO. Other error codes should be used only if a 65otherwise -ENODEV or -ENXIO. Other error codes should be used only if a
66genuine error occurred during initialisation which prevented a driver 66genuine error occurred during initialisation which prevented a driver
67from accepting a device that would else have been accepted. 67from accepting a device that would else have been accepted.
68You are strongly encouraged to use usbcore'sfacility, 68You are strongly encouraged to use usbcore's facility,
69usb_set_intfdata(), to associate a data structure with an interface, so 69usb_set_intfdata(), to associate a data structure with an interface, so
70that you know which internal state and identity you associate with a 70that you know which internal state and identity you associate with a
71particular interface. The device will not be suspended and you may do IO 71particular interface. The device will not be suspended and you may do IO
diff --git a/Documentation/video4linux/CARDLIST.cx23885 b/Documentation/video4linux/CARDLIST.cx23885
index 91aa3c0f0dd2..450b8f8c389b 100644
--- a/Documentation/video4linux/CARDLIST.cx23885
+++ b/Documentation/video4linux/CARDLIST.cx23885
@@ -16,3 +16,8 @@
16 15 -> TeVii S470 [d470:9022] 16 15 -> TeVii S470 [d470:9022]
17 16 -> DVBWorld DVB-S2 2005 [0001:2005] 17 16 -> DVBWorld DVB-S2 2005 [0001:2005]
18 17 -> NetUP Dual DVB-S2 CI [1b55:2a2c] 18 17 -> NetUP Dual DVB-S2 CI [1b55:2a2c]
19 18 -> Hauppauge WinTV-HVR1270 [0070:2211]
20 19 -> Hauppauge WinTV-HVR1275 [0070:2215]
21 20 -> Hauppauge WinTV-HVR1255 [0070:2251]
22 21 -> Hauppauge WinTV-HVR1210 [0070:2291,0070:2295]
23 22 -> Mygica X8506 DMB-TH [14f1:8651]
diff --git a/Documentation/video4linux/CARDLIST.cx88 b/Documentation/video4linux/CARDLIST.cx88
index 71e9db0b26f7..89093f531727 100644
--- a/Documentation/video4linux/CARDLIST.cx88
+++ b/Documentation/video4linux/CARDLIST.cx88
@@ -78,3 +78,5 @@
78 77 -> TBS 8910 DVB-S [8910:8888] 78 77 -> TBS 8910 DVB-S [8910:8888]
79 78 -> Prof 6200 DVB-S [b022:3022] 79 78 -> Prof 6200 DVB-S [b022:3022]
80 79 -> Terratec Cinergy HT PCI MKII [153b:1177] 80 79 -> Terratec Cinergy HT PCI MKII [153b:1177]
81 80 -> Hauppauge WinTV-IR Only [0070:9290]
82 81 -> Leadtek WinFast DTV1800 Hybrid [107d:6654]
diff --git a/Documentation/video4linux/CARDLIST.em28xx b/Documentation/video4linux/CARDLIST.em28xx
index 78d0a6eed571..a98a688c11b8 100644
--- a/Documentation/video4linux/CARDLIST.em28xx
+++ b/Documentation/video4linux/CARDLIST.em28xx
@@ -17,7 +17,7 @@
17 16 -> Hauppauge WinTV HVR 950 (em2883) [2040:6513,2040:6517,2040:651b] 17 16 -> Hauppauge WinTV HVR 950 (em2883) [2040:6513,2040:6517,2040:651b]
18 17 -> Pinnacle PCTV HD Pro Stick (em2880) [2304:0227] 18 17 -> Pinnacle PCTV HD Pro Stick (em2880) [2304:0227]
19 18 -> Hauppauge WinTV HVR 900 (R2) (em2880) [2040:6502] 19 18 -> Hauppauge WinTV HVR 900 (R2) (em2880) [2040:6502]
20 19 -> PointNix Intra-Oral Camera (em2860) 20 19 -> EM2860/SAA711X Reference Design (em2860)
21 20 -> AMD ATI TV Wonder HD 600 (em2880) [0438:b002] 21 20 -> AMD ATI TV Wonder HD 600 (em2880) [0438:b002]
22 21 -> eMPIA Technology, Inc. GrabBeeX+ Video Encoder (em2800) [eb1a:2801] 22 21 -> eMPIA Technology, Inc. GrabBeeX+ Video Encoder (em2800) [eb1a:2801]
23 22 -> Unknown EM2750/EM2751 webcam grabber (em2750) [eb1a:2750,eb1a:2751] 23 22 -> Unknown EM2750/EM2751 webcam grabber (em2750) [eb1a:2750,eb1a:2751]
@@ -61,3 +61,7 @@
61 63 -> Kaiomy TVnPC U2 (em2860) [eb1a:e303] 61 63 -> Kaiomy TVnPC U2 (em2860) [eb1a:e303]
62 64 -> Easy Cap Capture DC-60 (em2860) 62 64 -> Easy Cap Capture DC-60 (em2860)
63 65 -> IO-DATA GV-MVP/SZ (em2820/em2840) [04bb:0515] 63 65 -> IO-DATA GV-MVP/SZ (em2820/em2840) [04bb:0515]
64 66 -> Empire dual TV (em2880)
65 67 -> Terratec Grabby (em2860) [0ccd:0096]
66 68 -> Terratec AV350 (em2860) [0ccd:0084]
67 69 -> KWorld ATSC 315U HDTV TV Box (em2882) [eb1a:a313]
diff --git a/Documentation/video4linux/CARDLIST.saa7134 b/Documentation/video4linux/CARDLIST.saa7134
index 6dacf2825259..15562427e8a9 100644
--- a/Documentation/video4linux/CARDLIST.saa7134
+++ b/Documentation/video4linux/CARDLIST.saa7134
@@ -124,10 +124,10 @@
124123 -> Beholder BeholdTV 407 [0000:4070] 124123 -> Beholder BeholdTV 407 [0000:4070]
125124 -> Beholder BeholdTV 407 FM [0000:4071] 125124 -> Beholder BeholdTV 407 FM [0000:4071]
126125 -> Beholder BeholdTV 409 [0000:4090] 126125 -> Beholder BeholdTV 409 [0000:4090]
127126 -> Beholder BeholdTV 505 FM/RDS [0000:5051,0000:505B,5ace:5050] 127126 -> Beholder BeholdTV 505 FM [5ace:5050]
128127 -> Beholder BeholdTV 507 FM/RDS / BeholdTV 509 FM [0000:5071,0000:507B,5ace:5070,5ace:5090] 128127 -> Beholder BeholdTV 507 FM / BeholdTV 509 FM [5ace:5070,5ace:5090]
129128 -> Beholder BeholdTV Columbus TVFM [0000:5201] 129128 -> Beholder BeholdTV Columbus TVFM [0000:5201]
130129 -> Beholder BeholdTV 607 / BeholdTV 609 [5ace:6070,5ace:6071,5ace:6072,5ace:6073,5ace:6090,5ace:6091,5ace:6092,5ace:6093] 130129 -> Beholder BeholdTV 607 FM [5ace:6070]
131130 -> Beholder BeholdTV M6 [5ace:6190] 131130 -> Beholder BeholdTV M6 [5ace:6190]
132131 -> Twinhan Hybrid DTV-DVB 3056 PCI [1822:0022] 132131 -> Twinhan Hybrid DTV-DVB 3056 PCI [1822:0022]
133132 -> Genius TVGO AM11MCE 133132 -> Genius TVGO AM11MCE
@@ -143,7 +143,7 @@
143142 -> Beholder BeholdTV H6 [5ace:6290] 143142 -> Beholder BeholdTV H6 [5ace:6290]
144143 -> Beholder BeholdTV M63 [5ace:6191] 144143 -> Beholder BeholdTV M63 [5ace:6191]
145144 -> Beholder BeholdTV M6 Extra [5ace:6193] 145144 -> Beholder BeholdTV M6 Extra [5ace:6193]
146145 -> AVerMedia MiniPCI DVB-T Hybrid M103 [1461:f636] 146145 -> AVerMedia MiniPCI DVB-T Hybrid M103 [1461:f636,1461:f736]
147146 -> ASUSTeK P7131 Analog 147146 -> ASUSTeK P7131 Analog
148147 -> Asus Tiger 3in1 [1043:4878] 148147 -> Asus Tiger 3in1 [1043:4878]
149148 -> Encore ENLTV-FM v5.3 [1a7f:2008] 149148 -> Encore ENLTV-FM v5.3 [1a7f:2008]
@@ -154,4 +154,16 @@
154153 -> Kworld Plus TV Analog Lite PCI [17de:7128] 154153 -> Kworld Plus TV Analog Lite PCI [17de:7128]
155154 -> Avermedia AVerTV GO 007 FM Plus [1461:f31d] 155154 -> Avermedia AVerTV GO 007 FM Plus [1461:f31d]
156155 -> Hauppauge WinTV-HVR1120 ATSC/QAM-Hybrid [0070:6706,0070:6708] 156155 -> Hauppauge WinTV-HVR1120 ATSC/QAM-Hybrid [0070:6706,0070:6708]
157156 -> Hauppauge WinTV-HVR1110r3 [0070:6707,0070:6709,0070:670a] 157156 -> Hauppauge WinTV-HVR1110r3 DVB-T/Hybrid [0070:6707,0070:6709,0070:670a]
158157 -> Avermedia AVerTV Studio 507UA [1461:a11b]
159158 -> AVerMedia Cardbus TV/Radio (E501R) [1461:b7e9]
160159 -> Beholder BeholdTV 505 RDS [0000:505B]
161160 -> Beholder BeholdTV 507 RDS [0000:5071]
162161 -> Beholder BeholdTV 507 RDS [0000:507B]
163162 -> Beholder BeholdTV 607 FM [5ace:6071]
164163 -> Beholder BeholdTV 609 FM [5ace:6090]
165164 -> Beholder BeholdTV 609 FM [5ace:6091]
166165 -> Beholder BeholdTV 607 RDS [5ace:6072]
167166 -> Beholder BeholdTV 607 RDS [5ace:6073]
168167 -> Beholder BeholdTV 609 RDS [5ace:6092]
169168 -> Beholder BeholdTV 609 RDS [5ace:6093]
diff --git a/Documentation/video4linux/CARDLIST.tuner b/Documentation/video4linux/CARDLIST.tuner
index 691d2f37dc57..be67844074dd 100644
--- a/Documentation/video4linux/CARDLIST.tuner
+++ b/Documentation/video4linux/CARDLIST.tuner
@@ -76,3 +76,5 @@ tuner=75 - Philips TEA5761 FM Radio
76tuner=76 - Xceive 5000 tuner 76tuner=76 - Xceive 5000 tuner
77tuner=77 - TCL tuner MF02GIP-5N-E 77tuner=77 - TCL tuner MF02GIP-5N-E
78tuner=78 - Philips FMD1216MEX MK3 Hybrid Tuner 78tuner=78 - Philips FMD1216MEX MK3 Hybrid Tuner
79tuner=79 - Philips PAL/SECAM multi (FM1216 MK5)
80tuner=80 - Philips FQ1216LME MK3 PAL/SECAM w/active loopthrough
diff --git a/Documentation/video4linux/cx18.txt b/Documentation/video4linux/cx18.txt
index 914cb7e734a2..4652c0f5da32 100644
--- a/Documentation/video4linux/cx18.txt
+++ b/Documentation/video4linux/cx18.txt
@@ -11,7 +11,7 @@ encoder chip:
112) Some people have problems getting the i2c bus to work. 112) Some people have problems getting the i2c bus to work.
12 The symptom is that the eeprom cannot be read and the card is 12 The symptom is that the eeprom cannot be read and the card is
13 unusable. This is probably fixed, but if you have problems 13 unusable. This is probably fixed, but if you have problems
14 then post to the video4linux or ivtv-users mailinglist. 14 then post to the video4linux or ivtv-users mailing list.
15 15
163) VBI (raw or sliced) has not yet been implemented. 163) VBI (raw or sliced) has not yet been implemented.
17 17
diff --git a/Documentation/video4linux/gspca.txt b/Documentation/video4linux/gspca.txt
index 98529e03a46e..2bcf78896e22 100644
--- a/Documentation/video4linux/gspca.txt
+++ b/Documentation/video4linux/gspca.txt
@@ -163,10 +163,11 @@ sunplus 055f:c650 Mustek MDC5500Z
163zc3xx 055f:d003 Mustek WCam300A 163zc3xx 055f:d003 Mustek WCam300A
164zc3xx 055f:d004 Mustek WCam300 AN 164zc3xx 055f:d004 Mustek WCam300 AN
165conex 0572:0041 Creative Notebook cx11646 165conex 0572:0041 Creative Notebook cx11646
166ov519 05a9:0519 OmniVision 166ov519 05a9:0519 OV519 Microphone
167ov519 05a9:0530 OmniVision 167ov519 05a9:0530 OmniVision
168ov519 05a9:4519 OmniVision 168ov519 05a9:4519 Webcam Classic
169ov519 05a9:8519 OmniVision 169ov519 05a9:8519 OmniVision
170ov519 05a9:a518 D-Link DSB-C310 Webcam
170sunplus 05da:1018 Digital Dream Enigma 1.3 171sunplus 05da:1018 Digital Dream Enigma 1.3
171stk014 05e1:0893 Syntek DV4000 172stk014 05e1:0893 Syntek DV4000
172spca561 060b:a001 Maxell Compact Pc PM3 173spca561 060b:a001 Maxell Compact Pc PM3
@@ -178,6 +179,7 @@ spca506 06e1:a190 ADS Instant VCD
178ov534 06f8:3002 Hercules Blog Webcam 179ov534 06f8:3002 Hercules Blog Webcam
179ov534 06f8:3003 Hercules Dualpix HD Weblog 180ov534 06f8:3003 Hercules Dualpix HD Weblog
180sonixj 06f8:3004 Hercules Classic Silver 181sonixj 06f8:3004 Hercules Classic Silver
182sonixj 06f8:3008 Hercules Deluxe Optical Glass
181spca508 0733:0110 ViewQuest VQ110 183spca508 0733:0110 ViewQuest VQ110
182spca508 0130:0130 Clone Digital Webcam 11043 184spca508 0130:0130 Clone Digital Webcam 11043
183spca501 0733:0401 Intel Create and Share 185spca501 0733:0401 Intel Create and Share
@@ -209,6 +211,7 @@ sunplus 08ca:2050 Medion MD 41437
209sunplus 08ca:2060 Aiptek PocketDV5300 211sunplus 08ca:2060 Aiptek PocketDV5300
210tv8532 0923:010f ICM532 cams 212tv8532 0923:010f ICM532 cams
211mars 093a:050f Mars-Semi Pc-Camera 213mars 093a:050f Mars-Semi Pc-Camera
214mr97310a 093a:010f Sakar Digital no. 77379
212pac207 093a:2460 Qtec Webcam 100 215pac207 093a:2460 Qtec Webcam 100
213pac207 093a:2461 HP Webcam 216pac207 093a:2461 HP Webcam
214pac207 093a:2463 Philips SPC 220 NC 217pac207 093a:2463 Philips SPC 220 NC
@@ -265,6 +268,11 @@ sonixj 0c45:60ec SN9C105+MO4000
265sonixj 0c45:60fb Surfer NoName 268sonixj 0c45:60fb Surfer NoName
266sonixj 0c45:60fc LG-LIC300 269sonixj 0c45:60fc LG-LIC300
267sonixj 0c45:60fe Microdia Audio 270sonixj 0c45:60fe Microdia Audio
271sonixj 0c45:6100 PC Camera (SN9C128)
272sonixj 0c45:610a PC Camera (SN9C128)
273sonixj 0c45:610b PC Camera (SN9C128)
274sonixj 0c45:610c PC Camera (SN9C128)
275sonixj 0c45:610e PC Camera (SN9C128)
268sonixj 0c45:6128 Microdia/Sonix SNP325 276sonixj 0c45:6128 Microdia/Sonix SNP325
269sonixj 0c45:612a Avant Camera 277sonixj 0c45:612a Avant Camera
270sonixj 0c45:612c Typhoon Rasy Cam 1.3MPix 278sonixj 0c45:612c Typhoon Rasy Cam 1.3MPix
diff --git a/Documentation/video4linux/pxa_camera.txt b/Documentation/video4linux/pxa_camera.txt
index b1137f9a53eb..4f6d0ca01956 100644
--- a/Documentation/video4linux/pxa_camera.txt
+++ b/Documentation/video4linux/pxa_camera.txt
@@ -26,6 +26,55 @@ Global video workflow
26 26
27 Once the last buffer is filled in, the QCI interface stops. 27 Once the last buffer is filled in, the QCI interface stops.
28 28
29 c) Capture global finite state machine schema
30
31 +----+ +---+ +----+
32 | DQ | | Q | | DQ |
33 | v | v | v
34 +-----------+ +------------------------+
35 | STOP | | Wait for capture start |
36 +-----------+ Q +------------------------+
37+-> | QCI: stop | ------------------> | QCI: run | <------------+
38| | DMA: stop | | DMA: stop | |
39| +-----------+ +-----> +------------------------+ |
40| / | |
41| / +---+ +----+ | |
42|capture list empty / | Q | | DQ | | QCI Irq EOF |
43| / | v | v v |
44| +--------------------+ +----------------------+ |
45| | DMA hotlink missed | | Capture running | |
46| +--------------------+ +----------------------+ |
47| | QCI: run | +-----> | QCI: run | <-+ |
48| | DMA: stop | / | DMA: run | | |
49| +--------------------+ / +----------------------+ | Other |
50| ^ /DMA still | | channels |
51| | capture list / running | DMA Irq End | not |
52| | not empty / | | finished |
53| | / v | yet |
54| +----------------------+ +----------------------+ | |
55| | Videobuf released | | Channel completed | | |
56| +----------------------+ +----------------------+ | |
57+-- | QCI: run | | QCI: run | --+ |
58 | DMA: run | | DMA: run | |
59 +----------------------+ +----------------------+ |
60 ^ / | |
61 | no overrun / | overrun |
62 | / v |
63 +--------------------+ / +----------------------+ |
64 | Frame completed | / | Frame overran | |
65 +--------------------+ <-----+ +----------------------+ restart frame |
66 | QCI: run | | QCI: stop | --------------+
67 | DMA: run | | DMA: stop |
68 +--------------------+ +----------------------+
69
70 Legend: - each box is a FSM state
71 - each arrow is the condition to transition to another state
72 - an arrow without a comment is a mandatory transition (no condition)
73 - arrow "Q" means : a buffer was enqueued
74 - arrow "DQ" means : a buffer was dequeued
75 - "QCI: stop" means the QCI interface is not enabled
76 - "DMA: stop" means all 3 DMA channels are stopped
77 - "DMA: run" means at least 1 DMA channel is still running
29 78
30DMA usage 79DMA usage
31--------- 80---------
diff --git a/Documentation/video4linux/v4l2-framework.txt b/Documentation/video4linux/v4l2-framework.txt
index 854808b67fae..d54c1e4c6a9c 100644
--- a/Documentation/video4linux/v4l2-framework.txt
+++ b/Documentation/video4linux/v4l2-framework.txt
@@ -89,6 +89,11 @@ from dev (driver name followed by the bus_id, to be precise). If you set it
89up before calling v4l2_device_register then it will be untouched. If dev is 89up before calling v4l2_device_register then it will be untouched. If dev is
90NULL, then you *must* setup v4l2_dev->name before calling v4l2_device_register. 90NULL, then you *must* setup v4l2_dev->name before calling v4l2_device_register.
91 91
92You can use v4l2_device_set_name() to set the name based on a driver name and
93a driver-global atomic_t instance. This will generate names like ivtv0, ivtv1,
94etc. If the name ends with a digit, then it will insert a dash: cx18-0,
95cx18-1, etc. This function returns the instance number.
96
92The first 'dev' argument is normally the struct device pointer of a pci_dev, 97The first 'dev' argument is normally the struct device pointer of a pci_dev,
93usb_interface or platform_device. It is rare for dev to be NULL, but it happens 98usb_interface or platform_device. It is rare for dev to be NULL, but it happens
94with ISA devices or when one device creates multiple PCI devices, thus making 99with ISA devices or when one device creates multiple PCI devices, thus making
diff --git a/Documentation/vm/Makefile b/Documentation/vm/Makefile
index 6f562f778b28..5bd269b3731a 100644
--- a/Documentation/vm/Makefile
+++ b/Documentation/vm/Makefile
@@ -2,7 +2,7 @@
2obj- := dummy.o 2obj- := dummy.o
3 3
4# List of programs to build 4# List of programs to build
5hostprogs-y := slabinfo 5hostprogs-y := slabinfo page-types
6 6
7# Tell kbuild to always build the programs 7# Tell kbuild to always build the programs
8always := $(hostprogs-y) 8always := $(hostprogs-y)
diff --git a/Documentation/vm/balance b/Documentation/vm/balance
index bd3d31bc4915..c46e68cf9344 100644
--- a/Documentation/vm/balance
+++ b/Documentation/vm/balance
@@ -75,15 +75,15 @@ Page stealing from process memory and shm is done if stealing the page would
75alleviate memory pressure on any zone in the page's node that has fallen below 75alleviate memory pressure on any zone in the page's node that has fallen below
76its watermark. 76its watermark.
77 77
78pages_min/pages_low/pages_high/low_on_memory/zone_wake_kswapd: These are 78watermark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These
79per-zone fields, used to determine when a zone needs to be balanced. When 79are per-zone fields, used to determine when a zone needs to be balanced. When
80the number of pages falls below pages_min, the hysteric field low_on_memory 80the number of pages falls below watermark[WMARK_MIN], the hysteric field
81gets set. This stays set till the number of free pages becomes pages_high. 81low_on_memory gets set. This stays set till the number of free pages becomes
82When low_on_memory is set, page allocation requests will try to free some 82watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will
83pages in the zone (providing GFP_WAIT is set in the request). Orthogonal 83try to free some pages in the zone (providing GFP_WAIT is set in the request).
84to this, is the decision to poke kswapd to free some zone pages. That 84Orthogonal to this, is the decision to poke kswapd to free some zone pages.
85decision is not hysteresis based, and is done when the number of free 85That decision is not hysteresis based, and is done when the number of free
86pages is below pages_low; in which case zone_wake_kswapd is also set. 86pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set.
87 87
88 88
89(Good) Ideas that I have heard: 89(Good) Ideas that I have heard:
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c
new file mode 100644
index 000000000000..0833f44ba16b
--- /dev/null
+++ b/Documentation/vm/page-types.c
@@ -0,0 +1,698 @@
1/*
2 * page-types: Tool for querying page flags
3 *
4 * Copyright (C) 2009 Intel corporation
5 * Copyright (C) 2009 Wu Fengguang <fengguang.wu@intel.com>
6 */
7
8#include <stdio.h>
9#include <stdlib.h>
10#include <unistd.h>
11#include <stdint.h>
12#include <stdarg.h>
13#include <string.h>
14#include <getopt.h>
15#include <limits.h>
16#include <sys/types.h>
17#include <sys/errno.h>
18#include <sys/fcntl.h>
19
20
21/*
22 * kernel page flags
23 */
24
25#define KPF_BYTES 8
26#define PROC_KPAGEFLAGS "/proc/kpageflags"
27
28/* copied from kpageflags_read() */
29#define KPF_LOCKED 0
30#define KPF_ERROR 1
31#define KPF_REFERENCED 2
32#define KPF_UPTODATE 3
33#define KPF_DIRTY 4
34#define KPF_LRU 5
35#define KPF_ACTIVE 6
36#define KPF_SLAB 7
37#define KPF_WRITEBACK 8
38#define KPF_RECLAIM 9
39#define KPF_BUDDY 10
40
41/* [11-20] new additions in 2.6.31 */
42#define KPF_MMAP 11
43#define KPF_ANON 12
44#define KPF_SWAPCACHE 13
45#define KPF_SWAPBACKED 14
46#define KPF_COMPOUND_HEAD 15
47#define KPF_COMPOUND_TAIL 16
48#define KPF_HUGE 17
49#define KPF_UNEVICTABLE 18
50#define KPF_NOPAGE 20
51
52/* [32-] kernel hacking assistances */
53#define KPF_RESERVED 32
54#define KPF_MLOCKED 33
55#define KPF_MAPPEDTODISK 34
56#define KPF_PRIVATE 35
57#define KPF_PRIVATE_2 36
58#define KPF_OWNER_PRIVATE 37
59#define KPF_ARCH 38
60#define KPF_UNCACHED 39
61
62/* [48-] take some arbitrary free slots for expanding overloaded flags
63 * not part of kernel API
64 */
65#define KPF_READAHEAD 48
66#define KPF_SLOB_FREE 49
67#define KPF_SLUB_FROZEN 50
68#define KPF_SLUB_DEBUG 51
69
70#define KPF_ALL_BITS ((uint64_t)~0ULL)
71#define KPF_HACKERS_BITS (0xffffULL << 32)
72#define KPF_OVERLOADED_BITS (0xffffULL << 48)
73#define BIT(name) (1ULL << KPF_##name)
74#define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL))
75
76static char *page_flag_names[] = {
77 [KPF_LOCKED] = "L:locked",
78 [KPF_ERROR] = "E:error",
79 [KPF_REFERENCED] = "R:referenced",
80 [KPF_UPTODATE] = "U:uptodate",
81 [KPF_DIRTY] = "D:dirty",
82 [KPF_LRU] = "l:lru",
83 [KPF_ACTIVE] = "A:active",
84 [KPF_SLAB] = "S:slab",
85 [KPF_WRITEBACK] = "W:writeback",
86 [KPF_RECLAIM] = "I:reclaim",
87 [KPF_BUDDY] = "B:buddy",
88
89 [KPF_MMAP] = "M:mmap",
90 [KPF_ANON] = "a:anonymous",
91 [KPF_SWAPCACHE] = "s:swapcache",
92 [KPF_SWAPBACKED] = "b:swapbacked",
93 [KPF_COMPOUND_HEAD] = "H:compound_head",
94 [KPF_COMPOUND_TAIL] = "T:compound_tail",
95 [KPF_HUGE] = "G:huge",
96 [KPF_UNEVICTABLE] = "u:unevictable",
97 [KPF_NOPAGE] = "n:nopage",
98
99 [KPF_RESERVED] = "r:reserved",
100 [KPF_MLOCKED] = "m:mlocked",
101 [KPF_MAPPEDTODISK] = "d:mappedtodisk",
102 [KPF_PRIVATE] = "P:private",
103 [KPF_PRIVATE_2] = "p:private_2",
104 [KPF_OWNER_PRIVATE] = "O:owner_private",
105 [KPF_ARCH] = "h:arch",
106 [KPF_UNCACHED] = "c:uncached",
107
108 [KPF_READAHEAD] = "I:readahead",
109 [KPF_SLOB_FREE] = "P:slob_free",
110 [KPF_SLUB_FROZEN] = "A:slub_frozen",
111 [KPF_SLUB_DEBUG] = "E:slub_debug",
112};
113
114
115/*
116 * data structures
117 */
118
119static int opt_raw; /* for kernel developers */
120static int opt_list; /* list pages (in ranges) */
121static int opt_no_summary; /* don't show summary */
122static pid_t opt_pid; /* process to walk */
123
124#define MAX_ADDR_RANGES 1024
125static int nr_addr_ranges;
126static unsigned long opt_offset[MAX_ADDR_RANGES];
127static unsigned long opt_size[MAX_ADDR_RANGES];
128
129#define MAX_BIT_FILTERS 64
130static int nr_bit_filters;
131static uint64_t opt_mask[MAX_BIT_FILTERS];
132static uint64_t opt_bits[MAX_BIT_FILTERS];
133
134static int page_size;
135
136#define PAGES_BATCH (64 << 10) /* 64k pages */
137static int kpageflags_fd;
138static uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH];
139
140#define HASH_SHIFT 13
141#define HASH_SIZE (1 << HASH_SHIFT)
142#define HASH_MASK (HASH_SIZE - 1)
143#define HASH_KEY(flags) (flags & HASH_MASK)
144
145static unsigned long total_pages;
146static unsigned long nr_pages[HASH_SIZE];
147static uint64_t page_flags[HASH_SIZE];
148
149
150/*
151 * helper functions
152 */
153
154#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
155
156#define min_t(type, x, y) ({ \
157 type __min1 = (x); \
158 type __min2 = (y); \
159 __min1 < __min2 ? __min1 : __min2; })
160
161unsigned long pages2mb(unsigned long pages)
162{
163 return (pages * page_size) >> 20;
164}
165
166void fatal(const char *x, ...)
167{
168 va_list ap;
169
170 va_start(ap, x);
171 vfprintf(stderr, x, ap);
172 va_end(ap);
173 exit(EXIT_FAILURE);
174}
175
176
177/*
178 * page flag names
179 */
180
181char *page_flag_name(uint64_t flags)
182{
183 static char buf[65];
184 int present;
185 int i, j;
186
187 for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
188 present = (flags >> i) & 1;
189 if (!page_flag_names[i]) {
190 if (present)
191 fatal("unkown flag bit %d\n", i);
192 continue;
193 }
194 buf[j++] = present ? page_flag_names[i][0] : '_';
195 }
196
197 return buf;
198}
199
200char *page_flag_longname(uint64_t flags)
201{
202 static char buf[1024];
203 int i, n;
204
205 for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) {
206 if (!page_flag_names[i])
207 continue;
208 if ((flags >> i) & 1)
209 n += snprintf(buf + n, sizeof(buf) - n, "%s,",
210 page_flag_names[i] + 2);
211 }
212 if (n)
213 n--;
214 buf[n] = '\0';
215
216 return buf;
217}
218
219
220/*
221 * page list and summary
222 */
223
224void show_page_range(unsigned long offset, uint64_t flags)
225{
226 static uint64_t flags0;
227 static unsigned long index;
228 static unsigned long count;
229
230 if (flags == flags0 && offset == index + count) {
231 count++;
232 return;
233 }
234
235 if (count)
236 printf("%lu\t%lu\t%s\n",
237 index, count, page_flag_name(flags0));
238
239 flags0 = flags;
240 index = offset;
241 count = 1;
242}
243
244void show_page(unsigned long offset, uint64_t flags)
245{
246 printf("%lu\t%s\n", offset, page_flag_name(flags));
247}
248
249void show_summary(void)
250{
251 int i;
252
253 printf(" flags\tpage-count MB"
254 " symbolic-flags\t\t\tlong-symbolic-flags\n");
255
256 for (i = 0; i < ARRAY_SIZE(nr_pages); i++) {
257 if (nr_pages[i])
258 printf("0x%016llx\t%10lu %8lu %s\t%s\n",
259 (unsigned long long)page_flags[i],
260 nr_pages[i],
261 pages2mb(nr_pages[i]),
262 page_flag_name(page_flags[i]),
263 page_flag_longname(page_flags[i]));
264 }
265
266 printf(" total\t%10lu %8lu\n",
267 total_pages, pages2mb(total_pages));
268}
269
270
271/*
272 * page flag filters
273 */
274
275int bit_mask_ok(uint64_t flags)
276{
277 int i;
278
279 for (i = 0; i < nr_bit_filters; i++) {
280 if (opt_bits[i] == KPF_ALL_BITS) {
281 if ((flags & opt_mask[i]) == 0)
282 return 0;
283 } else {
284 if ((flags & opt_mask[i]) != opt_bits[i])
285 return 0;
286 }
287 }
288
289 return 1;
290}
291
292uint64_t expand_overloaded_flags(uint64_t flags)
293{
294 /* SLOB/SLUB overload several page flags */
295 if (flags & BIT(SLAB)) {
296 if (flags & BIT(PRIVATE))
297 flags ^= BIT(PRIVATE) | BIT(SLOB_FREE);
298 if (flags & BIT(ACTIVE))
299 flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN);
300 if (flags & BIT(ERROR))
301 flags ^= BIT(ERROR) | BIT(SLUB_DEBUG);
302 }
303
304 /* PG_reclaim is overloaded as PG_readahead in the read path */
305 if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM))
306 flags ^= BIT(RECLAIM) | BIT(READAHEAD);
307
308 return flags;
309}
310
311uint64_t well_known_flags(uint64_t flags)
312{
313 /* hide flags intended only for kernel hacker */
314 flags &= ~KPF_HACKERS_BITS;
315
316 /* hide non-hugeTLB compound pages */
317 if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE)))
318 flags &= ~BITS_COMPOUND;
319
320 return flags;
321}
322
323
324/*
325 * page frame walker
326 */
327
328int hash_slot(uint64_t flags)
329{
330 int k = HASH_KEY(flags);
331 int i;
332
333 /* Explicitly reserve slot 0 for flags 0: the following logic
334 * cannot distinguish an unoccupied slot from slot (flags==0).
335 */
336 if (flags == 0)
337 return 0;
338
339 /* search through the remaining (HASH_SIZE-1) slots */
340 for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) {
341 if (!k || k >= ARRAY_SIZE(page_flags))
342 k = 1;
343 if (page_flags[k] == 0) {
344 page_flags[k] = flags;
345 return k;
346 }
347 if (page_flags[k] == flags)
348 return k;
349 }
350
351 fatal("hash table full: bump up HASH_SHIFT?\n");
352 exit(EXIT_FAILURE);
353}
354
355void add_page(unsigned long offset, uint64_t flags)
356{
357 flags = expand_overloaded_flags(flags);
358
359 if (!opt_raw)
360 flags = well_known_flags(flags);
361
362 if (!bit_mask_ok(flags))
363 return;
364
365 if (opt_list == 1)
366 show_page_range(offset, flags);
367 else if (opt_list == 2)
368 show_page(offset, flags);
369
370 nr_pages[hash_slot(flags)]++;
371 total_pages++;
372}
373
374void walk_pfn(unsigned long index, unsigned long count)
375{
376 unsigned long batch;
377 unsigned long n;
378 unsigned long i;
379
380 if (index > ULONG_MAX / KPF_BYTES)
381 fatal("index overflow: %lu\n", index);
382
383 lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET);
384
385 while (count) {
386 batch = min_t(unsigned long, count, PAGES_BATCH);
387 n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES);
388 if (n == 0)
389 break;
390 if (n < 0) {
391 perror(PROC_KPAGEFLAGS);
392 exit(EXIT_FAILURE);
393 }
394
395 if (n % KPF_BYTES != 0)
396 fatal("partial read: %lu bytes\n", n);
397 n = n / KPF_BYTES;
398
399 for (i = 0; i < n; i++)
400 add_page(index + i, kpageflags_buf[i]);
401
402 index += batch;
403 count -= batch;
404 }
405}
406
407void walk_addr_ranges(void)
408{
409 int i;
410
411 kpageflags_fd = open(PROC_KPAGEFLAGS, O_RDONLY);
412 if (kpageflags_fd < 0) {
413 perror(PROC_KPAGEFLAGS);
414 exit(EXIT_FAILURE);
415 }
416
417 if (!nr_addr_ranges)
418 walk_pfn(0, ULONG_MAX);
419
420 for (i = 0; i < nr_addr_ranges; i++)
421 walk_pfn(opt_offset[i], opt_size[i]);
422
423 close(kpageflags_fd);
424}
425
426
427/*
428 * user interface
429 */
430
431const char *page_flag_type(uint64_t flag)
432{
433 if (flag & KPF_HACKERS_BITS)
434 return "(r)";
435 if (flag & KPF_OVERLOADED_BITS)
436 return "(o)";
437 return " ";
438}
439
440void usage(void)
441{
442 int i, j;
443
444 printf(
445"page-types [options]\n"
446" -r|--raw Raw mode, for kernel developers\n"
447" -a|--addr addr-spec Walk a range of pages\n"
448" -b|--bits bits-spec Walk pages with specified bits\n"
449#if 0 /* planned features */
450" -p|--pid pid Walk process address space\n"
451" -f|--file filename Walk file address space\n"
452#endif
453" -l|--list Show page details in ranges\n"
454" -L|--list-each Show page details one by one\n"
455" -N|--no-summary Don't show summay info\n"
456" -h|--help Show this usage message\n"
457"addr-spec:\n"
458" N one page at offset N (unit: pages)\n"
459" N+M pages range from N to N+M-1\n"
460" N,M pages range from N to M-1\n"
461" N, pages range from N to end\n"
462" ,M pages range from 0 to M\n"
463"bits-spec:\n"
464" bit1,bit2 (flags & (bit1|bit2)) != 0\n"
465" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n"
466" bit1,~bit2 (flags & (bit1|bit2)) == bit1\n"
467" =bit1,bit2 flags == (bit1|bit2)\n"
468"bit-names:\n"
469 );
470
471 for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
472 if (!page_flag_names[i])
473 continue;
474 printf("%16s%s", page_flag_names[i] + 2,
475 page_flag_type(1ULL << i));
476 if (++j > 3) {
477 j = 0;
478 putchar('\n');
479 }
480 }
481 printf("\n "
482 "(r) raw mode bits (o) overloaded bits\n");
483}
484
485unsigned long long parse_number(const char *str)
486{
487 unsigned long long n;
488
489 n = strtoll(str, NULL, 0);
490
491 if (n == 0 && str[0] != '0')
492 fatal("invalid name or number: %s\n", str);
493
494 return n;
495}
496
497void parse_pid(const char *str)
498{
499 opt_pid = parse_number(str);
500}
501
502void parse_file(const char *name)
503{
504}
505
506void add_addr_range(unsigned long offset, unsigned long size)
507{
508 if (nr_addr_ranges >= MAX_ADDR_RANGES)
509 fatal("too much addr ranges\n");
510
511 opt_offset[nr_addr_ranges] = offset;
512 opt_size[nr_addr_ranges] = size;
513 nr_addr_ranges++;
514}
515
516void parse_addr_range(const char *optarg)
517{
518 unsigned long offset;
519 unsigned long size;
520 char *p;
521
522 p = strchr(optarg, ',');
523 if (!p)
524 p = strchr(optarg, '+');
525
526 if (p == optarg) {
527 offset = 0;
528 size = parse_number(p + 1);
529 } else if (p) {
530 offset = parse_number(optarg);
531 if (p[1] == '\0')
532 size = ULONG_MAX;
533 else {
534 size = parse_number(p + 1);
535 if (*p == ',') {
536 if (size < offset)
537 fatal("invalid range: %lu,%lu\n",
538 offset, size);
539 size -= offset;
540 }
541 }
542 } else {
543 offset = parse_number(optarg);
544 size = 1;
545 }
546
547 add_addr_range(offset, size);
548}
549
550void add_bits_filter(uint64_t mask, uint64_t bits)
551{
552 if (nr_bit_filters >= MAX_BIT_FILTERS)
553 fatal("too much bit filters\n");
554
555 opt_mask[nr_bit_filters] = mask;
556 opt_bits[nr_bit_filters] = bits;
557 nr_bit_filters++;
558}
559
560uint64_t parse_flag_name(const char *str, int len)
561{
562 int i;
563
564 if (!*str || !len)
565 return 0;
566
567 if (len <= 8 && !strncmp(str, "compound", len))
568 return BITS_COMPOUND;
569
570 for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) {
571 if (!page_flag_names[i])
572 continue;
573 if (!strncmp(str, page_flag_names[i] + 2, len))
574 return 1ULL << i;
575 }
576
577 return parse_number(str);
578}
579
580uint64_t parse_flag_names(const char *str, int all)
581{
582 const char *p = str;
583 uint64_t flags = 0;
584
585 while (1) {
586 if (*p == ',' || *p == '=' || *p == '\0') {
587 if ((*str != '~') || (*str == '~' && all && *++str))
588 flags |= parse_flag_name(str, p - str);
589 if (*p != ',')
590 break;
591 str = p + 1;
592 }
593 p++;
594 }
595
596 return flags;
597}
598
599void parse_bits_mask(const char *optarg)
600{
601 uint64_t mask;
602 uint64_t bits;
603 const char *p;
604
605 p = strchr(optarg, '=');
606 if (p == optarg) {
607 mask = KPF_ALL_BITS;
608 bits = parse_flag_names(p + 1, 0);
609 } else if (p) {
610 mask = parse_flag_names(optarg, 0);
611 bits = parse_flag_names(p + 1, 0);
612 } else if (strchr(optarg, '~')) {
613 mask = parse_flag_names(optarg, 1);
614 bits = parse_flag_names(optarg, 0);
615 } else {
616 mask = parse_flag_names(optarg, 0);
617 bits = KPF_ALL_BITS;
618 }
619
620 add_bits_filter(mask, bits);
621}
622
623
624struct option opts[] = {
625 { "raw" , 0, NULL, 'r' },
626 { "pid" , 1, NULL, 'p' },
627 { "file" , 1, NULL, 'f' },
628 { "addr" , 1, NULL, 'a' },
629 { "bits" , 1, NULL, 'b' },
630 { "list" , 0, NULL, 'l' },
631 { "list-each" , 0, NULL, 'L' },
632 { "no-summary", 0, NULL, 'N' },
633 { "help" , 0, NULL, 'h' },
634 { NULL , 0, NULL, 0 }
635};
636
637int main(int argc, char *argv[])
638{
639 int c;
640
641 page_size = getpagesize();
642
643 while ((c = getopt_long(argc, argv,
644 "rp:f:a:b:lLNh", opts, NULL)) != -1) {
645 switch (c) {
646 case 'r':
647 opt_raw = 1;
648 break;
649 case 'p':
650 parse_pid(optarg);
651 break;
652 case 'f':
653 parse_file(optarg);
654 break;
655 case 'a':
656 parse_addr_range(optarg);
657 break;
658 case 'b':
659 parse_bits_mask(optarg);
660 break;
661 case 'l':
662 opt_list = 1;
663 break;
664 case 'L':
665 opt_list = 2;
666 break;
667 case 'N':
668 opt_no_summary = 1;
669 break;
670 case 'h':
671 usage();
672 exit(0);
673 default:
674 usage();
675 exit(1);
676 }
677 }
678
679 if (opt_list == 1)
680 printf("offset\tcount\tflags\n");
681 if (opt_list == 2)
682 printf("offset\tflags\n");
683
684 walk_addr_ranges();
685
686 if (opt_list == 1)
687 show_page_range(0, 0); /* drain the buffer */
688
689 if (opt_no_summary)
690 return 0;
691
692 if (opt_list)
693 printf("\n\n");
694
695 show_summary();
696
697 return 0;
698}
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index ce72c0fe6177..600a304a828c 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -12,9 +12,9 @@ There are three components to pagemap:
12 value for each virtual page, containing the following data (from 12 value for each virtual page, containing the following data (from
13 fs/proc/task_mmu.c, above pagemap_read): 13 fs/proc/task_mmu.c, above pagemap_read):
14 14
15 * Bits 0-55 page frame number (PFN) if present 15 * Bits 0-54 page frame number (PFN) if present
16 * Bits 0-4 swap type if swapped 16 * Bits 0-4 swap type if swapped
17 * Bits 5-55 swap offset if swapped 17 * Bits 5-54 swap offset if swapped
18 * Bits 55-60 page shift (page size = 1<<page shift) 18 * Bits 55-60 page shift (page size = 1<<page shift)
19 * Bit 61 reserved for future use 19 * Bit 61 reserved for future use
20 * Bit 62 page swapped 20 * Bit 62 page swapped
@@ -36,7 +36,7 @@ There are three components to pagemap:
36 * /proc/kpageflags. This file contains a 64-bit set of flags for each 36 * /proc/kpageflags. This file contains a 64-bit set of flags for each
37 page, indexed by PFN. 37 page, indexed by PFN.
38 38
39 The flags are (from fs/proc/proc_misc, above kpageflags_read): 39 The flags are (from fs/proc/page.c, above kpageflags_read):
40 40
41 0. LOCKED 41 0. LOCKED
42 1. ERROR 42 1. ERROR
@@ -49,6 +49,68 @@ There are three components to pagemap:
49 8. WRITEBACK 49 8. WRITEBACK
50 9. RECLAIM 50 9. RECLAIM
51 10. BUDDY 51 10. BUDDY
52 11. MMAP
53 12. ANON
54 13. SWAPCACHE
55 14. SWAPBACKED
56 15. COMPOUND_HEAD
57 16. COMPOUND_TAIL
58 17. HUGE
59 18. UNEVICTABLE
60 20. NOPAGE
61
62Short descriptions to the page flags:
63
64 0. LOCKED
65 page is being locked for exclusive access, eg. by undergoing read/write IO
66
67 7. SLAB
68 page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator
69 When compound page is used, SLUB/SLQB will only set this flag on the head
70 page; SLOB will not flag it at all.
71
7210. BUDDY
73 a free memory block managed by the buddy system allocator
74 The buddy system organizes free memory in blocks of various orders.
75 An order N block has 2^N physically contiguous pages, with the BUDDY flag
76 set for and _only_ for the first page.
77
7815. COMPOUND_HEAD
7916. COMPOUND_TAIL
80 A compound page with order N consists of 2^N physically contiguous pages.
81 A compound page with order 2 takes the form of "HTTT", where H denotes its
82 head page and T denotes its tail page(s). The major consumers of compound
83 pages are hugeTLB pages (Documentation/vm/hugetlbpage.txt), the SLUB etc.
84 memory allocators and various device drivers. However in this interface,
85 only huge/giga pages are made visible to end users.
8617. HUGE
87 this is an integral part of a HugeTLB page
88
8920. NOPAGE
90 no page frame exists at the requested address
91
92 [IO related page flags]
93 1. ERROR IO error occurred
94 3. UPTODATE page has up-to-date data
95 ie. for file backed page: (in-memory data revision >= on-disk one)
96 4. DIRTY page has been written to, hence contains new data
97 ie. for file backed page: (in-memory data revision > on-disk one)
98 8. WRITEBACK page is being synced to disk
99
100 [LRU related page flags]
101 5. LRU page is in one of the LRU lists
102 6. ACTIVE page is in the active LRU list
10318. UNEVICTABLE page is in the unevictable (non-)LRU list
104 It is somehow pinned and not a candidate for LRU page reclaims,
105 eg. ramfs pages, shmctl(SHM_LOCK) and mlock() memory segments
106 2. REFERENCED page has been referenced since last LRU list enqueue/requeue
107 9. RECLAIM page will be reclaimed soon after its pageout IO completed
10811. MMAP a memory mapped page
10912. ANON a memory mapped page that is not part of a file
11013. SWAPCACHE page is mapped to swap space, ie. has an associated swap entry
11114. SWAPBACKED page is backed by swap/RAM
112
113The page-types tool in this directory can be used to query the above flags.
52 114
53Using pagemap to do something useful: 115Using pagemap to do something useful:
54 116
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index e0203662f9e9..8da3a795083f 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -50,6 +50,10 @@ Protocol 2.08: (Kernel 2.6.26) Added crc32 checksum and ELF format
50Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical 50Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical
51 pointer to single linked list of struct setup_data. 51 pointer to single linked list of struct setup_data.
52 52
53Protocol 2.10: (Kernel 2.6.31) Added a protocol for relaxed alignment
54 beyond the kernel_alignment; added new init_size and
55 pref_address fields. Added extended boot loader IDs.
56
53**** MEMORY LAYOUT 57**** MEMORY LAYOUT
54 58
55The traditional memory map for the kernel loader, used for Image or 59The traditional memory map for the kernel loader, used for Image or
@@ -168,12 +172,13 @@ Offset Proto Name Meaning
168021C/4 2.00+ ramdisk_size initrd size (set by boot loader) 172021C/4 2.00+ ramdisk_size initrd size (set by boot loader)
1690220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only 1730220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only
1700224/2 2.01+ heap_end_ptr Free memory after setup end 1740224/2 2.01+ heap_end_ptr Free memory after setup end
1710226/2 N/A pad1 Unused 1750226/1 2.02+(3 ext_loader_ver Extended boot loader version
1760227/1 2.02+(3 ext_loader_type Extended boot loader ID
1720228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line 1770228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line
173022C/4 2.03+ ramdisk_max Highest legal initrd address 178022C/4 2.03+ ramdisk_max Highest legal initrd address
1740230/4 2.05+ kernel_alignment Physical addr alignment required for kernel 1790230/4 2.05+ kernel_alignment Physical addr alignment required for kernel
1750234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not 1800234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not
1760235/1 N/A pad2 Unused 1810235/1 2.10+ min_alignment Minimum alignment, as a power of two
1770236/2 N/A pad3 Unused 1820236/2 N/A pad3 Unused
1780238/4 2.06+ cmdline_size Maximum size of the kernel command line 1830238/4 2.06+ cmdline_size Maximum size of the kernel command line
179023C/4 2.07+ hardware_subarch Hardware subarchitecture 184023C/4 2.07+ hardware_subarch Hardware subarchitecture
@@ -182,6 +187,8 @@ Offset Proto Name Meaning
182024C/4 2.08+ payload_length Length of kernel payload 187024C/4 2.08+ payload_length Length of kernel payload
1830250/8 2.09+ setup_data 64-bit physical pointer to linked list 1880250/8 2.09+ setup_data 64-bit physical pointer to linked list
184 of struct setup_data 189 of struct setup_data
1900258/8 2.10+ pref_address Preferred loading address
1910260/4 2.10+ init_size Linear memory required during initialization
185 192
186(1) For backwards compatibility, if the setup_sects field contains 0, the 193(1) For backwards compatibility, if the setup_sects field contains 0, the
187 real value is 4. 194 real value is 4.
@@ -190,6 +197,8 @@ Offset Proto Name Meaning
190 field are unusable, which means the size of a bzImage kernel 197 field are unusable, which means the size of a bzImage kernel
191 cannot be determined. 198 cannot be determined.
192 199
200(3) Ignored, but safe to set, for boot protocols 2.02-2.09.
201
193If the "HdrS" (0x53726448) magic number is not found at offset 0x202, 202If the "HdrS" (0x53726448) magic number is not found at offset 0x202,
194the boot protocol version is "old". Loading an old kernel, the 203the boot protocol version is "old". Loading an old kernel, the
195following parameters should be assumed: 204following parameters should be assumed:
@@ -343,18 +352,32 @@ Protocol: 2.00+
343 0xTV here, where T is an identifier for the boot loader and V is 352 0xTV here, where T is an identifier for the boot loader and V is
344 a version number. Otherwise, enter 0xFF here. 353 a version number. Otherwise, enter 0xFF here.
345 354
355 For boot loader IDs above T = 0xD, write T = 0xE to this field and
356 write the extended ID minus 0x10 to the ext_loader_type field.
357 Similarly, the ext_loader_ver field can be used to provide more than
358 four bits for the bootloader version.
359
360 For example, for T = 0x15, V = 0x234, write:
361
362 type_of_loader <- 0xE4
363 ext_loader_type <- 0x05
364 ext_loader_ver <- 0x23
365
346 Assigned boot loader ids: 366 Assigned boot loader ids:
347 0 LILO (0x00 reserved for pre-2.00 bootloader) 367 0 LILO (0x00 reserved for pre-2.00 bootloader)
348 1 Loadlin 368 1 Loadlin
349 2 bootsect-loader (0x20, all other values reserved) 369 2 bootsect-loader (0x20, all other values reserved)
350 3 SYSLINUX 370 3 Syslinux
351 4 EtherBoot 371 4 Etherboot/gPXE
352 5 ELILO 372 5 ELILO
353 7 GRUB 373 7 GRUB
354 8 U-BOOT 374 8 U-Boot
355 9 Xen 375 9 Xen
356 A Gujin 376 A Gujin
357 B Qemu 377 B Qemu
378 C Arcturus Networks uCbootloader
379 E Extended (see ext_loader_type)
380 F Special (0xFF = undefined)
358 381
359 Please contact <hpa@zytor.com> if you need a bootloader ID 382 Please contact <hpa@zytor.com> if you need a bootloader ID
360 value assigned. 383 value assigned.
@@ -453,6 +476,35 @@ Protocol: 2.01+
453 Set this field to the offset (from the beginning of the real-mode 476 Set this field to the offset (from the beginning of the real-mode
454 code) of the end of the setup stack/heap, minus 0x0200. 477 code) of the end of the setup stack/heap, minus 0x0200.
455 478
479Field name: ext_loader_ver
480Type: write (optional)
481Offset/size: 0x226/1
482Protocol: 2.02+
483
484 This field is used as an extension of the version number in the
485 type_of_loader field. The total version number is considered to be
486 (type_of_loader & 0x0f) + (ext_loader_ver << 4).
487
488 The use of this field is boot loader specific. If not written, it
489 is zero.
490
491 Kernels prior to 2.6.31 did not recognize this field, but it is safe
492 to write for protocol version 2.02 or higher.
493
494Field name: ext_loader_type
495Type: write (obligatory if (type_of_loader & 0xf0) == 0xe0)
496Offset/size: 0x227/1
497Protocol: 2.02+
498
499 This field is used as an extension of the type number in
500 type_of_loader field. If the type in type_of_loader is 0xE, then
501 the actual type is (ext_loader_type + 0x10).
502
503 This field is ignored if the type in type_of_loader is not 0xE.
504
505 Kernels prior to 2.6.31 did not recognize this field, but it is safe
506 to write for protocol version 2.02 or higher.
507
456Field name: cmd_line_ptr 508Field name: cmd_line_ptr
457Type: write (obligatory) 509Type: write (obligatory)
458Offset/size: 0x228/4 510Offset/size: 0x228/4
@@ -482,11 +534,19 @@ Protocol: 2.03+
482 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.) 534 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.)
483 535
484Field name: kernel_alignment 536Field name: kernel_alignment
485Type: read (reloc) 537Type: read/modify (reloc)
486Offset/size: 0x230/4 538Offset/size: 0x230/4
487Protocol: 2.05+ 539Protocol: 2.05+ (read), 2.10+ (modify)
540
541 Alignment unit required by the kernel (if relocatable_kernel is
542 true.) A relocatable kernel that is loaded at an alignment
543 incompatible with the value in this field will be realigned during
544 kernel initialization.
488 545
489 Alignment unit required by the kernel (if relocatable_kernel is true.) 546 Starting with protocol version 2.10, this reflects the kernel
547 alignment preferred for optimal performance; it is possible for the
548 loader to modify this field to permit a lesser alignment. See the
549 min_alignment and pref_address fields below.
490 550
491Field name: relocatable_kernel 551Field name: relocatable_kernel
492Type: read (reloc) 552Type: read (reloc)
@@ -498,6 +558,22 @@ Protocol: 2.05+
498 After loading, the boot loader must set the code32_start field to 558 After loading, the boot loader must set the code32_start field to
499 point to the loaded code, or to a boot loader hook. 559 point to the loaded code, or to a boot loader hook.
500 560
561Field name: min_alignment
562Type: read (reloc)
563Offset/size: 0x235/1
564Protocol: 2.10+
565
566 This field, if nonzero, indicates as a power of two the minimum
567 alignment required, as opposed to preferred, by the kernel to boot.
568 If a boot loader makes use of this field, it should update the
569 kernel_alignment field with the alignment unit desired; typically:
570
571 kernel_alignment = 1 << min_alignment
572
573 There may be a considerable performance cost with an excessively
574 misaligned kernel. Therefore, a loader should typically try each
575 power-of-two alignment from kernel_alignment down to this alignment.
576
501Field name: cmdline_size 577Field name: cmdline_size
502Type: read 578Type: read
503Offset/size: 0x238/4 579Offset/size: 0x238/4
@@ -582,6 +658,36 @@ Protocol: 2.09+
582 sure to consider the case where the linked list already contains 658 sure to consider the case where the linked list already contains
583 entries. 659 entries.
584 660
661Field name: pref_address
662Type: read (reloc)
663Offset/size: 0x258/8
664Protocol: 2.10+
665
666 This field, if nonzero, represents a preferred load address for the
667 kernel. A relocating bootloader should attempt to load at this
668 address if possible.
669
670 A non-relocatable kernel will unconditionally move itself to and run
671 at this address.
672
673Field name: init_size
674Type: read
675 Offset/size: 0x260/4
676
677 This field indicates the amount of linear contiguous memory starting
678 at the kernel runtime start address that the kernel needs before it
679 is capable of examining its memory map. This is not the same thing
680 as the total amount of memory the kernel needs to boot, but it can
681 be used by a relocating boot loader to help select a safe load
682 address for the kernel.
683
684 The kernel runtime start address is determined by the following algorithm:
685
686 if (relocatable_kernel)
687 runtime_start = align_up(load_address, kernel_alignment)
688 else
689 runtime_start = pref_address
690
585 691
586**** THE IMAGE CHECKSUM 692**** THE IMAGE CHECKSUM
587 693
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 34c13040a718..29a6ff8bc7d3 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -5,21 +5,51 @@ only the AMD64 specific ones are listed here.
5 5
6Machine check 6Machine check
7 7
8 mce=off disable machine check 8 Please see Documentation/x86/x86_64/machinecheck for sysfs runtime tunables.
9 mce=bootlog Enable logging of machine checks left over from booting. 9
10 Disabled by default on AMD because some BIOS leave bogus ones. 10 mce=off
11 If your BIOS doesn't do that it's a good idea to enable though 11 Disable machine check
12 to make sure you log even machine check events that result 12 mce=no_cmci
13 in a reboot. On Intel systems it is enabled by default. 13 Disable CMCI(Corrected Machine Check Interrupt) that
14 Intel processor supports. Usually this disablement is
15 not recommended, but it might be handy if your hardware
16 is misbehaving.
17 Note that you'll get more problems without CMCI than with
18 due to the shared banks, i.e. you might get duplicated
19 error logs.
20 mce=dont_log_ce
21 Don't make logs for corrected errors. All events reported
22 as corrected are silently cleared by OS.
23 This option will be useful if you have no interest in any
24 of corrected errors.
25 mce=ignore_ce
26 Disable features for corrected errors, e.g. polling timer
27 and CMCI. All events reported as corrected are not cleared
28 by OS and remain in its error banks.
29 Usually this disablement is not recommended, however if
30 there is an agent checking/clearing corrected errors
31 (e.g. BIOS or hardware monitoring applications), conflicting
32 with OS's error handling, and you cannot deactivate the agent,
33 then this option will be a help.
34 mce=bootlog
35 Enable logging of machine checks left over from booting.
36 Disabled by default on AMD because some BIOS leave bogus ones.
37 If your BIOS doesn't do that it's a good idea to enable though
38 to make sure you log even machine check events that result
39 in a reboot. On Intel systems it is enabled by default.
14 mce=nobootlog 40 mce=nobootlog
15 Disable boot machine check logging. 41 Disable boot machine check logging.
16 mce=tolerancelevel (number) 42 mce=tolerancelevel[,monarchtimeout] (number,number)
43 tolerance levels:
17 0: always panic on uncorrected errors, log corrected errors 44 0: always panic on uncorrected errors, log corrected errors
18 1: panic or SIGBUS on uncorrected errors, log corrected errors 45 1: panic or SIGBUS on uncorrected errors, log corrected errors
19 2: SIGBUS or log uncorrected errors, log corrected errors 46 2: SIGBUS or log uncorrected errors, log corrected errors
20 3: never panic or SIGBUS, log all errors (for testing only) 47 3: never panic or SIGBUS, log all errors (for testing only)
21 Default is 1 48 Default is 1
22 Can be also set using sysfs which is preferable. 49 Can be also set using sysfs which is preferable.
50 monarchtimeout:
51 Sets the time in us to wait for other CPUs on machine checks. 0
52 to disable.
23 53
24 nomce (for compatibility with i386): same as mce=off 54 nomce (for compatibility with i386): same as mce=off
25 55
@@ -150,11 +180,6 @@ NUMA
150 Otherwise, the remaining system RAM is allocated to an 180 Otherwise, the remaining system RAM is allocated to an
151 additional node. 181 additional node.
152 182
153 numa=hotadd=percent
154 Only allow hotadd memory to preallocate page structures upto
155 percent of already available memory.
156 numa=hotadd=0 will disable hotadd memory.
157
158ACPI 183ACPI
159 184
160 acpi=off Don't enable ACPI 185 acpi=off Don't enable ACPI
diff --git a/Documentation/x86/x86_64/machinecheck b/Documentation/x86/x86_64/machinecheck
index a05e58e7b159..b1fb30273286 100644
--- a/Documentation/x86/x86_64/machinecheck
+++ b/Documentation/x86/x86_64/machinecheck
@@ -41,7 +41,9 @@ check_interval
41 the polling interval. When the poller stops finding MCEs, it 41 the polling interval. When the poller stops finding MCEs, it
42 triggers an exponential backoff (poll less often) on the polling 42 triggers an exponential backoff (poll less often) on the polling
43 interval. The check_interval variable is both the initial and 43 interval. The check_interval variable is both the initial and
44 maximum polling interval. 44 maximum polling interval. 0 means no polling for corrected machine
45 check errors (but some corrected errors might be still reported
46 in other ways)
45 47
46tolerant 48tolerant
47 Tolerance level. When a machine check exception occurs for a non 49 Tolerance level. When a machine check exception occurs for a non
@@ -67,6 +69,10 @@ trigger
67 Program to run when a machine check event is detected. 69 Program to run when a machine check event is detected.
68 This is an alternative to running mcelog regularly from cron 70 This is an alternative to running mcelog regularly from cron
69 and allows to detect events faster. 71 and allows to detect events faster.
72monarch_timeout
73 How long to wait for the other CPUs to machine check too on an
74 exception. 0 to disable waiting for other CPUs.
75 Unit: us
70 76
71TBD document entries for AMD threshold interrupt configuration 77TBD document entries for AMD threshold interrupt configuration
72 78
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 29b52b14d0b4..d6498e3cd713 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -6,10 +6,11 @@ Virtual memory map with 4 level page tables:
60000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm 60000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
7hole caused by [48:63] sign extension 7hole caused by [48:63] sign extension
8ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole 8ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
9ffff880000000000 - ffffc0ffffffffff (=57 TB) direct mapping of all phys. memory 9ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
10ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole 10ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
11ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space 11ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
12ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB) 12ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
13ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
13... unused hole ... 14... unused hole ...
14ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0 15ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0
15ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space 16ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space