aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/00-INDEX2
-rw-r--r--Documentation/ABI/testing/sysfs-block37
-rw-r--r--Documentation/ABI/testing/sysfs-bus-pci17
-rw-r--r--Documentation/ABI/testing/sysfs-class-mtd125
-rw-r--r--Documentation/ABI/testing/sysfs-fs-ext410
-rw-r--r--Documentation/ABI/testing/sysfs-pps73
-rw-r--r--Documentation/Changes7
-rw-r--r--Documentation/DocBook/debugobjects.tmpl2
-rw-r--r--Documentation/DocBook/kernel-hacking.tmpl4
-rw-r--r--Documentation/DocBook/mac80211.tmpl2
-rw-r--r--Documentation/DocBook/uio-howto.tmpl163
-rw-r--r--Documentation/PCI/pci-error-recovery.txt119
-rw-r--r--Documentation/PCI/pcieaer-howto.txt25
-rw-r--r--Documentation/RCU/RTFP.txt77
-rw-r--r--Documentation/RCU/UP.txt34
-rw-r--r--Documentation/RCU/checklist.txt20
-rw-r--r--Documentation/RCU/rcu.txt10
-rw-r--r--Documentation/RCU/rcubarrier.txt7
-rw-r--r--Documentation/RCU/rculist_nulls.txt7
-rw-r--r--Documentation/RCU/torture.txt23
-rw-r--r--Documentation/RCU/trace.txt7
-rw-r--r--Documentation/RCU/whatisRCU.txt22
-rw-r--r--Documentation/SubmitChecklist2
-rw-r--r--Documentation/accounting/getdelays.c3
-rw-r--r--Documentation/arm/SA1100/ADSBitsy2
-rw-r--r--Documentation/arm/SA1100/Assabet2
-rw-r--r--Documentation/arm/SA1100/Brutus2
-rw-r--r--Documentation/arm/SA1100/GraphicsClient4
-rw-r--r--Documentation/arm/SA1100/GraphicsMaster4
-rw-r--r--Documentation/arm/SA1100/Victor2
-rw-r--r--Documentation/arm/Samsung-S3C24XX/CPUfreq.txt75
-rw-r--r--Documentation/arm/memory.txt2
-rw-r--r--Documentation/atomic_ops.txt4
-rw-r--r--Documentation/block/data-integrity.txt4
-rw-r--r--Documentation/btmrvl.txt119
-rw-r--r--Documentation/cdrom/packet-writing.txt2
-rw-r--r--Documentation/cgroups/cpusets.txt12
-rw-r--r--Documentation/cgroups/memory.txt16
-rw-r--r--Documentation/connector/Makefile5
-rw-r--r--Documentation/connector/cn_test.c44
-rw-r--r--Documentation/connector/connector.txt119
-rw-r--r--Documentation/connector/ucon.c64
-rw-r--r--Documentation/cpu-freq/cpu-drivers.txt2
-rw-r--r--Documentation/cpu-freq/governors.txt26
-rw-r--r--Documentation/cpu-freq/user-guide.txt1
-rw-r--r--Documentation/device-mapper/dm-log.txt54
-rw-r--r--Documentation/device-mapper/dm-queue-length.txt39
-rw-r--r--Documentation/device-mapper/dm-service-time.txt91
-rw-r--r--Documentation/dontdiff1
-rw-r--r--Documentation/driver-model/device.txt32
-rw-r--r--Documentation/driver-model/driver.txt4
-rw-r--r--Documentation/dvb/get_dvb_firmware61
-rw-r--r--Documentation/fault-injection/fault-injection.txt70
-rw-r--r--Documentation/fb/vesafb.txt2
-rw-r--r--Documentation/feature-removal-schedule.txt139
-rw-r--r--Documentation/filesystems/00-INDEX4
-rw-r--r--Documentation/filesystems/9p.txt3
-rw-r--r--Documentation/filesystems/Locking45
-rw-r--r--Documentation/filesystems/afs.txt26
-rw-r--r--Documentation/filesystems/ext2.txt2
-rw-r--r--Documentation/filesystems/ext4.txt4
-rw-r--r--Documentation/filesystems/gfs2-uevents.txt100
-rw-r--r--Documentation/filesystems/isofs.txt9
-rw-r--r--Documentation/filesystems/nfs.txt98
-rw-r--r--Documentation/filesystems/proc.txt268
-rw-r--r--Documentation/filesystems/seq_file.txt2
-rw-r--r--Documentation/filesystems/sysfs.txt3
-rw-r--r--Documentation/firmware_class/README3
-rw-r--r--Documentation/flexible-arrays.txt99
-rw-r--r--Documentation/gcov.txt253
-rw-r--r--Documentation/hwmon/pcf859128
-rw-r--r--Documentation/hwmon/tmp42136
-rw-r--r--Documentation/i2c/instantiating-devices44
-rw-r--r--Documentation/i2c/writing-clients16
-rw-r--r--Documentation/input/input.txt2
-rw-r--r--Documentation/input/rotary-encoder.txt9
-rw-r--r--Documentation/input/sentelic.txt475
-rw-r--r--Documentation/intel_txt.txt210
-rw-r--r--Documentation/ioctl/ioctl-number.txt6
-rw-r--r--Documentation/isdn/00-INDEX19
-rw-r--r--Documentation/ja_JP/SubmitChecklist2
-rw-r--r--Documentation/kernel-parameters.txt160
-rw-r--r--Documentation/keys.txt39
-rw-r--r--Documentation/kmemcheck.txt773
-rw-r--r--Documentation/kmemleak.txt54
-rw-r--r--Documentation/kprobes.txt6
-rw-r--r--Documentation/kvm/api.txt759
-rw-r--r--Documentation/laptops/thinkpad-acpi.txt174
-rw-r--r--Documentation/leds-lp3944.txt50
-rw-r--r--Documentation/lguest/lguest.c721
-rw-r--r--Documentation/lockdep-design.txt6
-rw-r--r--Documentation/networking/00-INDEX2
-rw-r--r--Documentation/networking/6pack.txt2
-rw-r--r--Documentation/networking/ieee802154.txt20
-rw-r--r--Documentation/networking/ip-sysctl.txt47
-rw-r--r--Documentation/power/runtime_pm.txt378
-rw-r--r--Documentation/powerpc/booting-without-of.txt1168
-rw-r--r--Documentation/powerpc/dts-bindings/4xx/emac.txt148
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/esdhc.txt2
-rw-r--r--Documentation/powerpc/dts-bindings/gpio/gpio.txt50
-rw-r--r--Documentation/powerpc/dts-bindings/gpio/led.txt17
-rw-r--r--Documentation/powerpc/dts-bindings/gpio/mdio.txt19
-rw-r--r--Documentation/powerpc/dts-bindings/marvell.txt521
-rw-r--r--Documentation/powerpc/dts-bindings/phy.txt25
-rw-r--r--Documentation/powerpc/dts-bindings/spi-bus.txt57
-rw-r--r--Documentation/powerpc/dts-bindings/usb-ehci.txt25
-rw-r--r--Documentation/powerpc/dts-bindings/xilinx.txt295
-rw-r--r--Documentation/pps/pps.txt172
-rw-r--r--Documentation/rfkill.txt139
-rw-r--r--Documentation/robust-futex-ABI.txt4
-rw-r--r--Documentation/s390/s390dbf.txt7
-rw-r--r--Documentation/scheduler/sched-rt-group.txt13
-rw-r--r--Documentation/scsi/scsi_fc_transport.txt14
-rw-r--r--Documentation/scsi/scsi_mid_low_api.txt5
-rw-r--r--Documentation/sound/alsa/ALSA-Configuration.txt30
-rw-r--r--Documentation/sound/alsa/HD-Audio-Models.txt35
-rw-r--r--Documentation/sound/alsa/HD-Audio.txt64
-rw-r--r--Documentation/sound/alsa/Procfile.txt5
-rw-r--r--Documentation/spi/spidev_test.c10
-rw-r--r--Documentation/sysctl/kernel.txt16
-rw-r--r--Documentation/sysctl/vm.txt23
-rw-r--r--Documentation/sysrq.txt7
-rw-r--r--Documentation/trace/events.txt217
-rw-r--r--Documentation/trace/ftrace-design.txt233
-rw-r--r--Documentation/trace/ftrace.txt299
-rw-r--r--Documentation/trace/function-graph-fold.vim42
-rw-r--r--Documentation/trace/mmiotrace.txt26
-rw-r--r--Documentation/trace/ring-buffer-design.txt955
-rw-r--r--Documentation/vgaarbiter.txt194
-rw-r--r--Documentation/video4linux/CARDLIST.cx238857
-rw-r--r--Documentation/video4linux/CARDLIST.cx889
-rw-r--r--Documentation/video4linux/CARDLIST.em28xx17
-rw-r--r--Documentation/video4linux/CARDLIST.saa713428
-rw-r--r--Documentation/video4linux/CARDLIST.tuner3
-rw-r--r--Documentation/video4linux/CQcam.txt4
-rw-r--r--Documentation/video4linux/gspca.txt50
-rw-r--r--Documentation/video4linux/pxa_camera.txt49
-rw-r--r--Documentation/video4linux/si4713.txt176
-rw-r--r--Documentation/video4linux/v4l2-framework.txt29
-rw-r--r--Documentation/vm/Makefile2
-rw-r--r--Documentation/vm/balance18
-rw-r--r--Documentation/vm/page-types.c698
-rw-r--r--Documentation/vm/pagemap.txt68
-rw-r--r--Documentation/vm/slub.txt10
-rw-r--r--Documentation/watchdog/hpwdt.txt95
-rw-r--r--Documentation/x86/00-INDEX2
-rw-r--r--Documentation/x86/exception-tables.txt (renamed from Documentation/exception.txt)202
-rw-r--r--Documentation/x86/zero-page.txt1
148 files changed, 10640 insertions, 2424 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index d05737aaa84b..06b982affe76 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -82,6 +82,8 @@ block/
82 - info on the Block I/O (BIO) layer. 82 - info on the Block I/O (BIO) layer.
83blockdev/ 83blockdev/
84 - info on block devices & drivers 84 - info on block devices & drivers
85btmrvl.txt
86 - info on Marvell Bluetooth driver usage.
85cachetlb.txt 87cachetlb.txt
86 - describes the cache/TLB flushing interfaces Linux uses. 88 - describes the cache/TLB flushing interfaces Linux uses.
87cdrom/ 89cdrom/
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index cbbd3e069945..5f3bedaf8e35 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -94,28 +94,37 @@ What: /sys/block/<disk>/queue/physical_block_size
94Date: May 2009 94Date: May 2009
95Contact: Martin K. Petersen <martin.petersen@oracle.com> 95Contact: Martin K. Petersen <martin.petersen@oracle.com>
96Description: 96Description:
97 This is the smallest unit the storage device can write 97 This is the smallest unit a physical storage device can
98 without resorting to read-modify-write operation. It is 98 write atomically. It is usually the same as the logical
99 usually the same as the logical block size but may be 99 block size but may be bigger. One example is SATA
100 bigger. One example is SATA drives with 4KB sectors 100 drives with 4KB sectors that expose a 512-byte logical
101 that expose a 512-byte logical block size to the 101 block size to the operating system. For stacked block
102 operating system. 102 devices the physical_block_size variable contains the
103 maximum physical_block_size of the component devices.
103 104
104What: /sys/block/<disk>/queue/minimum_io_size 105What: /sys/block/<disk>/queue/minimum_io_size
105Date: April 2009 106Date: April 2009
106Contact: Martin K. Petersen <martin.petersen@oracle.com> 107Contact: Martin K. Petersen <martin.petersen@oracle.com>
107Description: 108Description:
108 Storage devices may report a preferred minimum I/O size, 109 Storage devices may report a granularity or preferred
109 which is the smallest request the device can perform 110 minimum I/O size which is the smallest request the
110 without incurring a read-modify-write penalty. For disk 111 device can perform without incurring a performance
111 drives this is often the physical block size. For RAID 112 penalty. For disk drives this is often the physical
112 arrays it is often the stripe chunk size. 113 block size. For RAID arrays it is often the stripe
114 chunk size. A properly aligned multiple of
115 minimum_io_size is the preferred request size for
116 workloads where a high number of I/O operations is
117 desired.
113 118
114What: /sys/block/<disk>/queue/optimal_io_size 119What: /sys/block/<disk>/queue/optimal_io_size
115Date: April 2009 120Date: April 2009
116Contact: Martin K. Petersen <martin.petersen@oracle.com> 121Contact: Martin K. Petersen <martin.petersen@oracle.com>
117Description: 122Description:
118 Storage devices may report an optimal I/O size, which is 123 Storage devices may report an optimal I/O size, which is
119 the device's preferred unit of receiving I/O. This is 124 the device's preferred unit for sustained I/O. This is
120 rarely reported for disk drives. For RAID devices it is 125 rarely reported for disk drives. For RAID arrays it is
121 usually the stripe width or the internal block size. 126 usually the stripe width or the internal track size. A
127 properly aligned multiple of optimal_io_size is the
128 preferred request size for workloads where sustained
129 throughput is desired. If no optimal I/O size is
130 reported this file contains 0.
diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 97ad190e13af..25be3250f7d6 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -84,6 +84,16 @@ Description:
84 from this part of the device tree. 84 from this part of the device tree.
85 Depends on CONFIG_HOTPLUG. 85 Depends on CONFIG_HOTPLUG.
86 86
87What: /sys/bus/pci/devices/.../reset
88Date: July 2009
89Contact: Michael S. Tsirkin <mst@redhat.com>
90Description:
91 Some devices allow an individual function to be reset
92 without affecting other functions in the same device.
93 For devices that have this support, a file named reset
94 will be present in sysfs. Writing 1 to this file
95 will perform reset.
96
87What: /sys/bus/pci/devices/.../vpd 97What: /sys/bus/pci/devices/.../vpd
88Date: February 2008 98Date: February 2008
89Contact: Ben Hutchings <bhutchings@solarflare.com> 99Contact: Ben Hutchings <bhutchings@solarflare.com>
@@ -122,3 +132,10 @@ Description:
122 This symbolic link appears when a device is a Virtual Function. 132 This symbolic link appears when a device is a Virtual Function.
123 The symbolic link points to the PCI device sysfs entry of the 133 The symbolic link points to the PCI device sysfs entry of the
124 Physical Function this device associates with. 134 Physical Function this device associates with.
135
136What: /sys/bus/pci/slots/.../module
137Date: June 2009
138Contact: linux-pci@vger.kernel.org
139Description:
140 This symbolic link points to the PCI hotplug controller driver
141 module that manages the hotplug slot.
diff --git a/Documentation/ABI/testing/sysfs-class-mtd b/Documentation/ABI/testing/sysfs-class-mtd
new file mode 100644
index 000000000000..4d55a1888981
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-mtd
@@ -0,0 +1,125 @@
1What: /sys/class/mtd/
2Date: April 2009
3KernelVersion: 2.6.29
4Contact: linux-mtd@lists.infradead.org
5Description:
6 The mtd/ class subdirectory belongs to the MTD subsystem
7 (MTD core).
8
9What: /sys/class/mtd/mtdX/
10Date: April 2009
11KernelVersion: 2.6.29
12Contact: linux-mtd@lists.infradead.org
13Description:
14 The /sys/class/mtd/mtd{0,1,2,3,...} directories correspond
15 to each /dev/mtdX character device. These may represent
16 physical/simulated flash devices, partitions on a flash
17 device, or concatenated flash devices. They exist regardless
18 of whether CONFIG_MTD_CHAR is actually enabled.
19
20What: /sys/class/mtd/mtdXro/
21Date: April 2009
22KernelVersion: 2.6.29
23Contact: linux-mtd@lists.infradead.org
24Description:
25 These directories provide the corresponding read-only device
26 nodes for /sys/class/mtd/mtdX/ . They are only created
27 (for the benefit of udev) if CONFIG_MTD_CHAR is enabled.
28
29What: /sys/class/mtd/mtdX/dev
30Date: April 2009
31KernelVersion: 2.6.29
32Contact: linux-mtd@lists.infradead.org
33Description:
34 Major and minor numbers of the character device corresponding
35 to this MTD device (in <major>:<minor> format). This is the
36 read-write device so <minor> will be even.
37
38What: /sys/class/mtd/mtdXro/dev
39Date: April 2009
40KernelVersion: 2.6.29
41Contact: linux-mtd@lists.infradead.org
42Description:
43 Major and minor numbers of the character device corresponding
44 to the read-only variant of thie MTD device (in
45 <major>:<minor> format). In this case <minor> will be odd.
46
47What: /sys/class/mtd/mtdX/erasesize
48Date: April 2009
49KernelVersion: 2.6.29
50Contact: linux-mtd@lists.infradead.org
51Description:
52 "Major" erase size for the device. If numeraseregions is
53 zero, this is the eraseblock size for the entire device.
54 Otherwise, the MEMGETREGIONCOUNT/MEMGETREGIONINFO ioctls
55 can be used to determine the actual eraseblock layout.
56
57What: /sys/class/mtd/mtdX/flags
58Date: April 2009
59KernelVersion: 2.6.29
60Contact: linux-mtd@lists.infradead.org
61Description:
62 A hexadecimal value representing the device flags, ORed
63 together:
64
65 0x0400: MTD_WRITEABLE - device is writable
66 0x0800: MTD_BIT_WRITEABLE - single bits can be flipped
67 0x1000: MTD_NO_ERASE - no erase necessary
68 0x2000: MTD_POWERUP_LOCK - always locked after reset
69
70What: /sys/class/mtd/mtdX/name
71Date: April 2009
72KernelVersion: 2.6.29
73Contact: linux-mtd@lists.infradead.org
74Description:
75 A human-readable ASCII name for the device or partition.
76 This will match the name in /proc/mtd .
77
78What: /sys/class/mtd/mtdX/numeraseregions
79Date: April 2009
80KernelVersion: 2.6.29
81Contact: linux-mtd@lists.infradead.org
82Description:
83 For devices that have variable eraseblock sizes, this
84 provides the total number of erase regions. Otherwise,
85 it will read back as zero.
86
87What: /sys/class/mtd/mtdX/oobsize
88Date: April 2009
89KernelVersion: 2.6.29
90Contact: linux-mtd@lists.infradead.org
91Description:
92 Number of OOB bytes per page.
93
94What: /sys/class/mtd/mtdX/size
95Date: April 2009
96KernelVersion: 2.6.29
97Contact: linux-mtd@lists.infradead.org
98Description:
99 Total size of the device/partition, in bytes.
100
101What: /sys/class/mtd/mtdX/type
102Date: April 2009
103KernelVersion: 2.6.29
104Contact: linux-mtd@lists.infradead.org
105Description:
106 One of the following ASCII strings, representing the device
107 type:
108
109 absent, ram, rom, nor, nand, dataflash, ubi, unknown
110
111What: /sys/class/mtd/mtdX/writesize
112Date: April 2009
113KernelVersion: 2.6.29
114Contact: linux-mtd@lists.infradead.org
115Description:
116 Minimal writable flash unit size. This will always be
117 a positive integer.
118
119 In the case of NOR flash it is 1 (even though individual
120 bits can be cleared).
121
122 In the case of NAND flash it is one NAND page (or a
123 half page, or a quarter page).
124
125 In the case of ECC NOR, it is the ECC block size.
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
index 4e79074de282..5fb709997d96 100644
--- a/Documentation/ABI/testing/sysfs-fs-ext4
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -79,3 +79,13 @@ Description:
79 This file is read-only and shows the number of 79 This file is read-only and shows the number of
80 kilobytes of data that have been written to this 80 kilobytes of data that have been written to this
81 filesystem since it was mounted. 81 filesystem since it was mounted.
82
83What: /sys/fs/ext4/<disk>/inode_goal
84Date: June 2008
85Contact: "Theodore Ts'o" <tytso@mit.edu>
86Description:
87 Tuning parameter which (if non-zero) controls the goal
88 inode used by the inode allocator in p0reference to
89 all other allocation hueristics. This is intended for
90 debugging use only, and should be 0 on production
91 systems.
diff --git a/Documentation/ABI/testing/sysfs-pps b/Documentation/ABI/testing/sysfs-pps
new file mode 100644
index 000000000000..25028c7bc37d
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-pps
@@ -0,0 +1,73 @@
1What: /sys/class/pps/
2Date: February 2008
3Contact: Rodolfo Giometti <giometti@linux.it>
4Description:
5 The /sys/class/pps/ directory will contain files and
6 directories that will provide a unified interface to
7 the PPS sources.
8
9What: /sys/class/pps/ppsX/
10Date: February 2008
11Contact: Rodolfo Giometti <giometti@linux.it>
12Description:
13 The /sys/class/pps/ppsX/ directory is related to X-th
14 PPS source into the system. Each directory will
15 contain files to manage and control its PPS source.
16
17What: /sys/class/pps/ppsX/assert
18Date: February 2008
19Contact: Rodolfo Giometti <giometti@linux.it>
20Description:
21 The /sys/class/pps/ppsX/assert file reports the assert events
22 and the assert sequence number of the X-th source in the form:
23
24 <secs>.<nsec>#<sequence>
25
26 If the source has no assert events the content of this file
27 is empty.
28
29What: /sys/class/pps/ppsX/clear
30Date: February 2008
31Contact: Rodolfo Giometti <giometti@linux.it>
32Description:
33 The /sys/class/pps/ppsX/clear file reports the clear events
34 and the clear sequence number of the X-th source in the form:
35
36 <secs>.<nsec>#<sequence>
37
38 If the source has no clear events the content of this file
39 is empty.
40
41What: /sys/class/pps/ppsX/mode
42Date: February 2008
43Contact: Rodolfo Giometti <giometti@linux.it>
44Description:
45 The /sys/class/pps/ppsX/mode file reports the functioning
46 mode of the X-th source in hexadecimal encoding.
47
48 Please, refer to linux/include/linux/pps.h for further
49 info.
50
51What: /sys/class/pps/ppsX/echo
52Date: February 2008
53Contact: Rodolfo Giometti <giometti@linux.it>
54Description:
55 The /sys/class/pps/ppsX/echo file reports if the X-th does
56 or does not support an "echo" function.
57
58What: /sys/class/pps/ppsX/name
59Date: February 2008
60Contact: Rodolfo Giometti <giometti@linux.it>
61Description:
62 The /sys/class/pps/ppsX/name file reports the name of the
63 X-th source.
64
65What: /sys/class/pps/ppsX/path
66Date: February 2008
67Contact: Rodolfo Giometti <giometti@linux.it>
68Description:
69 The /sys/class/pps/ppsX/path file reports the path name of
70 the device connected with the X-th source.
71
72 If the source is not connected with any device the content
73 of this file is empty.
diff --git a/Documentation/Changes b/Documentation/Changes
index 664392481c84..6d0f1efc5bf6 100644
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -72,6 +72,13 @@ assembling the 16-bit boot code, removing the need for as86 to compile
72your kernel. This change does, however, mean that you need a recent 72your kernel. This change does, however, mean that you need a recent
73release of binutils. 73release of binutils.
74 74
75Perl
76----
77
78You will need perl 5 and the following modules: Getopt::Long, Getopt::Std,
79File::Basename, and File::Find to build the kernel.
80
81
75System utilities 82System utilities
76================ 83================
77 84
diff --git a/Documentation/DocBook/debugobjects.tmpl b/Documentation/DocBook/debugobjects.tmpl
index 7f5f218015fe..08ff908aa7a2 100644
--- a/Documentation/DocBook/debugobjects.tmpl
+++ b/Documentation/DocBook/debugobjects.tmpl
@@ -106,7 +106,7 @@
106 number of errors are printk'ed including a full stack trace. 106 number of errors are printk'ed including a full stack trace.
107 </para> 107 </para>
108 <para> 108 <para>
109 The statistics are available via debugfs/debug_objects/stats. 109 The statistics are available via /sys/kernel/debug/debug_objects/stats.
110 They provide information about the number of warnings and the 110 They provide information about the number of warnings and the
111 number of successful fixups along with information about the 111 number of successful fixups along with information about the
112 usage of the internal tracking objects and the state of the 112 usage of the internal tracking objects and the state of the
diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl
index a50d6cd58573..992e67e6be7f 100644
--- a/Documentation/DocBook/kernel-hacking.tmpl
+++ b/Documentation/DocBook/kernel-hacking.tmpl
@@ -449,8 +449,8 @@ printk(KERN_INFO "i = %u\n", i);
449 </para> 449 </para>
450 450
451 <programlisting> 451 <programlisting>
452__u32 ipaddress; 452__be32 ipaddress;
453printk(KERN_INFO "my ip: %d.%d.%d.%d\n", NIPQUAD(ipaddress)); 453printk(KERN_INFO "my ip: %pI4\n", &amp;ipaddress);
454 </programlisting> 454 </programlisting>
455 455
456 <para> 456 <para>
diff --git a/Documentation/DocBook/mac80211.tmpl b/Documentation/DocBook/mac80211.tmpl
index e36986663570..f3f37f141dbd 100644
--- a/Documentation/DocBook/mac80211.tmpl
+++ b/Documentation/DocBook/mac80211.tmpl
@@ -184,8 +184,6 @@ usage should require reading the full document.
184!Finclude/net/mac80211.h ieee80211_ctstoself_get 184!Finclude/net/mac80211.h ieee80211_ctstoself_get
185!Finclude/net/mac80211.h ieee80211_ctstoself_duration 185!Finclude/net/mac80211.h ieee80211_ctstoself_duration
186!Finclude/net/mac80211.h ieee80211_generic_frame_duration 186!Finclude/net/mac80211.h ieee80211_generic_frame_duration
187!Finclude/net/mac80211.h ieee80211_get_hdrlen_from_skb
188!Finclude/net/mac80211.h ieee80211_hdrlen
189!Finclude/net/mac80211.h ieee80211_wake_queue 187!Finclude/net/mac80211.h ieee80211_wake_queue
190!Finclude/net/mac80211.h ieee80211_stop_queue 188!Finclude/net/mac80211.h ieee80211_stop_queue
191!Finclude/net/mac80211.h ieee80211_wake_queues 189!Finclude/net/mac80211.h ieee80211_wake_queues
diff --git a/Documentation/DocBook/uio-howto.tmpl b/Documentation/DocBook/uio-howto.tmpl
index 8f6e3b2403c7..4d4ce0e61e42 100644
--- a/Documentation/DocBook/uio-howto.tmpl
+++ b/Documentation/DocBook/uio-howto.tmpl
@@ -25,6 +25,10 @@
25 <year>2006-2008</year> 25 <year>2006-2008</year>
26 <holder>Hans-Jürgen Koch.</holder> 26 <holder>Hans-Jürgen Koch.</holder>
27</copyright> 27</copyright>
28<copyright>
29 <year>2009</year>
30 <holder>Red Hat Inc, Michael S. Tsirkin (mst@redhat.com)</holder>
31</copyright>
28 32
29<legalnotice> 33<legalnotice>
30<para> 34<para>
@@ -42,6 +46,13 @@ GPL version 2.
42 46
43<revhistory> 47<revhistory>
44 <revision> 48 <revision>
49 <revnumber>0.9</revnumber>
50 <date>2009-07-16</date>
51 <authorinitials>mst</authorinitials>
52 <revremark>Added generic pci driver
53 </revremark>
54 </revision>
55 <revision>
45 <revnumber>0.8</revnumber> 56 <revnumber>0.8</revnumber>
46 <date>2008-12-24</date> 57 <date>2008-12-24</date>
47 <authorinitials>hjk</authorinitials> 58 <authorinitials>hjk</authorinitials>
@@ -809,6 +820,158 @@ framework to set up sysfs files for this region. Simply leave it alone.
809 820
810</chapter> 821</chapter>
811 822
823<chapter id="uio_pci_generic" xreflabel="Using Generic driver for PCI cards">
824<?dbhtml filename="uio_pci_generic.html"?>
825<title>Generic PCI UIO driver</title>
826 <para>
827 The generic driver is a kernel module named uio_pci_generic.
828 It can work with any device compliant to PCI 2.3 (circa 2002) and
829 any compliant PCI Express device. Using this, you only need to
830 write the userspace driver, removing the need to write
831 a hardware-specific kernel module.
832 </para>
833
834<sect1 id="uio_pci_generic_binding">
835<title>Making the driver recognize the device</title>
836 <para>
837Since the driver does not declare any device ids, it will not get loaded
838automatically and will not automatically bind to any devices, you must load it
839and allocate id to the driver yourself. For example:
840 <programlisting>
841 modprobe uio_pci_generic
842 echo &quot;8086 10f5&quot; &gt; /sys/bus/pci/drivers/uio_pci_generic/new_id
843 </programlisting>
844 </para>
845 <para>
846If there already is a hardware specific kernel driver for your device, the
847generic driver still won't bind to it, in this case if you want to use the
848generic driver (why would you?) you'll have to manually unbind the hardware
849specific driver and bind the generic driver, like this:
850 <programlisting>
851 echo -n 0000:00:19.0 &gt; /sys/bus/pci/drivers/e1000e/unbind
852 echo -n 0000:00:19.0 &gt; /sys/bus/pci/drivers/uio_pci_generic/bind
853 </programlisting>
854 </para>
855 <para>
856You can verify that the device has been bound to the driver
857by looking for it in sysfs, for example like the following:
858 <programlisting>
859 ls -l /sys/bus/pci/devices/0000:00:19.0/driver
860 </programlisting>
861Which if successful should print
862 <programlisting>
863 .../0000:00:19.0/driver -&gt; ../../../bus/pci/drivers/uio_pci_generic
864 </programlisting>
865Note that the generic driver will not bind to old PCI 2.2 devices.
866If binding the device failed, run the following command:
867 <programlisting>
868 dmesg
869 </programlisting>
870and look in the output for failure reasons
871 </para>
872</sect1>
873
874<sect1 id="uio_pci_generic_internals">
875<title>Things to know about uio_pci_generic</title>
876 <para>
877Interrupts are handled using the Interrupt Disable bit in the PCI command
878register and Interrupt Status bit in the PCI status register. All devices
879compliant to PCI 2.3 (circa 2002) and all compliant PCI Express devices should
880support these bits. uio_pci_generic detects this support, and won't bind to
881devices which do not support the Interrupt Disable Bit in the command register.
882 </para>
883 <para>
884On each interrupt, uio_pci_generic sets the Interrupt Disable bit.
885This prevents the device from generating further interrupts
886until the bit is cleared. The userspace driver should clear this
887bit before blocking and waiting for more interrupts.
888 </para>
889</sect1>
890<sect1 id="uio_pci_generic_userspace">
891<title>Writing userspace driver using uio_pci_generic</title>
892 <para>
893Userspace driver can use pci sysfs interface, or the
894libpci libray that wraps it, to talk to the device and to
895re-enable interrupts by writing to the command register.
896 </para>
897</sect1>
898<sect1 id="uio_pci_generic_example">
899<title>Example code using uio_pci_generic</title>
900 <para>
901Here is some sample userspace driver code using uio_pci_generic:
902<programlisting>
903#include &lt;stdlib.h&gt;
904#include &lt;stdio.h&gt;
905#include &lt;unistd.h&gt;
906#include &lt;sys/types.h&gt;
907#include &lt;sys/stat.h&gt;
908#include &lt;fcntl.h&gt;
909#include &lt;errno.h&gt;
910
911int main()
912{
913 int uiofd;
914 int configfd;
915 int err;
916 int i;
917 unsigned icount;
918 unsigned char command_high;
919
920 uiofd = open(&quot;/dev/uio0&quot;, O_RDONLY);
921 if (uiofd &lt; 0) {
922 perror(&quot;uio open:&quot;);
923 return errno;
924 }
925 configfd = open(&quot;/sys/class/uio/uio0/device/config&quot;, O_RDWR);
926 if (uiofd &lt; 0) {
927 perror(&quot;config open:&quot;);
928 return errno;
929 }
930
931 /* Read and cache command value */
932 err = pread(configfd, &amp;command_high, 1, 5);
933 if (err != 1) {
934 perror(&quot;command config read:&quot;);
935 return errno;
936 }
937 command_high &amp;= ~0x4;
938
939 for(i = 0;; ++i) {
940 /* Print out a message, for debugging. */
941 if (i == 0)
942 fprintf(stderr, &quot;Started uio test driver.\n&quot;);
943 else
944 fprintf(stderr, &quot;Interrupts: %d\n&quot;, icount);
945
946 /****************************************/
947 /* Here we got an interrupt from the
948 device. Do something to it. */
949 /****************************************/
950
951 /* Re-enable interrupts. */
952 err = pwrite(configfd, &amp;command_high, 1, 5);
953 if (err != 1) {
954 perror(&quot;config write:&quot;);
955 break;
956 }
957
958 /* Wait for next interrupt. */
959 err = read(uiofd, &amp;icount, 4);
960 if (err != 4) {
961 perror(&quot;uio read:&quot;);
962 break;
963 }
964
965 }
966 return errno;
967}
968
969</programlisting>
970 </para>
971</sect1>
972
973</chapter>
974
812<appendix id="app1"> 975<appendix id="app1">
813<title>Further information</title> 976<title>Further information</title>
814<itemizedlist> 977<itemizedlist>
diff --git a/Documentation/PCI/pci-error-recovery.txt b/Documentation/PCI/pci-error-recovery.txt
index 6650af432523..e83f2ea76415 100644
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -4,15 +4,17 @@
4 February 2, 2006 4 February 2, 2006
5 5
6 Current document maintainer: 6 Current document maintainer:
7 Linas Vepstas <linas@austin.ibm.com> 7 Linas Vepstas <linasvepstas@gmail.com>
8 updated by Richard Lary <rlary@us.ibm.com>
9 and Mike Mason <mmlnx@us.ibm.com> on 27-Jul-2009
8 10
9 11
10Many PCI bus controllers are able to detect a variety of hardware 12Many PCI bus controllers are able to detect a variety of hardware
11PCI errors on the bus, such as parity errors on the data and address 13PCI errors on the bus, such as parity errors on the data and address
12busses, as well as SERR and PERR errors. Some of the more advanced 14busses, as well as SERR and PERR errors. Some of the more advanced
13chipsets are able to deal with these errors; these include PCI-E chipsets, 15chipsets are able to deal with these errors; these include PCI-E chipsets,
14and the PCI-host bridges found on IBM Power4 and Power5-based pSeries 16and the PCI-host bridges found on IBM Power4, Power5 and Power6-based
15boxes. A typical action taken is to disconnect the affected device, 17pSeries boxes. A typical action taken is to disconnect the affected device,
16halting all I/O to it. The goal of a disconnection is to avoid system 18halting all I/O to it. The goal of a disconnection is to avoid system
17corruption; for example, to halt system memory corruption due to DMA's 19corruption; for example, to halt system memory corruption due to DMA's
18to "wild" addresses. Typically, a reconnection mechanism is also 20to "wild" addresses. Typically, a reconnection mechanism is also
@@ -37,10 +39,11 @@ is forced by the need to handle multi-function devices, that is,
37devices that have multiple device drivers associated with them. 39devices that have multiple device drivers associated with them.
38In the first stage, each driver is allowed to indicate what type 40In the first stage, each driver is allowed to indicate what type
39of reset it desires, the choices being a simple re-enabling of I/O 41of reset it desires, the choices being a simple re-enabling of I/O
40or requesting a hard reset (a full electrical #RST of the PCI card). 42or requesting a slot reset.
41If any driver requests a full reset, that is what will be done.
42 43
43After a full reset and/or a re-enabling of I/O, all drivers are 44If any driver requests a slot reset, that is what will be done.
45
46After a reset and/or a re-enabling of I/O, all drivers are
44again notified, so that they may then perform any device setup/config 47again notified, so that they may then perform any device setup/config
45that may be required. After these have all completed, a final 48that may be required. After these have all completed, a final
46"resume normal operations" event is sent out. 49"resume normal operations" event is sent out.
@@ -101,7 +104,7 @@ if it implements any, it must implement error_detected(). If a callback
101is not implemented, the corresponding feature is considered unsupported. 104is not implemented, the corresponding feature is considered unsupported.
102For example, if mmio_enabled() and resume() aren't there, then it 105For example, if mmio_enabled() and resume() aren't there, then it
103is assumed that the driver is not doing any direct recovery and requires 106is assumed that the driver is not doing any direct recovery and requires
104a reset. If link_reset() is not implemented, the card is assumed as 107a slot reset. If link_reset() is not implemented, the card is assumed to
105not care about link resets. Typically a driver will want to know about 108not care about link resets. Typically a driver will want to know about
106a slot_reset(). 109a slot_reset().
107 110
@@ -111,7 +114,7 @@ sequence described below.
111 114
112STEP 0: Error Event 115STEP 0: Error Event
113------------------- 116-------------------
114PCI bus error is detect by the PCI hardware. On powerpc, the slot 117A PCI bus error is detected by the PCI hardware. On powerpc, the slot
115is isolated, in that all I/O is blocked: all reads return 0xffffffff, 118is isolated, in that all I/O is blocked: all reads return 0xffffffff,
116all writes are ignored. 119all writes are ignored.
117 120
@@ -139,7 +142,7 @@ The driver must return one of the following result codes:
139 a chance to extract some diagnostic information (see 142 a chance to extract some diagnostic information (see
140 mmio_enable, below). 143 mmio_enable, below).
141 - PCI_ERS_RESULT_NEED_RESET: 144 - PCI_ERS_RESULT_NEED_RESET:
142 Driver returns this if it can't recover without a hard 145 Driver returns this if it can't recover without a
143 slot reset. 146 slot reset.
144 - PCI_ERS_RESULT_DISCONNECT: 147 - PCI_ERS_RESULT_DISCONNECT:
145 Driver returns this if it doesn't want to recover at all. 148 Driver returns this if it doesn't want to recover at all.
@@ -169,11 +172,11 @@ is STEP 6 (Permanent Failure).
169 172
170>>> The current powerpc implementation doesn't much care if the device 173>>> The current powerpc implementation doesn't much care if the device
171>>> attempts I/O at this point, or not. I/O's will fail, returning 174>>> attempts I/O at this point, or not. I/O's will fail, returning
172>>> a value of 0xff on read, and writes will be dropped. If the device 175>>> a value of 0xff on read, and writes will be dropped. If more than
173>>> driver attempts more than 10K I/O's to a frozen adapter, it will 176>>> EEH_MAX_FAILS I/O's are attempted to a frozen adapter, EEH
174>>> assume that the device driver has gone into an infinite loop, and 177>>> assumes that the device driver has gone into an infinite loop
175>>> it will panic the kernel. There doesn't seem to be any other 178>>> and prints an error to syslog. A reboot is then required to
176>>> way of stopping a device driver that insists on spinning on I/O. 179>>> get the device working again.
177 180
178STEP 2: MMIO Enabled 181STEP 2: MMIO Enabled
179------------------- 182-------------------
@@ -182,15 +185,14 @@ DMA), and then calls the mmio_enabled() callback on all affected
182device drivers. 185device drivers.
183 186
184This is the "early recovery" call. IOs are allowed again, but DMA is 187This is the "early recovery" call. IOs are allowed again, but DMA is
185not (hrm... to be discussed, I prefer not), with some restrictions. This 188not, with some restrictions. This is NOT a callback for the driver to
186is NOT a callback for the driver to start operations again, only to 189start operations again, only to peek/poke at the device, extract diagnostic
187peek/poke at the device, extract diagnostic information, if any, and 190information, if any, and eventually do things like trigger a device local
188eventually do things like trigger a device local reset or some such, 191reset or some such, but not restart operations. This callback is made if
189but not restart operations. This is callback is made if all drivers on 192all drivers on a segment agree that they can try to recover and if no automatic
190a segment agree that they can try to recover and if no automatic link reset 193link reset was performed by the HW. If the platform can't just re-enable IOs
191was performed by the HW. If the platform can't just re-enable IOs without 194without a slot reset or a link reset, it will not call this callback, and
192a slot reset or a link reset, it wont call this callback, and instead 195instead will have gone directly to STEP 3 (Link Reset) or STEP 4 (Slot Reset)
193will have gone directly to STEP 3 (Link Reset) or STEP 4 (Slot Reset)
194 196
195>>> The following is proposed; no platform implements this yet: 197>>> The following is proposed; no platform implements this yet:
196>>> Proposal: All I/O's should be done _synchronously_ from within 198>>> Proposal: All I/O's should be done _synchronously_ from within
@@ -228,9 +230,6 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume Operations).
228If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform 230If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
229proceeds to STEP 4 (Slot Reset) 231proceeds to STEP 4 (Slot Reset)
230 232
231>>> The current powerpc implementation does not implement this callback.
232
233
234STEP 3: Link Reset 233STEP 3: Link Reset
235------------------ 234------------------
236The platform resets the link, and then calls the link_reset() callback 235The platform resets the link, and then calls the link_reset() callback
@@ -253,16 +252,33 @@ The platform then proceeds to either STEP 4 (Slot Reset) or STEP 5
253 252
254>>> The current powerpc implementation does not implement this callback. 253>>> The current powerpc implementation does not implement this callback.
255 254
256
257STEP 4: Slot Reset 255STEP 4: Slot Reset
258------------------ 256------------------
259The platform performs a soft or hard reset of the device, and then
260calls the slot_reset() callback.
261 257
262A soft reset consists of asserting the adapter #RST line and then 258In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
259the platform will peform a slot reset on the requesting PCI device(s).
260The actual steps taken by a platform to perform a slot reset
261will be platform-dependent. Upon completion of slot reset, the
262platform will call the device slot_reset() callback.
263
264Powerpc platforms implement two levels of slot reset:
265soft reset(default) and fundamental(optional) reset.
266
267Powerpc soft reset consists of asserting the adapter #RST line and then
263restoring the PCI BAR's and PCI configuration header to a state 268restoring the PCI BAR's and PCI configuration header to a state
264that is equivalent to what it would be after a fresh system 269that is equivalent to what it would be after a fresh system
265power-on followed by power-on BIOS/system firmware initialization. 270power-on followed by power-on BIOS/system firmware initialization.
271Soft reset is also known as hot-reset.
272
273Powerpc fundamental reset is supported by PCI Express cards only
274and results in device's state machines, hardware logic, port states and
275configuration registers to initialize to their default conditions.
276
277For most PCI devices, a soft reset will be sufficient for recovery.
278Optional fundamental reset is provided to support a limited number
279of PCI Express PCI devices for which a soft reset is not sufficient
280for recovery.
281
266If the platform supports PCI hotplug, then the reset might be 282If the platform supports PCI hotplug, then the reset might be
267performed by toggling the slot electrical power off/on. 283performed by toggling the slot electrical power off/on.
268 284
@@ -274,10 +290,12 @@ may result in hung devices, kernel panics, or silent data corruption.
274 290
275This call gives drivers the chance to re-initialize the hardware 291This call gives drivers the chance to re-initialize the hardware
276(re-download firmware, etc.). At this point, the driver may assume 292(re-download firmware, etc.). At this point, the driver may assume
277that he card is in a fresh state and is fully functional. In 293that the card is in a fresh state and is fully functional. The slot
278particular, interrupt generation should work normally. 294is unfrozen and the driver has full access to PCI config space,
295memory mapped I/O space and DMA. Interrupts (Legacy, MSI, or MSI-X)
296will also be available.
279 297
280Drivers should not yet restart normal I/O processing operations 298Drivers should not restart normal I/O processing operations
281at this point. If all device drivers report success on this 299at this point. If all device drivers report success on this
282callback, the platform will call resume() to complete the sequence, 300callback, the platform will call resume() to complete the sequence,
283and let the driver restart normal I/O processing. 301and let the driver restart normal I/O processing.
@@ -302,11 +320,21 @@ driver performs device init only from PCI function 0:
302 - PCI_ERS_RESULT_DISCONNECT 320 - PCI_ERS_RESULT_DISCONNECT
303 Same as above. 321 Same as above.
304 322
323Drivers for PCI Express cards that require a fundamental reset must
324set the needs_freset bit in the pci_dev structure in their probe function.
325For example, the QLogic qla2xxx driver sets the needs_freset bit for certain
326PCI card types:
327
328+ /* Set EEH reset type to fundamental if required by hba */
329+ if (IS_QLA24XX(ha) || IS_QLA25XX(ha) || IS_QLA81XX(ha))
330+ pdev->needs_freset = 1;
331+
332
305Platform proceeds either to STEP 5 (Resume Operations) or STEP 6 (Permanent 333Platform proceeds either to STEP 5 (Resume Operations) or STEP 6 (Permanent
306Failure). 334Failure).
307 335
308>>> The current powerpc implementation does not currently try a 336>>> The current powerpc implementation does not try a power-cycle
309>>> power-cycle reset if the driver returned PCI_ERS_RESULT_DISCONNECT. 337>>> reset if the driver returned PCI_ERS_RESULT_DISCONNECT.
310>>> However, it probably should. 338>>> However, it probably should.
311 339
312 340
@@ -348,7 +376,7 @@ software errors.
348 376
349Conclusion; General Remarks 377Conclusion; General Remarks
350--------------------------- 378---------------------------
351The way those callbacks are called is platform policy. A platform with 379The way the callbacks are called is platform policy. A platform with
352no slot reset capability may want to just "ignore" drivers that can't 380no slot reset capability may want to just "ignore" drivers that can't
353recover (disconnect them) and try to let other cards on the same segment 381recover (disconnect them) and try to let other cards on the same segment
354recover. Keep in mind that in most real life cases, though, there will 382recover. Keep in mind that in most real life cases, though, there will
@@ -361,8 +389,8 @@ That is, the recovery API only requires that:
361 389
362 - There is no guarantee that interrupt delivery can proceed from any 390 - There is no guarantee that interrupt delivery can proceed from any
363device on the segment starting from the error detection and until the 391device on the segment starting from the error detection and until the
364resume callback is sent, at which point interrupts are expected to be 392slot_reset callback is called, at which point interrupts are expected
365fully operational. 393to be fully operational.
366 394
367 - There is no guarantee that interrupt delivery is stopped, that is, 395 - There is no guarantee that interrupt delivery is stopped, that is,
368a driver that gets an interrupt after detecting an error, or that detects 396a driver that gets an interrupt after detecting an error, or that detects
@@ -381,16 +409,23 @@ anyway :)
381>>> Implementation details for the powerpc platform are discussed in 409>>> Implementation details for the powerpc platform are discussed in
382>>> the file Documentation/powerpc/eeh-pci-error-recovery.txt 410>>> the file Documentation/powerpc/eeh-pci-error-recovery.txt
383 411
384>>> As of this writing, there are six device drivers with patches 412>>> As of this writing, there is a growing list of device drivers with
385>>> implementing error recovery. Not all of these patches are in 413>>> patches implementing error recovery. Not all of these patches are in
386>>> mainline yet. These may be used as "examples": 414>>> mainline yet. These may be used as "examples":
387>>> 415>>>
388>>> drivers/scsi/ipr.c 416>>> drivers/scsi/ipr
389>>> drivers/scsi/sym53cxx_2 417>>> drivers/scsi/sym53c8xx_2
418>>> drivers/scsi/qla2xxx
419>>> drivers/scsi/lpfc
420>>> drivers/next/bnx2.c
390>>> drivers/next/e100.c 421>>> drivers/next/e100.c
391>>> drivers/net/e1000 422>>> drivers/net/e1000
423>>> drivers/net/e1000e
392>>> drivers/net/ixgb 424>>> drivers/net/ixgb
425>>> drivers/net/ixgbe
426>>> drivers/net/cxgb3
393>>> drivers/net/s2io.c 427>>> drivers/net/s2io.c
428>>> drivers/net/qlge
394 429
395The End 430The End
396------- 431-------
diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt
index ddeb14beacc8..be21001ab144 100644
--- a/Documentation/PCI/pcieaer-howto.txt
+++ b/Documentation/PCI/pcieaer-howto.txt
@@ -61,6 +61,10 @@ be initiated although firmwares have no _OSC support. To enable the
61walkaround, pls. add aerdriver.forceload=y to kernel boot parameter line 61walkaround, pls. add aerdriver.forceload=y to kernel boot parameter line
62when booting kernel. Note that forceload=n by default. 62when booting kernel. Note that forceload=n by default.
63 63
64nosourceid, another parameter of type bool, can be used when broken
65hardware (mostly chipsets) has root ports that cannot obtain the reporting
66source ID. nosourceid=n by default.
67
642.3 AER error output 682.3 AER error output
65When a PCI-E AER error is captured, an error message will be outputed to 69When a PCI-E AER error is captured, an error message will be outputed to
66console. If it's a correctable error, it is outputed as a warning. 70console. If it's a correctable error, it is outputed as a warning.
@@ -246,3 +250,24 @@ with the PCI Express AER Root driver?
246A: It could call the helper functions to enable AER in devices and 250A: It could call the helper functions to enable AER in devices and
247cleanup uncorrectable status register. Pls. refer to section 3.3. 251cleanup uncorrectable status register. Pls. refer to section 3.3.
248 252
253
2544. Software error injection
255
256Debugging PCIE AER error recovery code is quite difficult because it
257is hard to trigger real hardware errors. Software based error
258injection can be used to fake various kinds of PCIE errors.
259
260First you should enable PCIE AER software error injection in kernel
261configuration, that is, following item should be in your .config.
262
263CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m
264
265After reboot with new kernel or insert the module, a device file named
266/dev/aer_inject should be created.
267
268Then, you need a user space tool named aer-inject, which can be gotten
269from:
270 http://www.kernel.org/pub/linux/utils/pci/aer-inject/
271
272More information about aer-inject can be found in the document comes
273with its source code.
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt
index 9f711d2df91b..d2b85237c76e 100644
--- a/Documentation/RCU/RTFP.txt
+++ b/Documentation/RCU/RTFP.txt
@@ -743,3 +743,80 @@ Revised:
743 RCU, realtime RCU, sleepable RCU, performance. 743 RCU, realtime RCU, sleepable RCU, performance.
744" 744"
745} 745}
746
747@article{PaulEMcKenney2008RCUOSR
748,author="Paul E. McKenney and Jonathan Walpole"
749,title="Introducing technology into the {Linux} kernel: a case study"
750,Year="2008"
751,journal="SIGOPS Oper. Syst. Rev."
752,volume="42"
753,number="5"
754,pages="4--17"
755,issn="0163-5980"
756,doi={http://doi.acm.org/10.1145/1400097.1400099}
757,publisher="ACM"
758,address="New York, NY, USA"
759,annotation={
760 Linux changed RCU to a far greater degree than RCU has changed Linux.
761}
762}
763
764@unpublished{PaulEMcKenney2008HierarchicalRCU
765,Author="Paul E. McKenney"
766,Title="Hierarchical {RCU}"
767,month="November"
768,day="3"
769,year="2008"
770,note="Available:
771\url{http://lwn.net/Articles/305782/}
772[Viewed November 6, 2008]"
773,annotation="
774 RCU with combining-tree-based grace-period detection,
775 permitting it to handle thousands of CPUs.
776"
777}
778
779@conference{PaulEMcKenney2009MaliciousURCU
780,Author="Paul E. McKenney"
781,Title="Using a Malicious User-Level {RCU} to Torture {RCU}-Based Algorithms"
782,Booktitle="linux.conf.au 2009"
783,month="January"
784,year="2009"
785,address="Hobart, Australia"
786,note="Available:
787\url{http://www.rdrop.com/users/paulmck/RCU/urcutorture.2009.01.22a.pdf}
788[Viewed February 2, 2009]"
789,annotation="
790 Realtime RCU and torture-testing RCU uses.
791"
792}
793
794@unpublished{MathieuDesnoyers2009URCU
795,Author="Mathieu Desnoyers"
796,Title="[{RFC} git tree] Userspace {RCU} (urcu) for {Linux}"
797,month="February"
798,day="5"
799,year="2009"
800,note="Available:
801\url{http://lkml.org/lkml/2009/2/5/572}
802\url{git://lttng.org/userspace-rcu.git}
803[Viewed February 20, 2009]"
804,annotation="
805 Mathieu Desnoyers's user-space RCU implementation.
806 git://lttng.org/userspace-rcu.git
807"
808}
809
810@unpublished{PaulEMcKenney2009BloatWatchRCU
811,Author="Paul E. McKenney"
812,Title="{RCU}: The {Bloatwatch} Edition"
813,month="March"
814,day="17"
815,year="2009"
816,note="Available:
817\url{http://lwn.net/Articles/323929/}
818[Viewed March 20, 2009]"
819,annotation="
820 Uniprocessor assumptions allow simplified RCU implementation.
821"
822}
diff --git a/Documentation/RCU/UP.txt b/Documentation/RCU/UP.txt
index aab4a9ec3931..90ec5341ee98 100644
--- a/Documentation/RCU/UP.txt
+++ b/Documentation/RCU/UP.txt
@@ -2,14 +2,13 @@ RCU on Uniprocessor Systems
2 2
3 3
4A common misconception is that, on UP systems, the call_rcu() primitive 4A common misconception is that, on UP systems, the call_rcu() primitive
5may immediately invoke its function, and that the synchronize_rcu() 5may immediately invoke its function. The basis of this misconception
6primitive may return immediately. The basis of this misconception
7is that since there is only one CPU, it should not be necessary to 6is that since there is only one CPU, it should not be necessary to
8wait for anything else to get done, since there are no other CPUs for 7wait for anything else to get done, since there are no other CPUs for
9anything else to be happening on. Although this approach will -sort- -of- 8anything else to be happening on. Although this approach will -sort- -of-
10work a surprising amount of the time, it is a very bad idea in general. 9work a surprising amount of the time, it is a very bad idea in general.
11This document presents three examples that demonstrate exactly how bad an 10This document presents three examples that demonstrate exactly how bad
12idea this is. 11an idea this is.
13 12
14 13
15Example 1: softirq Suicide 14Example 1: softirq Suicide
@@ -82,11 +81,18 @@ Quick Quiz #2: What locking restriction must RCU callbacks respect?
82 81
83Summary 82Summary
84 83
85Permitting call_rcu() to immediately invoke its arguments or permitting 84Permitting call_rcu() to immediately invoke its arguments breaks RCU,
86synchronize_rcu() to immediately return breaks RCU, even on a UP system. 85even on a UP system. So do not do it! Even on a UP system, the RCU
87So do not do it! Even on a UP system, the RCU infrastructure -must- 86infrastructure -must- respect grace periods, and -must- invoke callbacks
88respect grace periods, and -must- invoke callbacks from a known environment 87from a known environment in which no locks are held.
89in which no locks are held. 88
89It -is- safe for synchronize_sched() and synchronize_rcu_bh() to return
90immediately on an UP system. It is also safe for synchronize_rcu()
91to return immediately on UP systems, except when running preemptable
92RCU.
93
94Quick Quiz #3: Why can't synchronize_rcu() return immediately on
95 UP systems running preemptable RCU?
90 96
91 97
92Answer to Quick Quiz #1: 98Answer to Quick Quiz #1:
@@ -117,3 +123,13 @@ Answer to Quick Quiz #2:
117 callbacks acquire locks directly. However, a great many RCU 123 callbacks acquire locks directly. However, a great many RCU
118 callbacks do acquire locks -indirectly-, for example, via 124 callbacks do acquire locks -indirectly-, for example, via
119 the kfree() primitive. 125 the kfree() primitive.
126
127Answer to Quick Quiz #3:
128 Why can't synchronize_rcu() return immediately on UP systems
129 running preemptable RCU?
130
131 Because some other task might have been preempted in the middle
132 of an RCU read-side critical section. If synchronize_rcu()
133 simply immediately returned, it would prematurely signal the
134 end of the grace period, which would come as a nasty shock to
135 that other thread when it started running again.
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index accfe2f5247d..51525a30e8b4 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -11,7 +11,10 @@ over a rather long period of time, but improvements are always welcome!
11 structure is updated more than about 10% of the time, then 11 structure is updated more than about 10% of the time, then
12 you should strongly consider some other approach, unless 12 you should strongly consider some other approach, unless
13 detailed performance measurements show that RCU is nonetheless 13 detailed performance measurements show that RCU is nonetheless
14 the right tool for the job. 14 the right tool for the job. Yes, you might think of RCU
15 as simply cutting overhead off of the readers and imposing it
16 on the writers. That is exactly why normal uses of RCU will
17 do much more reading than updating.
15 18
16 Another exception is where performance is not an issue, and RCU 19 Another exception is where performance is not an issue, and RCU
17 provides a simpler implementation. An example of this situation 20 provides a simpler implementation. An example of this situation
@@ -240,10 +243,11 @@ over a rather long period of time, but improvements are always welcome!
240 instead need to use synchronize_irq() or synchronize_sched(). 243 instead need to use synchronize_irq() or synchronize_sched().
241 244
24212. Any lock acquired by an RCU callback must be acquired elsewhere 24512. Any lock acquired by an RCU callback must be acquired elsewhere
243 with irq disabled, e.g., via spin_lock_irqsave(). Failing to 246 with softirq disabled, e.g., via spin_lock_irqsave(),
244 disable irq on a given acquisition of that lock will result in 247 spin_lock_bh(), etc. Failing to disable irq on a given
245 deadlock as soon as the RCU callback happens to interrupt that 248 acquisition of that lock will result in deadlock as soon as the
246 acquisition's critical section. 249 RCU callback happens to interrupt that acquisition's critical
250 section.
247 251
24813. RCU callbacks can be and are executed in parallel. In many cases, 25213. RCU callbacks can be and are executed in parallel. In many cases,
249 the callback code simply wrappers around kfree(), so that this 253 the callback code simply wrappers around kfree(), so that this
@@ -310,3 +314,9 @@ over a rather long period of time, but improvements are always welcome!
310 Because these primitives only wait for pre-existing readers, 314 Because these primitives only wait for pre-existing readers,
311 it is the caller's responsibility to guarantee safety to 315 it is the caller's responsibility to guarantee safety to
312 any subsequent readers. 316 any subsequent readers.
317
31816. The various RCU read-side primitives do -not- contain memory
319 barriers. The CPU (and in some cases, the compiler) is free
320 to reorder code into and out of RCU read-side critical sections.
321 It is the responsibility of the RCU update-side primitives to
322 deal with this.
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt
index 7aa2002ade77..2a23523ce471 100644
--- a/Documentation/RCU/rcu.txt
+++ b/Documentation/RCU/rcu.txt
@@ -36,7 +36,7 @@ o How can the updater tell when a grace period has completed
36 executed in user mode, or executed in the idle loop, we can 36 executed in user mode, or executed in the idle loop, we can
37 safely free up that item. 37 safely free up that item.
38 38
39 Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the 39 Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the
40 same effect, but require that the readers manipulate CPU-local 40 same effect, but require that the readers manipulate CPU-local
41 counters. These counters allow limited types of blocking 41 counters. These counters allow limited types of blocking
42 within RCU read-side critical sections. SRCU also uses 42 within RCU read-side critical sections. SRCU also uses
@@ -79,10 +79,10 @@ o I hear that RCU is patented? What is with that?
79o I hear that RCU needs work in order to support realtime kernels? 79o I hear that RCU needs work in order to support realtime kernels?
80 80
81 This work is largely completed. Realtime-friendly RCU can be 81 This work is largely completed. Realtime-friendly RCU can be
82 enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter. 82 enabled via the CONFIG_TREE_PREEMPT_RCU kernel configuration
83 However, work is in progress for enabling priority boosting of 83 parameter. However, work is in progress for enabling priority
84 preempted RCU read-side critical sections. This is needed if you 84 boosting of preempted RCU read-side critical sections. This is
85 have CPU-bound realtime threads. 85 needed if you have CPU-bound realtime threads.
86 86
87o Where can I find more information on RCU? 87o Where can I find more information on RCU?
88 88
diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt
index 909602d409bb..e439a0edee22 100644
--- a/Documentation/RCU/rcubarrier.txt
+++ b/Documentation/RCU/rcubarrier.txt
@@ -170,6 +170,13 @@ module invokes call_rcu() from timers, you will need to first cancel all
170the timers, and only then invoke rcu_barrier() to wait for any remaining 170the timers, and only then invoke rcu_barrier() to wait for any remaining
171RCU callbacks to complete. 171RCU callbacks to complete.
172 172
173Of course, if you module uses call_rcu_bh(), you will need to invoke
174rcu_barrier_bh() before unloading. Similarly, if your module uses
175call_rcu_sched(), you will need to invoke rcu_barrier_sched() before
176unloading. If your module uses call_rcu(), call_rcu_bh(), -and-
177call_rcu_sched(), then you will need to invoke each of rcu_barrier(),
178rcu_barrier_bh(), and rcu_barrier_sched().
179
173 180
174Implementing rcu_barrier() 181Implementing rcu_barrier()
175 182
diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt
index 93cb28d05dcd..18f9651ff23d 100644
--- a/Documentation/RCU/rculist_nulls.txt
+++ b/Documentation/RCU/rculist_nulls.txt
@@ -83,11 +83,12 @@ not detect it missed following items in original chain.
83obj = kmem_cache_alloc(...); 83obj = kmem_cache_alloc(...);
84lock_chain(); // typically a spin_lock() 84lock_chain(); // typically a spin_lock()
85obj->key = key; 85obj->key = key;
86atomic_inc(&obj->refcnt);
87/* 86/*
88 * we need to make sure obj->key is updated before obj->next 87 * we need to make sure obj->key is updated before obj->next
88 * or obj->refcnt
89 */ 89 */
90smp_wmb(); 90smp_wmb();
91atomic_set(&obj->refcnt, 1);
91hlist_add_head_rcu(&obj->obj_node, list); 92hlist_add_head_rcu(&obj->obj_node, list);
92unlock_chain(); // typically a spin_unlock() 93unlock_chain(); // typically a spin_unlock()
93 94
@@ -159,6 +160,10 @@ out:
159obj = kmem_cache_alloc(cachep); 160obj = kmem_cache_alloc(cachep);
160lock_chain(); // typically a spin_lock() 161lock_chain(); // typically a spin_lock()
161obj->key = key; 162obj->key = key;
163/*
164 * changes to obj->key must be visible before refcnt one
165 */
166smp_wmb();
162atomic_set(&obj->refcnt, 1); 167atomic_set(&obj->refcnt, 1);
163/* 168/*
164 * insert obj in RCU way (readers might be traversing chain) 169 * insert obj in RCU way (readers might be traversing chain)
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index a342b6e1cc10..9dba3bb90e60 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -76,8 +76,10 @@ torture_type The type of RCU to test: "rcu" for the rcu_read_lock() API,
76 "rcu_sync" for rcu_read_lock() with synchronous reclamation, 76 "rcu_sync" for rcu_read_lock() with synchronous reclamation,
77 "rcu_bh" for the rcu_read_lock_bh() API, "rcu_bh_sync" for 77 "rcu_bh" for the rcu_read_lock_bh() API, "rcu_bh_sync" for
78 rcu_read_lock_bh() with synchronous reclamation, "srcu" for 78 rcu_read_lock_bh() with synchronous reclamation, "srcu" for
79 the "srcu_read_lock()" API, and "sched" for the use of 79 the "srcu_read_lock()" API, "sched" for the use of
80 preempt_disable() together with synchronize_sched(). 80 preempt_disable() together with synchronize_sched(),
81 and "sched_expedited" for the use of preempt_disable()
82 with synchronize_sched_expedited().
81 83
82verbose Enable debug printk()s. Default is disabled. 84verbose Enable debug printk()s. Default is disabled.
83 85
@@ -162,6 +164,23 @@ of the "old" and "current" counters for the corresponding CPU. The
162"idx" value maps the "old" and "current" values to the underlying array, 164"idx" value maps the "old" and "current" values to the underlying array,
163and is useful for debugging. 165and is useful for debugging.
164 166
167Similarly, sched_expedited RCU provides the following:
168
169 sched_expedited-torture: rtc: d0000000016c1880 ver: 1090796 tfle: 0 rta: 1090796 rtaf: 0 rtf: 1090787 rtmbe: 0 nt: 27713319
170 sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0
171 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0
172 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
173 state: -1 / 0:0 3:0 4:0
174
175As before, the first four lines are similar to those for RCU.
176The last line shows the task-migration state. The first number is
177-1 if synchronize_sched_expedited() is idle, -2 if in the process of
178posting wakeups to the migration kthreads, and N when waiting on CPU N.
179Each of the colon-separated fields following the "/" is a CPU:state pair.
180Valid states are "0" for idle, "1" for waiting for quiescent state,
181"2" for passed through quiescent state, and "3" when a race with a
182CPU-hotplug event forces use of the synchronize_sched() primitive.
183
165 184
166USAGE 185USAGE
167 186
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 02cced183b2d..187bbf10c923 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -191,8 +191,7 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy).
191 191
192The output of "cat rcu/rcudata" looks as follows: 192The output of "cat rcu/rcudata" looks as follows:
193 193
194rcu: 194rcu_sched:
195rcu:
196 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10 195 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10
197 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10 196 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10
198 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10 197 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10
@@ -306,7 +305,7 @@ comma-separated-variable spreadsheet format.
306 305
307The output of "cat rcu/rcugp" looks as follows: 306The output of "cat rcu/rcugp" looks as follows:
308 307
309rcu: completed=33062 gpnum=33063 308rcu_sched: completed=33062 gpnum=33063
310rcu_bh: completed=464 gpnum=464 309rcu_bh: completed=464 gpnum=464
311 310
312Again, this output is for both "rcu" and "rcu_bh". The fields are 311Again, this output is for both "rcu" and "rcu_bh". The fields are
@@ -413,7 +412,7 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
413 412
414The output of "cat rcu/rcu_pending" looks as follows: 413The output of "cat rcu/rcu_pending" looks as follows:
415 414
416rcu: 415rcu_sched:
417 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 416 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
418 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 417 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
419 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 418 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 96170824a717..e41a7fecf0d3 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -136,10 +136,10 @@ rcu_read_lock()
136 Used by a reader to inform the reclaimer that the reader is 136 Used by a reader to inform the reclaimer that the reader is
137 entering an RCU read-side critical section. It is illegal 137 entering an RCU read-side critical section. It is illegal
138 to block while in an RCU read-side critical section, though 138 to block while in an RCU read-side critical section, though
139 kernels built with CONFIG_PREEMPT_RCU can preempt RCU read-side 139 kernels built with CONFIG_TREE_PREEMPT_RCU can preempt RCU
140 critical sections. Any RCU-protected data structure accessed 140 read-side critical sections. Any RCU-protected data structure
141 during an RCU read-side critical section is guaranteed to remain 141 accessed during an RCU read-side critical section is guaranteed to
142 unreclaimed for the full duration of that critical section. 142 remain unreclaimed for the full duration of that critical section.
143 Reference counts may be used in conjunction with RCU to maintain 143 Reference counts may be used in conjunction with RCU to maintain
144 longer-term references to data structures. 144 longer-term references to data structures.
145 145
@@ -785,6 +785,7 @@ RCU pointer/list traversal:
785 rcu_dereference 785 rcu_dereference
786 list_for_each_entry_rcu 786 list_for_each_entry_rcu
787 hlist_for_each_entry_rcu 787 hlist_for_each_entry_rcu
788 hlist_nulls_for_each_entry_rcu
788 789
789 list_for_each_continue_rcu (to be deprecated in favor of new 790 list_for_each_continue_rcu (to be deprecated in favor of new
790 list_for_each_entry_continue_rcu) 791 list_for_each_entry_continue_rcu)
@@ -807,19 +808,23 @@ RCU: Critical sections Grace period Barrier
807 808
808 rcu_read_lock synchronize_net rcu_barrier 809 rcu_read_lock synchronize_net rcu_barrier
809 rcu_read_unlock synchronize_rcu 810 rcu_read_unlock synchronize_rcu
811 synchronize_rcu_expedited
810 call_rcu 812 call_rcu
811 813
812 814
813bh: Critical sections Grace period Barrier 815bh: Critical sections Grace period Barrier
814 816
815 rcu_read_lock_bh call_rcu_bh rcu_barrier_bh 817 rcu_read_lock_bh call_rcu_bh rcu_barrier_bh
816 rcu_read_unlock_bh 818 rcu_read_unlock_bh synchronize_rcu_bh
819 synchronize_rcu_bh_expedited
817 820
818 821
819sched: Critical sections Grace period Barrier 822sched: Critical sections Grace period Barrier
820 823
821 [preempt_disable] synchronize_sched rcu_barrier_sched 824 rcu_read_lock_sched synchronize_sched rcu_barrier_sched
822 [and friends] call_rcu_sched 825 rcu_read_unlock_sched call_rcu_sched
826 [preempt_disable] synchronize_sched_expedited
827 [and friends]
823 828
824 829
825SRCU: Critical sections Grace period Barrier 830SRCU: Critical sections Grace period Barrier
@@ -827,6 +832,9 @@ SRCU: Critical sections Grace period Barrier
827 srcu_read_lock synchronize_srcu N/A 832 srcu_read_lock synchronize_srcu N/A
828 srcu_read_unlock 833 srcu_read_unlock
829 834
835SRCU: Initialization/cleanup
836 init_srcu_struct
837 cleanup_srcu_struct
830 838
831See the comment headers in the source code (or the docbook generated 839See the comment headers in the source code (or the docbook generated
832from them) for more information. 840from them) for more information.
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist
index ac5e0b2f1097..78a9168ff377 100644
--- a/Documentation/SubmitChecklist
+++ b/Documentation/SubmitChecklist
@@ -54,7 +54,7 @@ kernel patches.
54 CONFIG_PREEMPT. 54 CONFIG_PREEMPT.
55 55
5614: If the patch affects IO/Disk, etc: has been tested with and without 5614: If the patch affects IO/Disk, etc: has been tested with and without
57 CONFIG_LBD. 57 CONFIG_LBDAF.
58 58
5915: All codepaths have been exercised with all lockdep features enabled. 5915: All codepaths have been exercised with all lockdep features enabled.
60 60
diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c
index 7ea231172c85..aa73e72fd793 100644
--- a/Documentation/accounting/getdelays.c
+++ b/Documentation/accounting/getdelays.c
@@ -246,7 +246,8 @@ void print_ioacct(struct taskstats *t)
246 246
247int main(int argc, char *argv[]) 247int main(int argc, char *argv[])
248{ 248{
249 int c, rc, rep_len, aggr_len, len2, cmd_type; 249 int c, rc, rep_len, aggr_len, len2;
250 int cmd_type = TASKSTATS_CMD_ATTR_UNSPEC;
250 __u16 id; 251 __u16 id;
251 __u32 mypid; 252 __u32 mypid;
252 253
diff --git a/Documentation/arm/SA1100/ADSBitsy b/Documentation/arm/SA1100/ADSBitsy
index ab47c3833908..7197a9e958ee 100644
--- a/Documentation/arm/SA1100/ADSBitsy
+++ b/Documentation/arm/SA1100/ADSBitsy
@@ -40,4 +40,4 @@ Notes:
40 mode, the timing is off so the image is corrupted. This will be 40 mode, the timing is off so the image is corrupted. This will be
41 fixed soon. 41 fixed soon.
42 42
43Any contribution can be sent to nico@cam.org and will be greatly welcome! 43Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
diff --git a/Documentation/arm/SA1100/Assabet b/Documentation/arm/SA1100/Assabet
index 78bc1c1b04e5..91f7ce7ba426 100644
--- a/Documentation/arm/SA1100/Assabet
+++ b/Documentation/arm/SA1100/Assabet
@@ -240,7 +240,7 @@ Then, rebooting the Assabet is just a matter of waiting for the login prompt.
240 240
241 241
242Nicolas Pitre 242Nicolas Pitre
243nico@cam.org 243nico@fluxnic.net
244June 12, 2001 244June 12, 2001
245 245
246 246
diff --git a/Documentation/arm/SA1100/Brutus b/Documentation/arm/SA1100/Brutus
index 2254c8f0b326..b1cfd405dccc 100644
--- a/Documentation/arm/SA1100/Brutus
+++ b/Documentation/arm/SA1100/Brutus
@@ -60,7 +60,7 @@ little modifications.
60 60
61Any contribution is welcome. 61Any contribution is welcome.
62 62
63Please send patches to nico@cam.org 63Please send patches to nico@fluxnic.net
64 64
65Have Fun ! 65Have Fun !
66 66
diff --git a/Documentation/arm/SA1100/GraphicsClient b/Documentation/arm/SA1100/GraphicsClient
index 8fa7e8027ff1..6c9c4f5a36e1 100644
--- a/Documentation/arm/SA1100/GraphicsClient
+++ b/Documentation/arm/SA1100/GraphicsClient
@@ -4,7 +4,7 @@ For more details, contact Applied Data Systems or see
4http://www.applieddata.net/products.html 4http://www.applieddata.net/products.html
5 5
6The original Linux support for this product has been provided by 6The original Linux support for this product has been provided by
7Nicolas Pitre <nico@cam.org>. Continued development work by 7Nicolas Pitre <nico@fluxnic.net>. Continued development work by
8Woojung Huh <whuh@applieddata.net> 8Woojung Huh <whuh@applieddata.net>
9 9
10It's currently possible to mount a root filesystem via NFS providing a 10It's currently possible to mount a root filesystem via NFS providing a
@@ -94,5 +94,5 @@ Notes:
94 mode, the timing is off so the image is corrupted. This will be 94 mode, the timing is off so the image is corrupted. This will be
95 fixed soon. 95 fixed soon.
96 96
97Any contribution can be sent to nico@cam.org and will be greatly welcome! 97Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
98 98
diff --git a/Documentation/arm/SA1100/GraphicsMaster b/Documentation/arm/SA1100/GraphicsMaster
index dd28745ac521..ee7c6595f23f 100644
--- a/Documentation/arm/SA1100/GraphicsMaster
+++ b/Documentation/arm/SA1100/GraphicsMaster
@@ -4,7 +4,7 @@ For more details, contact Applied Data Systems or see
4http://www.applieddata.net/products.html 4http://www.applieddata.net/products.html
5 5
6The original Linux support for this product has been provided by 6The original Linux support for this product has been provided by
7Nicolas Pitre <nico@cam.org>. Continued development work by 7Nicolas Pitre <nico@fluxnic.net>. Continued development work by
8Woojung Huh <whuh@applieddata.net> 8Woojung Huh <whuh@applieddata.net>
9 9
10Use 'make graphicsmaster_config' before any 'make config'. 10Use 'make graphicsmaster_config' before any 'make config'.
@@ -50,4 +50,4 @@ Notes:
50 mode, the timing is off so the image is corrupted. This will be 50 mode, the timing is off so the image is corrupted. This will be
51 fixed soon. 51 fixed soon.
52 52
53Any contribution can be sent to nico@cam.org and will be greatly welcome! 53Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
diff --git a/Documentation/arm/SA1100/Victor b/Documentation/arm/SA1100/Victor
index 01e81fc49461..f938a29fdc20 100644
--- a/Documentation/arm/SA1100/Victor
+++ b/Documentation/arm/SA1100/Victor
@@ -9,7 +9,7 @@ Of course Victor is using Linux as its main operating system.
9The Victor implementation for Linux is maintained by Nicolas Pitre: 9The Victor implementation for Linux is maintained by Nicolas Pitre:
10 10
11 nico@visuaide.com 11 nico@visuaide.com
12 nico@cam.org 12 nico@fluxnic.net
13 13
14For any comments, please feel free to contact me through the above 14For any comments, please feel free to contact me through the above
15addresses. 15addresses.
diff --git a/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt b/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt
new file mode 100644
index 000000000000..76b3a11e90be
--- /dev/null
+++ b/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt
@@ -0,0 +1,75 @@
1 S3C24XX CPUfreq support
2 =======================
3
4Introduction
5------------
6
7 The S3C24XX series support a number of power saving systems, such as
8 the ability to change the core, memory and peripheral operating
9 frequencies. The core control is exported via the CPUFreq driver
10 which has a number of different manual or automatic controls over the
11 rate the core is running at.
12
13 There are two forms of the driver depending on the specific CPU and
14 how the clocks are arranged. The first implementation used as single
15 PLL to feed the ARM, memory and peripherals via a series of dividers
16 and muxes and this is the implementation that is documented here. A
17 newer version where there is a seperate PLL and clock divider for the
18 ARM core is available as a seperate driver.
19
20
21Layout
22------
23
24 The code core manages the CPU specific drivers, any data that they
25 need to register and the interface to the generic drivers/cpufreq
26 system. Each CPU registers a driver to control the PLL, clock dividers
27 and anything else associated with it. Any board that wants to use this
28 framework needs to supply at least basic details of what is required.
29
30 The core registers with drivers/cpufreq at init time if all the data
31 necessary has been supplied.
32
33
34CPU support
35-----------
36
37 The support for each CPU depends on the facilities provided by the
38 SoC and the driver as each device has different PLL and clock chains
39 associated with it.
40
41
42Slow Mode
43---------
44
45 The SLOW mode where the PLL is turned off altogether and the
46 system is fed by the external crystal input is currently not
47 supported.
48
49
50sysfs
51-----
52
53 The core code exports extra information via sysfs in the directory
54 devices/system/cpu/cpu0/arch-freq.
55
56
57Board Support
58-------------
59
60 Each board that wants to use the cpufreq code must register some basic
61 information with the core driver to provide information about what the
62 board requires and any restrictions being placed on it.
63
64 The board needs to supply information about whether it needs the IO bank
65 timings changing, any maximum frequency limits and information about the
66 SDRAM refresh rate.
67
68
69
70
71Document Author
72---------------
73
74Ben Dooks, Copyright 2009 Simtec Electronics
75Licensed under GPLv2
diff --git a/Documentation/arm/memory.txt b/Documentation/arm/memory.txt
index 43cb1004d35f..9d58c7c5eddd 100644
--- a/Documentation/arm/memory.txt
+++ b/Documentation/arm/memory.txt
@@ -21,6 +21,8 @@ ffff8000 ffffffff copy_user_page / clear_user_page use.
21 For SA11xx and Xscale, this is used to 21 For SA11xx and Xscale, this is used to
22 setup a minicache mapping. 22 setup a minicache mapping.
23 23
24ffff4000 ffffffff cache aliasing on ARMv6 and later CPUs.
25
24ffff1000 ffff7fff Reserved. 26ffff1000 ffff7fff Reserved.
25 Platforms must not use this address range. 27 Platforms must not use this address range.
26 28
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index 4ef245010457..396bec3b74ed 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -229,10 +229,10 @@ kernel. It is the use of atomic counters to implement reference
229counting, and it works such that once the counter falls to zero it can 229counting, and it works such that once the counter falls to zero it can
230be guaranteed that no other entity can be accessing the object: 230be guaranteed that no other entity can be accessing the object:
231 231
232static void obj_list_add(struct obj *obj) 232static void obj_list_add(struct obj *obj, struct list_head *head)
233{ 233{
234 obj->active = 1; 234 obj->active = 1;
235 list_add(&obj->list); 235 list_add(&obj->list, head);
236} 236}
237 237
238static void obj_list_del(struct obj *obj) 238static void obj_list_del(struct obj *obj)
diff --git a/Documentation/block/data-integrity.txt b/Documentation/block/data-integrity.txt
index e8ca040ba2cf..2d735b0ae383 100644
--- a/Documentation/block/data-integrity.txt
+++ b/Documentation/block/data-integrity.txt
@@ -50,7 +50,7 @@ encouraged them to allow separation of the data and integrity metadata
50scatter-gather lists. 50scatter-gather lists.
51 51
52The controller will interleave the buffers on write and split them on 52The controller will interleave the buffers on write and split them on
53read. This means that the Linux can DMA the data buffers to and from 53read. This means that Linux can DMA the data buffers to and from
54host memory without changes to the page cache. 54host memory without changes to the page cache.
55 55
56Also, the 16-bit CRC checksum mandated by both the SCSI and SATA specs 56Also, the 16-bit CRC checksum mandated by both the SCSI and SATA specs
@@ -66,7 +66,7 @@ software RAID5).
66 66
67The IP checksum is weaker than the CRC in terms of detecting bit 67The IP checksum is weaker than the CRC in terms of detecting bit
68errors. However, the strength is really in the separation of the data 68errors. However, the strength is really in the separation of the data
69buffers and the integrity metadata. These two distinct buffers much 69buffers and the integrity metadata. These two distinct buffers must
70match up for an I/O to complete. 70match up for an I/O to complete.
71 71
72The separation of the data and integrity metadata buffers as well as 72The separation of the data and integrity metadata buffers as well as
diff --git a/Documentation/btmrvl.txt b/Documentation/btmrvl.txt
new file mode 100644
index 000000000000..34916a46c099
--- /dev/null
+++ b/Documentation/btmrvl.txt
@@ -0,0 +1,119 @@
1=======================================================================
2 README for btmrvl driver
3=======================================================================
4
5
6All commands are used via debugfs interface.
7
8=====================
9Set/get driver configurations:
10
11Path: /debug/btmrvl/config/
12
13gpiogap=[n]
14hscfgcmd
15 These commands are used to configure the host sleep parameters.
16 bit 8:0 -- Gap
17 bit 16:8 -- GPIO
18
19 where GPIO is the pin number of GPIO used to wake up the host.
20 It could be any valid GPIO pin# (e.g. 0-7) or 0xff (SDIO interface
21 wakeup will be used instead).
22
23 where Gap is the gap in milli seconds between wakeup signal and
24 wakeup event, or 0xff for special host sleep setting.
25
26 Usage:
27 # Use SDIO interface to wake up the host and set GAP to 0x80:
28 echo 0xff80 > /debug/btmrvl/config/gpiogap
29 echo 1 > /debug/btmrvl/config/hscfgcmd
30
31 # Use GPIO pin #3 to wake up the host and set GAP to 0xff:
32 echo 0x03ff > /debug/btmrvl/config/gpiogap
33 echo 1 > /debug/btmrvl/config/hscfgcmd
34
35psmode=[n]
36pscmd
37 These commands are used to enable/disable auto sleep mode
38
39 where the option is:
40 1 -- Enable auto sleep mode
41 0 -- Disable auto sleep mode
42
43 Usage:
44 # Enable auto sleep mode
45 echo 1 > /debug/btmrvl/config/psmode
46 echo 1 > /debug/btmrvl/config/pscmd
47
48 # Disable auto sleep mode
49 echo 0 > /debug/btmrvl/config/psmode
50 echo 1 > /debug/btmrvl/config/pscmd
51
52
53hsmode=[n]
54hscmd
55 These commands are used to enable host sleep or wake up firmware
56
57 where the option is:
58 1 -- Enable host sleep
59 0 -- Wake up firmware
60
61 Usage:
62 # Enable host sleep
63 echo 1 > /debug/btmrvl/config/hsmode
64 echo 1 > /debug/btmrvl/config/hscmd
65
66 # Wake up firmware
67 echo 0 > /debug/btmrvl/config/hsmode
68 echo 1 > /debug/btmrvl/config/hscmd
69
70
71======================
72Get driver status:
73
74Path: /debug/btmrvl/status/
75
76Usage:
77 cat /debug/btmrvl/status/<args>
78
79where the args are:
80
81curpsmode
82 This command displays current auto sleep status.
83
84psstate
85 This command display the power save state.
86
87hsstate
88 This command display the host sleep state.
89
90txdnldrdy
91 This command displays the value of Tx download ready flag.
92
93
94=====================
95
96Use hcitool to issue raw hci command, refer to hcitool manual
97
98 Usage: Hcitool cmd <ogf> <ocf> [Parameters]
99
100 Interface Control Command
101 hcitool cmd 0x3f 0x5b 0xf5 0x01 0x00 --Enable All interface
102 hcitool cmd 0x3f 0x5b 0xf5 0x01 0x01 --Enable Wlan interface
103 hcitool cmd 0x3f 0x5b 0xf5 0x01 0x02 --Enable BT interface
104 hcitool cmd 0x3f 0x5b 0xf5 0x00 0x00 --Disable All interface
105 hcitool cmd 0x3f 0x5b 0xf5 0x00 0x01 --Disable Wlan interface
106 hcitool cmd 0x3f 0x5b 0xf5 0x00 0x02 --Disable BT interface
107
108=======================================================================
109
110
111SD8688 firmware:
112
113/lib/firmware/sd8688_helper.bin
114/lib/firmware/sd8688.bin
115
116
117The images can be downloaded from:
118
119git.infradead.org/users/dwmw2/linux-firmware.git/libertas/
diff --git a/Documentation/cdrom/packet-writing.txt b/Documentation/cdrom/packet-writing.txt
index cf1f8126991c..1c407778c8b2 100644
--- a/Documentation/cdrom/packet-writing.txt
+++ b/Documentation/cdrom/packet-writing.txt
@@ -117,7 +117,7 @@ Using the pktcdvd debugfs interface
117 117
118To read pktcdvd device infos in human readable form, do: 118To read pktcdvd device infos in human readable form, do:
119 119
120 # cat /debug/pktcdvd/pktcdvd[0-7]/info 120 # cat /sys/kernel/debug/pktcdvd/pktcdvd[0-7]/info
121 121
122For a description of the debugfs interface look into the file: 122For a description of the debugfs interface look into the file:
123 123
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt
index f9ca389dddf4..1d7e9784439a 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -777,6 +777,18 @@ in cpuset directories:
777# /bin/echo 1-4 > cpus -> set cpus list to cpus 1,2,3,4 777# /bin/echo 1-4 > cpus -> set cpus list to cpus 1,2,3,4
778# /bin/echo 1,2,3,4 > cpus -> set cpus list to cpus 1,2,3,4 778# /bin/echo 1,2,3,4 > cpus -> set cpus list to cpus 1,2,3,4
779 779
780To add a CPU to a cpuset, write the new list of CPUs including the
781CPU to be added. To add 6 to the above cpuset:
782
783# /bin/echo 1-4,6 > cpus -> set cpus list to cpus 1,2,3,4,6
784
785Similarly to remove a CPU from a cpuset, write the new list of CPUs
786without the CPU to be removed.
787
788To remove all the CPUs:
789
790# /bin/echo "" > cpus -> clear cpus list
791
7802.3 Setting flags 7922.3 Setting flags
781----------------- 793-----------------
782 794
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 1a608877b14e..23d1262c0775 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -152,14 +152,19 @@ When swap is accounted, following files are added.
152 152
153usage of mem+swap is limited by memsw.limit_in_bytes. 153usage of mem+swap is limited by memsw.limit_in_bytes.
154 154
155Note: why 'mem+swap' rather than swap. 155* why 'mem+swap' rather than swap.
156The global LRU(kswapd) can swap out arbitrary pages. Swap-out means 156The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
157to move account from memory to swap...there is no change in usage of 157to move account from memory to swap...there is no change in usage of
158mem+swap. 158mem+swap. In other words, when we want to limit the usage of swap without
159affecting global LRU, mem+swap limit is better than just limiting swap from
160OS point of view.
159 161
160In other words, when we want to limit the usage of swap without affecting 162* What happens when a cgroup hits memory.memsw.limit_in_bytes
161global LRU, mem+swap limit is better than just limiting swap from OS point 163When a cgroup his memory.memsw.limit_in_bytes, it's useless to do swap-out
162of view. 164in this cgroup. Then, swap-out will not be done by cgroup routine and file
165caches are dropped. But as mentioned above, global LRU can do swapout memory
166from it for sanity of the system's memory management state. You can't forbid
167it by cgroup.
163 168
1642.5 Reclaim 1692.5 Reclaim
165 170
@@ -204,6 +209,7 @@ We can alter the memory limit:
204 209
205NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, 210NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
206mega or gigabytes. 211mega or gigabytes.
212NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
207 213
208# cat /cgroups/0/memory.limit_in_bytes 214# cat /cgroups/0/memory.limit_in_bytes
2094194304 2154194304
diff --git a/Documentation/connector/Makefile b/Documentation/connector/Makefile
index 8df1a7285a06..d98e4df98e24 100644
--- a/Documentation/connector/Makefile
+++ b/Documentation/connector/Makefile
@@ -9,3 +9,8 @@ hostprogs-y := ucon
9always := $(hostprogs-y) 9always := $(hostprogs-y)
10 10
11HOSTCFLAGS_ucon.o += -I$(objtree)/usr/include 11HOSTCFLAGS_ucon.o += -I$(objtree)/usr/include
12
13all: modules
14
15modules clean:
16 $(MAKE) -C ../.. SUBDIRS=$(PWD) $@
diff --git a/Documentation/connector/cn_test.c b/Documentation/connector/cn_test.c
index 6977c178729a..1711adc33373 100644
--- a/Documentation/connector/cn_test.c
+++ b/Documentation/connector/cn_test.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * cn_test.c 2 * cn_test.c
3 * 3 *
4 * 2004-2005 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru> 4 * 2004+ Copyright (c) Evgeniy Polyakov <zbr@ioremap.net>
5 * All rights reserved. 5 * All rights reserved.
6 * 6 *
7 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
@@ -19,6 +19,8 @@
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#define pr_fmt(fmt) "cn_test: " fmt
23
22#include <linux/kernel.h> 24#include <linux/kernel.h>
23#include <linux/module.h> 25#include <linux/module.h>
24#include <linux/moduleparam.h> 26#include <linux/moduleparam.h>
@@ -27,20 +29,25 @@
27 29
28#include <linux/connector.h> 30#include <linux/connector.h>
29 31
30static struct cb_id cn_test_id = { 0x123, 0x456 }; 32static struct cb_id cn_test_id = { CN_NETLINK_USERS + 3, 0x456 };
31static char cn_test_name[] = "cn_test"; 33static char cn_test_name[] = "cn_test";
32static struct sock *nls; 34static struct sock *nls;
33static struct timer_list cn_test_timer; 35static struct timer_list cn_test_timer;
34 36
35void cn_test_callback(void *data) 37static void cn_test_callback(struct cn_msg *msg)
36{ 38{
37 struct cn_msg *msg = (struct cn_msg *)data; 39 pr_info("%s: %lu: idx=%x, val=%x, seq=%u, ack=%u, len=%d: %s.\n",
38 40 __func__, jiffies, msg->id.idx, msg->id.val,
39 printk("%s: %lu: idx=%x, val=%x, seq=%u, ack=%u, len=%d: %s.\n", 41 msg->seq, msg->ack, msg->len,
40 __func__, jiffies, msg->id.idx, msg->id.val, 42 msg->len ? (char *)msg->data : "");
41 msg->seq, msg->ack, msg->len, (char *)msg->data);
42} 43}
43 44
45/*
46 * Do not remove this function even if no one is using it as
47 * this is an example of how to get notifications about new
48 * connector user registration
49 */
50#if 0
44static int cn_test_want_notify(void) 51static int cn_test_want_notify(void)
45{ 52{
46 struct cn_ctl_msg *ctl; 53 struct cn_ctl_msg *ctl;
@@ -57,9 +64,7 @@ static int cn_test_want_notify(void)
57 64
58 skb = alloc_skb(size, GFP_ATOMIC); 65 skb = alloc_skb(size, GFP_ATOMIC);
59 if (!skb) { 66 if (!skb) {
60 printk(KERN_ERR "Failed to allocate new skb with size=%u.\n", 67 pr_err("failed to allocate new skb with size=%u\n", size);
61 size);
62
63 return -ENOMEM; 68 return -ENOMEM;
64 } 69 }
65 70
@@ -108,15 +113,16 @@ static int cn_test_want_notify(void)
108 //netlink_broadcast(nls, skb, 0, ctl->group, GFP_ATOMIC); 113 //netlink_broadcast(nls, skb, 0, ctl->group, GFP_ATOMIC);
109 netlink_unicast(nls, skb, 0, 0); 114 netlink_unicast(nls, skb, 0, 0);
110 115
111 printk(KERN_INFO "Request was sent. Group=0x%x.\n", ctl->group); 116 pr_info("request was sent: group=0x%x\n", ctl->group);
112 117
113 return 0; 118 return 0;
114 119
115nlmsg_failure: 120nlmsg_failure:
116 printk(KERN_ERR "Failed to send %u.%u\n", msg->seq, msg->ack); 121 pr_err("failed to send %u.%u\n", msg->seq, msg->ack);
117 kfree_skb(skb); 122 kfree_skb(skb);
118 return -EINVAL; 123 return -EINVAL;
119} 124}
125#endif
120 126
121static u32 cn_test_timer_counter; 127static u32 cn_test_timer_counter;
122static void cn_test_timer_func(unsigned long __data) 128static void cn_test_timer_func(unsigned long __data)
@@ -124,6 +130,8 @@ static void cn_test_timer_func(unsigned long __data)
124 struct cn_msg *m; 130 struct cn_msg *m;
125 char data[32]; 131 char data[32];
126 132
133 pr_debug("%s: timer fired with data %lu\n", __func__, __data);
134
127 m = kzalloc(sizeof(*m) + sizeof(data), GFP_ATOMIC); 135 m = kzalloc(sizeof(*m) + sizeof(data), GFP_ATOMIC);
128 if (m) { 136 if (m) {
129 137
@@ -143,7 +151,7 @@ static void cn_test_timer_func(unsigned long __data)
143 151
144 cn_test_timer_counter++; 152 cn_test_timer_counter++;
145 153
146 mod_timer(&cn_test_timer, jiffies + HZ); 154 mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000));
147} 155}
148 156
149static int cn_test_init(void) 157static int cn_test_init(void)
@@ -161,8 +169,10 @@ static int cn_test_init(void)
161 } 169 }
162 170
163 setup_timer(&cn_test_timer, cn_test_timer_func, 0); 171 setup_timer(&cn_test_timer, cn_test_timer_func, 0);
164 cn_test_timer.expires = jiffies + HZ; 172 mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000));
165 add_timer(&cn_test_timer); 173
174 pr_info("initialized with id={%u.%u}\n",
175 cn_test_id.idx, cn_test_id.val);
166 176
167 return 0; 177 return 0;
168 178
@@ -187,5 +197,5 @@ module_init(cn_test_init);
187module_exit(cn_test_fini); 197module_exit(cn_test_fini);
188 198
189MODULE_LICENSE("GPL"); 199MODULE_LICENSE("GPL");
190MODULE_AUTHOR("Evgeniy Polyakov <johnpol@2ka.mipt.ru>"); 200MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
191MODULE_DESCRIPTION("Connector's test module"); 201MODULE_DESCRIPTION("Connector's test module");
diff --git a/Documentation/connector/connector.txt b/Documentation/connector/connector.txt
index ad6e0ba7b38c..81e6bf6ead57 100644
--- a/Documentation/connector/connector.txt
+++ b/Documentation/connector/connector.txt
@@ -5,10 +5,10 @@ Kernel Connector.
5Kernel connector - new netlink based userspace <-> kernel space easy 5Kernel connector - new netlink based userspace <-> kernel space easy
6to use communication module. 6to use communication module.
7 7
8Connector driver adds possibility to connect various agents using 8The Connector driver makes it easy to connect various agents using a
9netlink based network. One must register callback and 9netlink based network. One must register a callback and an identifier.
10identifier. When driver receives special netlink message with 10When the driver receives a special netlink message with the appropriate
11appropriate identifier, appropriate callback will be called. 11identifier, the appropriate callback will be called.
12 12
13From the userspace point of view it's quite straightforward: 13From the userspace point of view it's quite straightforward:
14 14
@@ -17,10 +17,10 @@ From the userspace point of view it's quite straightforward:
17 send(); 17 send();
18 recv(); 18 recv();
19 19
20But if kernelspace want to use full power of such connections, driver 20But if kernelspace wants to use the full power of such connections, the
21writer must create special sockets, must know about struct sk_buff 21driver writer must create special sockets, must know about struct sk_buff
22handling... Connector allows any kernelspace agents to use netlink 22handling, etc... The Connector driver allows any kernelspace agents to use
23based networking for inter-process communication in a significantly 23netlink based networking for inter-process communication in a significantly
24easier way: 24easier way:
25 25
26int cn_add_callback(struct cb_id *id, char *name, void (*callback) (void *)); 26int cn_add_callback(struct cb_id *id, char *name, void (*callback) (void *));
@@ -32,15 +32,15 @@ struct cb_id
32 __u32 val; 32 __u32 val;
33}; 33};
34 34
35idx and val are unique identifiers which must be registered in 35idx and val are unique identifiers which must be registered in the
36connector.h for in-kernel usage. void (*callback) (void *) - is a 36connector.h header for in-kernel usage. void (*callback) (void *) is a
37callback function which will be called when message with above idx.val 37callback function which will be called when a message with above idx.val
38will be received by connector core. Argument for that function must 38is received by the connector core. The argument for that function must
39be dereferenced to struct cn_msg *. 39be dereferenced to struct cn_msg *.
40 40
41struct cn_msg 41struct cn_msg
42{ 42{
43 struct cb_id id; 43 struct cb_id id;
44 44
45 __u32 seq; 45 __u32 seq;
46 __u32 ack; 46 __u32 ack;
@@ -55,92 +55,95 @@ Connector interfaces.
55 55
56int cn_add_callback(struct cb_id *id, char *name, void (*callback) (void *)); 56int cn_add_callback(struct cb_id *id, char *name, void (*callback) (void *));
57 57
58Registers new callback with connector core. 58 Registers new callback with connector core.
59 59
60struct cb_id *id - unique connector's user identifier. 60 struct cb_id *id - unique connector's user identifier.
61 It must be registered in connector.h for legal in-kernel users. 61 It must be registered in connector.h for legal in-kernel users.
62char *name - connector's callback symbolic name. 62 char *name - connector's callback symbolic name.
63void (*callback) (void *) - connector's callback. 63 void (*callback) (void *) - connector's callback.
64 Argument must be dereferenced to struct cn_msg *. 64 Argument must be dereferenced to struct cn_msg *.
65 65
66
66void cn_del_callback(struct cb_id *id); 67void cn_del_callback(struct cb_id *id);
67 68
68Unregisters new callback with connector core. 69 Unregisters new callback with connector core.
70
71 struct cb_id *id - unique connector's user identifier.
69 72
70struct cb_id *id - unique connector's user identifier.
71 73
72int cn_netlink_send(struct cn_msg *msg, u32 __groups, int gfp_mask); 74int cn_netlink_send(struct cn_msg *msg, u32 __groups, int gfp_mask);
73 75
74Sends message to the specified groups. It can be safely called from 76 Sends message to the specified groups. It can be safely called from
75softirq context, but may silently fail under strong memory pressure. 77 softirq context, but may silently fail under strong memory pressure.
76If there are no listeners for given group -ESRCH can be returned. 78 If there are no listeners for given group -ESRCH can be returned.
77 79
78struct cn_msg * - message header(with attached data). 80 struct cn_msg * - message header(with attached data).
79u32 __group - destination group. 81 u32 __group - destination group.
80 If __group is zero, then appropriate group will 82 If __group is zero, then appropriate group will
81 be searched through all registered connector users, 83 be searched through all registered connector users,
82 and message will be delivered to the group which was 84 and message will be delivered to the group which was
83 created for user with the same ID as in msg. 85 created for user with the same ID as in msg.
84 If __group is not zero, then message will be delivered 86 If __group is not zero, then message will be delivered
85 to the specified group. 87 to the specified group.
86int gfp_mask - GFP mask. 88 int gfp_mask - GFP mask.
87 89
88Note: When registering new callback user, connector core assigns 90 Note: When registering new callback user, connector core assigns
89netlink group to the user which is equal to it's id.idx. 91 netlink group to the user which is equal to it's id.idx.
90 92
91/*****************************************/ 93/*****************************************/
92Protocol description. 94Protocol description.
93/*****************************************/ 95/*****************************************/
94 96
95Current offers transport layer with fixed header. Recommended 97The current framework offers a transport layer with fixed headers. The
96protocol which uses such header is following: 98recommended protocol which uses such a header is as following:
97 99
98msg->seq and msg->ack are used to determine message genealogy. When 100msg->seq and msg->ack are used to determine message genealogy. When
99someone sends message it puts there locally unique sequence and random 101someone sends a message, they use a locally unique sequence and random
100acknowledge numbers. Sequence number may be copied into 102acknowledge number. The sequence number may be copied into
101nlmsghdr->nlmsg_seq too. 103nlmsghdr->nlmsg_seq too.
102 104
103Sequence number is incremented with each message to be sent. 105The sequence number is incremented with each message sent.
104 106
105If we expect reply to our message, then sequence number in received 107If you expect a reply to the message, then the sequence number in the
106message MUST be the same as in original message, and acknowledge 108received message MUST be the same as in the original message, and the
107number MUST be the same + 1. 109acknowledge number MUST be the same + 1.
108 110
109If we receive message and it's sequence number is not equal to one we 111If we receive a message and its sequence number is not equal to one we
110are expecting, then it is new message. If we receive message and it's 112are expecting, then it is a new message. If we receive a message and
111sequence number is the same as one we are expecting, but it's 113its sequence number is the same as one we are expecting, but its
112acknowledge is not equal acknowledge number in original message + 1, 114acknowledge is not equal to the acknowledge number in the original
113then it is new message. 115message + 1, then it is a new message.
114 116
115Obviously, protocol header contains above id. 117Obviously, the protocol header contains the above id.
116 118
117connector allows event notification in the following form: kernel 119The connector allows event notification in the following form: kernel
118driver or userspace process can ask connector to notify it when 120driver or userspace process can ask connector to notify it when
119selected id's will be turned on or off(registered or unregistered it's 121selected ids will be turned on or off (registered or unregistered its
120callback). It is done by sending special command to connector 122callback). It is done by sending a special command to the connector
121driver(it also registers itself with id={-1, -1}). 123driver (it also registers itself with id={-1, -1}).
122 124
123As example of usage Documentation/connector now contains cn_test.c - 125As example of this usage can be found in the cn_test.c module which
124testing module which uses connector to request notification and to 126uses the connector to request notification and to send messages.
125send messages.
126 127
127/*****************************************/ 128/*****************************************/
128Reliability. 129Reliability.
129/*****************************************/ 130/*****************************************/
130 131
131Netlink itself is not reliable protocol, that means that messages can 132Netlink itself is not a reliable protocol. That means that messages can
132be lost due to memory pressure or process' receiving queue overflowed, 133be lost due to memory pressure or process' receiving queue overflowed,
133so caller is warned must be prepared. That is why struct cn_msg [main 134so caller is warned that it must be prepared. That is why the struct
134connector's message header] contains u32 seq and u32 ack fields. 135cn_msg [main connector's message header] contains u32 seq and u32 ack
136fields.
135 137
136/*****************************************/ 138/*****************************************/
137Userspace usage. 139Userspace usage.
138/*****************************************/ 140/*****************************************/
141
1392.6.14 has a new netlink socket implementation, which by default does not 1422.6.14 has a new netlink socket implementation, which by default does not
140allow to send data to netlink groups other than 1. 143allow people to send data to netlink groups other than 1.
141So, if to use netlink socket (for example using connector) 144So, if you wish to use a netlink socket (for example using connector)
142with different group number userspace application must subscribe to 145with a different group number, the userspace application must subscribe to
143that group. It can be achieved by following pseudocode: 146that group first. It can be achieved by the following pseudocode:
144 147
145s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR); 148s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
146 149
@@ -160,8 +163,8 @@ if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) {
160} 163}
161 164
162Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket 165Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket
163option. To drop multicast subscription one should call above socket option 166option. To drop a multicast subscription, one should call the above socket
164with NETLINK_DROP_MEMBERSHIP parameter which is defined as 0. 167option with the NETLINK_DROP_MEMBERSHIP parameter which is defined as 0.
165 168
1662.6.14 netlink code only allows to select a group which is less or equal to 1692.6.14 netlink code only allows to select a group which is less or equal to
167the maximum group number, which is used at netlink_kernel_create() time. 170the maximum group number, which is used at netlink_kernel_create() time.
diff --git a/Documentation/connector/ucon.c b/Documentation/connector/ucon.c
index d738cde2a8d5..4848db8c71ff 100644
--- a/Documentation/connector/ucon.c
+++ b/Documentation/connector/ucon.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * ucon.c 2 * ucon.c
3 * 3 *
4 * Copyright (c) 2004+ Evgeniy Polyakov <johnpol@2ka.mipt.ru> 4 * Copyright (c) 2004+ Evgeniy Polyakov <zbr@ioremap.net>
5 * 5 *
6 * 6 *
7 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
@@ -30,18 +30,24 @@
30 30
31#include <arpa/inet.h> 31#include <arpa/inet.h>
32 32
33#include <stdbool.h>
33#include <stdio.h> 34#include <stdio.h>
34#include <stdlib.h> 35#include <stdlib.h>
35#include <unistd.h> 36#include <unistd.h>
36#include <string.h> 37#include <string.h>
37#include <errno.h> 38#include <errno.h>
38#include <time.h> 39#include <time.h>
40#include <getopt.h>
39 41
40#include <linux/connector.h> 42#include <linux/connector.h>
41 43
42#define DEBUG 44#define DEBUG
43#define NETLINK_CONNECTOR 11 45#define NETLINK_CONNECTOR 11
44 46
47/* Hopefully your userspace connector.h matches this kernel */
48#define CN_TEST_IDX CN_NETLINK_USERS + 3
49#define CN_TEST_VAL 0x456
50
45#ifdef DEBUG 51#ifdef DEBUG
46#define ulog(f, a...) fprintf(stdout, f, ##a) 52#define ulog(f, a...) fprintf(stdout, f, ##a)
47#else 53#else
@@ -83,6 +89,25 @@ static int netlink_send(int s, struct cn_msg *msg)
83 return err; 89 return err;
84} 90}
85 91
92static void usage(void)
93{
94 printf(
95 "Usage: ucon [options] [output file]\n"
96 "\n"
97 "\t-h\tthis help screen\n"
98 "\t-s\tsend buffers to the test module\n"
99 "\n"
100 "The default behavior of ucon is to subscribe to the test module\n"
101 "and wait for state messages. Any ones received are dumped to the\n"
102 "specified output file (or stdout). The test module is assumed to\n"
103 "have an id of {%u.%u}\n"
104 "\n"
105 "If you get no output, then verify the cn_test module id matches\n"
106 "the expected id above.\n"
107 , CN_TEST_IDX, CN_TEST_VAL
108 );
109}
110
86int main(int argc, char *argv[]) 111int main(int argc, char *argv[])
87{ 112{
88 int s; 113 int s;
@@ -94,17 +119,34 @@ int main(int argc, char *argv[])
94 FILE *out; 119 FILE *out;
95 time_t tm; 120 time_t tm;
96 struct pollfd pfd; 121 struct pollfd pfd;
122 bool send_msgs = false;
97 123
98 if (argc < 2) 124 while ((s = getopt(argc, argv, "hs")) != -1) {
99 out = stdout; 125 switch (s) {
100 else { 126 case 's':
101 out = fopen(argv[1], "a+"); 127 send_msgs = true;
128 break;
129
130 case 'h':
131 usage();
132 return 0;
133
134 default:
135 /* getopt() outputs an error for us */
136 usage();
137 return 1;
138 }
139 }
140
141 if (argc != optind) {
142 out = fopen(argv[optind], "a+");
102 if (!out) { 143 if (!out) {
103 ulog("Unable to open %s for writing: %s\n", 144 ulog("Unable to open %s for writing: %s\n",
104 argv[1], strerror(errno)); 145 argv[1], strerror(errno));
105 out = stdout; 146 out = stdout;
106 } 147 }
107 } 148 } else
149 out = stdout;
108 150
109 memset(buf, 0, sizeof(buf)); 151 memset(buf, 0, sizeof(buf));
110 152
@@ -115,9 +157,11 @@ int main(int argc, char *argv[])
115 } 157 }
116 158
117 l_local.nl_family = AF_NETLINK; 159 l_local.nl_family = AF_NETLINK;
118 l_local.nl_groups = 0x123; /* bitmask of requested groups */ 160 l_local.nl_groups = -1; /* bitmask of requested groups */
119 l_local.nl_pid = 0; 161 l_local.nl_pid = 0;
120 162
163 ulog("subscribing to %u.%u\n", CN_TEST_IDX, CN_TEST_VAL);
164
121 if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) { 165 if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) {
122 perror("bind"); 166 perror("bind");
123 close(s); 167 close(s);
@@ -130,15 +174,15 @@ int main(int argc, char *argv[])
130 setsockopt(s, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &on, sizeof(on)); 174 setsockopt(s, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &on, sizeof(on));
131 } 175 }
132#endif 176#endif
133 if (0) { 177 if (send_msgs) {
134 int i, j; 178 int i, j;
135 179
136 memset(buf, 0, sizeof(buf)); 180 memset(buf, 0, sizeof(buf));
137 181
138 data = (struct cn_msg *)buf; 182 data = (struct cn_msg *)buf;
139 183
140 data->id.idx = 0x123; 184 data->id.idx = CN_TEST_IDX;
141 data->id.val = 0x456; 185 data->id.val = CN_TEST_VAL;
142 data->seq = seq++; 186 data->seq = seq++;
143 data->ack = 0; 187 data->ack = 0;
144 data->len = 0; 188 data->len = 0;
diff --git a/Documentation/cpu-freq/cpu-drivers.txt b/Documentation/cpu-freq/cpu-drivers.txt
index 43c743903dd7..75a58d14d3cf 100644
--- a/Documentation/cpu-freq/cpu-drivers.txt
+++ b/Documentation/cpu-freq/cpu-drivers.txt
@@ -155,7 +155,7 @@ actual frequency must be determined using the following rules:
155- if relation==CPUFREQ_REL_H, try to select a new_freq lower than or equal 155- if relation==CPUFREQ_REL_H, try to select a new_freq lower than or equal
156 target_freq. ("H for highest, but no higher than") 156 target_freq. ("H for highest, but no higher than")
157 157
158Here again the frequency table helper might assist you - see section 3 158Here again the frequency table helper might assist you - see section 2
159for details. 159for details.
160 160
161 161
diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt
index ce73f3eb5ddb..aed082f49d09 100644
--- a/Documentation/cpu-freq/governors.txt
+++ b/Documentation/cpu-freq/governors.txt
@@ -119,10 +119,6 @@ want the kernel to look at the CPU usage and to make decisions on
119what to do about the frequency. Typically this is set to values of 119what to do about the frequency. Typically this is set to values of
120around '10000' or more. It's default value is (cmp. with users-guide.txt): 120around '10000' or more. It's default value is (cmp. with users-guide.txt):
121transition_latency * 1000 121transition_latency * 1000
122The lowest value you can set is:
123transition_latency * 100 or it may get restricted to a value where it
124makes not sense for the kernel anymore to poll that often which depends
125on your HZ config variable (HZ=1000: max=20000us, HZ=250: max=5000).
126Be aware that transition latency is in ns and sampling_rate is in us, so you 122Be aware that transition latency is in ns and sampling_rate is in us, so you
127get the same sysfs value by default. 123get the same sysfs value by default.
128Sampling rate should always get adjusted considering the transition latency 124Sampling rate should always get adjusted considering the transition latency
@@ -131,14 +127,20 @@ in the bash (as said, 1000 is default), do:
131echo `$(($(cat cpuinfo_transition_latency) * 750 / 1000)) \ 127echo `$(($(cat cpuinfo_transition_latency) * 750 / 1000)) \
132 >ondemand/sampling_rate 128 >ondemand/sampling_rate
133 129
134show_sampling_rate_(min|max): THIS INTERFACE IS DEPRECATED, DON'T USE IT. 130show_sampling_rate_min:
135You can use wider ranges now and the general 131The sampling rate is limited by the HW transition latency:
136cpuinfo_transition_latency variable (cmp. with user-guide.txt) can be 132transition_latency * 100
137used to obtain exactly the same info: 133Or by kernel restrictions:
138show_sampling_rate_min = transtition_latency * 500 / 1000 134If CONFIG_NO_HZ is set, the limit is 10ms fixed.
139show_sampling_rate_max = transtition_latency * 500000 / 1000 135If CONFIG_NO_HZ is not set or no_hz=off boot parameter is used, the
140(divided by 1000 is to illustrate that sampling rate is in us and 136limits depend on the CONFIG_HZ option:
141transition latency is exported ns). 137HZ=1000: min=20000us (20ms)
138HZ=250: min=80000us (80ms)
139HZ=100: min=200000us (200ms)
140The highest value of kernel and HW latency restrictions is shown and
141used as the minimum sampling rate.
142
143show_sampling_rate_max: THIS INTERFACE IS DEPRECATED, DON'T USE IT.
142 144
143up_threshold: defines what the average CPU usage between the samplings 145up_threshold: defines what the average CPU usage between the samplings
144of 'sampling_rate' needs to be for the kernel to make a decision on 146of 'sampling_rate' needs to be for the kernel to make a decision on
diff --git a/Documentation/cpu-freq/user-guide.txt b/Documentation/cpu-freq/user-guide.txt
index 75f41193f3e1..5d5f5fadd1c2 100644
--- a/Documentation/cpu-freq/user-guide.txt
+++ b/Documentation/cpu-freq/user-guide.txt
@@ -31,7 +31,6 @@ Contents:
31 31
323. How to change the CPU cpufreq policy and/or speed 323. How to change the CPU cpufreq policy and/or speed
333.1 Preferred interface: sysfs 333.1 Preferred interface: sysfs
343.2 Deprecated interfaces
35 34
36 35
37 36
diff --git a/Documentation/device-mapper/dm-log.txt b/Documentation/device-mapper/dm-log.txt
new file mode 100644
index 000000000000..994dd75475a6
--- /dev/null
+++ b/Documentation/device-mapper/dm-log.txt
@@ -0,0 +1,54 @@
1Device-Mapper Logging
2=====================
3The device-mapper logging code is used by some of the device-mapper
4RAID targets to track regions of the disk that are not consistent.
5A region (or portion of the address space) of the disk may be
6inconsistent because a RAID stripe is currently being operated on or
7a machine died while the region was being altered. In the case of
8mirrors, a region would be considered dirty/inconsistent while you
9are writing to it because the writes need to be replicated for all
10the legs of the mirror and may not reach the legs at the same time.
11Once all writes are complete, the region is considered clean again.
12
13There is a generic logging interface that the device-mapper RAID
14implementations use to perform logging operations (see
15dm_dirty_log_type in include/linux/dm-dirty-log.h). Various different
16logging implementations are available and provide different
17capabilities. The list includes:
18
19Type Files
20==== =====
21disk drivers/md/dm-log.c
22core drivers/md/dm-log.c
23userspace drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h
24
25The "disk" log type
26-------------------
27This log implementation commits the log state to disk. This way, the
28logging state survives reboots/crashes.
29
30The "core" log type
31-------------------
32This log implementation keeps the log state in memory. The log state
33will not survive a reboot or crash, but there may be a small boost in
34performance. This method can also be used if no storage device is
35available for storing log state.
36
37The "userspace" log type
38------------------------
39This log type simply provides a way to export the log API to userspace,
40so log implementations can be done there. This is done by forwarding most
41logging requests to userspace, where a daemon receives and processes the
42request.
43
44The structure used for communication between kernel and userspace are
45located in include/linux/dm-log-userspace.h. Due to the frequency,
46diversity, and 2-way communication nature of the exchanges between
47kernel and userspace, 'connector' is used as the interface for
48communication.
49
50There are currently two userspace log implementations that leverage this
51framework - "clustered_disk" and "clustered_core". These implementations
52provide a cluster-coherent log for shared-storage. Device-mapper mirroring
53can be used in a shared-storage environment when the cluster log implementations
54are employed.
diff --git a/Documentation/device-mapper/dm-queue-length.txt b/Documentation/device-mapper/dm-queue-length.txt
new file mode 100644
index 000000000000..f4db2562175c
--- /dev/null
+++ b/Documentation/device-mapper/dm-queue-length.txt
@@ -0,0 +1,39 @@
1dm-queue-length
2===============
3
4dm-queue-length is a path selector module for device-mapper targets,
5which selects a path with the least number of in-flight I/Os.
6The path selector name is 'queue-length'.
7
8Table parameters for each path: [<repeat_count>]
9 <repeat_count>: The number of I/Os to dispatch using the selected
10 path before switching to the next path.
11 If not given, internal default is used. To check
12 the default value, see the activated table.
13
14Status for each path: <status> <fail-count> <in-flight>
15 <status>: 'A' if the path is active, 'F' if the path is failed.
16 <fail-count>: The number of path failures.
17 <in-flight>: The number of in-flight I/Os on the path.
18
19
20Algorithm
21=========
22
23dm-queue-length increments/decrements 'in-flight' when an I/O is
24dispatched/completed respectively.
25dm-queue-length selects a path with the minimum 'in-flight'.
26
27
28Examples
29========
30In case that 2 paths (sda and sdb) are used with repeat_count == 128.
31
32# echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \
33 dmsetup create test
34#
35# dmsetup table
36test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128
37#
38# dmsetup status
39test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0
diff --git a/Documentation/device-mapper/dm-service-time.txt b/Documentation/device-mapper/dm-service-time.txt
new file mode 100644
index 000000000000..7d00668e97bb
--- /dev/null
+++ b/Documentation/device-mapper/dm-service-time.txt
@@ -0,0 +1,91 @@
1dm-service-time
2===============
3
4dm-service-time is a path selector module for device-mapper targets,
5which selects a path with the shortest estimated service time for
6the incoming I/O.
7
8The service time for each path is estimated by dividing the total size
9of in-flight I/Os on a path with the performance value of the path.
10The performance value is a relative throughput value among all paths
11in a path-group, and it can be specified as a table argument.
12
13The path selector name is 'service-time'.
14
15Table parameters for each path: [<repeat_count> [<relative_throughput>]]
16 <repeat_count>: The number of I/Os to dispatch using the selected
17 path before switching to the next path.
18 If not given, internal default is used. To check
19 the default value, see the activated table.
20 <relative_throughput>: The relative throughput value of the path
21 among all paths in the path-group.
22 The valid range is 0-100.
23 If not given, minimum value '1' is used.
24 If '0' is given, the path isn't selected while
25 other paths having a positive value are available.
26
27Status for each path: <status> <fail-count> <in-flight-size> \
28 <relative_throughput>
29 <status>: 'A' if the path is active, 'F' if the path is failed.
30 <fail-count>: The number of path failures.
31 <in-flight-size>: The size of in-flight I/Os on the path.
32 <relative_throughput>: The relative throughput value of the path
33 among all paths in the path-group.
34
35
36Algorithm
37=========
38
39dm-service-time adds the I/O size to 'in-flight-size' when the I/O is
40dispatched and substracts when completed.
41Basically, dm-service-time selects a path having minimum service time
42which is calculated by:
43
44 ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput'
45
46However, some optimizations below are used to reduce the calculation
47as much as possible.
48
49 1. If the paths have the same 'relative_throughput', skip
50 the division and just compare the 'in-flight-size'.
51
52 2. If the paths have the same 'in-flight-size', skip the division
53 and just compare the 'relative_throughput'.
54
55 3. If some paths have non-zero 'relative_throughput' and others
56 have zero 'relative_throughput', ignore those paths with zero
57 'relative_throughput'.
58
59If such optimizations can't be applied, calculate service time, and
60compare service time.
61If calculated service time is equal, the path having maximum
62'relative_throughput' may be better. So compare 'relative_throughput'
63then.
64
65
66Examples
67========
68In case that 2 paths (sda and sdb) are used with repeat_count == 128
69and sda has an average throughput 1GB/s and sdb has 4GB/s,
70'relative_throughput' value may be '1' for sda and '4' for sdb.
71
72# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \
73 dmsetup create test
74#
75# dmsetup table
76test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4
77#
78# dmsetup status
79test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4
80
81
82Or '2' for sda and '8' for sdb would be also true.
83
84# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \
85 dmsetup create test
86#
87# dmsetup table
88test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8
89#
90# dmsetup status
91test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 88519daab6e9..e1efc400bed6 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -152,7 +152,6 @@ piggy.gz
152piggyback 152piggyback
153pnmtologo 153pnmtologo
154ppc_defs.h* 154ppc_defs.h*
155promcon_tbl.c
156pss_boot.h 155pss_boot.h
157qconf 156qconf
158raid6altivec*.c 157raid6altivec*.c
diff --git a/Documentation/driver-model/device.txt b/Documentation/driver-model/device.txt
index a7cbfff40d07..a124f3126b0d 100644
--- a/Documentation/driver-model/device.txt
+++ b/Documentation/driver-model/device.txt
@@ -162,3 +162,35 @@ device_remove_file(dev,&dev_attr_power);
162 162
163The file name will be 'power' with a mode of 0644 (-rw-r--r--). 163The file name will be 'power' with a mode of 0644 (-rw-r--r--).
164 164
165Word of warning: While the kernel allows device_create_file() and
166device_remove_file() to be called on a device at any time, userspace has
167strict expectations on when attributes get created. When a new device is
168registered in the kernel, a uevent is generated to notify userspace (like
169udev) that a new device is available. If attributes are added after the
170device is registered, then userspace won't get notified and userspace will
171not know about the new attributes.
172
173This is important for device driver that need to publish additional
174attributes for a device at driver probe time. If the device driver simply
175calls device_create_file() on the device structure passed to it, then
176userspace will never be notified of the new attributes. Instead, it should
177probably use class_create() and class->dev_attrs to set up a list of
178desired attributes in the modules_init function, and then in the .probe()
179hook, and then use device_create() to create a new device as a child
180of the probed device. The new device will generate a new uevent and
181properly advertise the new attributes to userspace.
182
183For example, if a driver wanted to add the following attributes:
184struct device_attribute mydriver_attribs[] = {
185 __ATTR(port_count, 0444, port_count_show),
186 __ATTR(serial_number, 0444, serial_number_show),
187 NULL
188};
189
190Then in the module init function is would do:
191 mydriver_class = class_create(THIS_MODULE, "my_attrs");
192 mydriver_class.dev_attr = mydriver_attribs;
193
194And assuming 'dev' is the struct device passed into the probe hook, the driver
195probe function would do something like:
196 create_device(&mydriver_class, dev, chrdev, &private_data, "my_name");
diff --git a/Documentation/driver-model/driver.txt b/Documentation/driver-model/driver.txt
index 82132169d47a..60120fb3b961 100644
--- a/Documentation/driver-model/driver.txt
+++ b/Documentation/driver-model/driver.txt
@@ -207,8 +207,8 @@ Attributes
207~~~~~~~~~~ 207~~~~~~~~~~
208struct driver_attribute { 208struct driver_attribute {
209 struct attribute attr; 209 struct attribute attr;
210 ssize_t (*show)(struct device_driver *, char * buf, size_t count, loff_t off); 210 ssize_t (*show)(struct device_driver *driver, char *buf);
211 ssize_t (*store)(struct device_driver *, const char * buf, size_t count, loff_t off); 211 ssize_t (*store)(struct device_driver *, const char * buf, size_t count);
212}; 212};
213 213
214Device drivers can export attributes via their sysfs directories. 214Device drivers can export attributes via their sysfs directories.
diff --git a/Documentation/dvb/get_dvb_firmware b/Documentation/dvb/get_dvb_firmware
index 2f21ecd4c205..3d1b0ab70c8e 100644
--- a/Documentation/dvb/get_dvb_firmware
+++ b/Documentation/dvb/get_dvb_firmware
@@ -25,7 +25,7 @@ use IO::Handle;
25 "tda10046lifeview", "av7110", "dec2000t", "dec2540t", 25 "tda10046lifeview", "av7110", "dec2000t", "dec2540t",
26 "dec3000s", "vp7041", "dibusb", "nxt2002", "nxt2004", 26 "dec3000s", "vp7041", "dibusb", "nxt2002", "nxt2004",
27 "or51211", "or51132_qam", "or51132_vsb", "bluebird", 27 "or51211", "or51132_qam", "or51132_vsb", "bluebird",
28 "opera1", "cx231xx", "cx18", "cx23885", "pvrusb2" ); 28 "opera1", "cx231xx", "cx18", "cx23885", "pvrusb2", "mpc718" );
29 29
30# Check args 30# Check args
31syntax() if (scalar(@ARGV) != 1); 31syntax() if (scalar(@ARGV) != 1);
@@ -112,7 +112,7 @@ sub tda10045 {
112 112
113sub tda10046 { 113sub tda10046 {
114 my $sourcefile = "TT_PCI_2.19h_28_11_2006.zip"; 114 my $sourcefile = "TT_PCI_2.19h_28_11_2006.zip";
115 my $url = "http://technotrend-online.com/download/software/219/$sourcefile"; 115 my $url = "http://www.tt-download.com/download/updates/219/$sourcefile";
116 my $hash = "6a7e1e2f2644b162ff0502367553c72d"; 116 my $hash = "6a7e1e2f2644b162ff0502367553c72d";
117 my $outfile = "dvb-fe-tda10046.fw"; 117 my $outfile = "dvb-fe-tda10046.fw";
118 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1); 118 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
@@ -129,8 +129,8 @@ sub tda10046 {
129} 129}
130 130
131sub tda10046lifeview { 131sub tda10046lifeview {
132 my $sourcefile = "Drv_2.11.02.zip"; 132 my $sourcefile = "7%5Cdrv_2.11.02.zip";
133 my $url = "http://www.lifeview.com.tw/drivers/pci_card/FlyDVB-T/$sourcefile"; 133 my $url = "http://www.lifeview.hk/dbimages/document/$sourcefile";
134 my $hash = "1ea24dee4eea8fe971686981f34fd2e0"; 134 my $hash = "1ea24dee4eea8fe971686981f34fd2e0";
135 my $outfile = "dvb-fe-tda10046.fw"; 135 my $outfile = "dvb-fe-tda10046.fw";
136 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1); 136 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
@@ -317,7 +317,7 @@ sub nxt2002 {
317 317
318sub nxt2004 { 318sub nxt2004 {
319 my $sourcefile = "AVerTVHD_MCE_A180_Drv_v1.2.2.16.zip"; 319 my $sourcefile = "AVerTVHD_MCE_A180_Drv_v1.2.2.16.zip";
320 my $url = "http://www.aver.com/support/Drivers/$sourcefile"; 320 my $url = "http://www.avermedia-usa.com/support/Drivers/$sourcefile";
321 my $hash = "111cb885b1e009188346d72acfed024c"; 321 my $hash = "111cb885b1e009188346d72acfed024c";
322 my $outfile = "dvb-fe-nxt2004.fw"; 322 my $outfile = "dvb-fe-nxt2004.fw";
323 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1); 323 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
@@ -381,6 +381,57 @@ sub cx18 {
381 $allfiles; 381 $allfiles;
382} 382}
383 383
384sub mpc718 {
385 my $archive = 'Yuan MPC718 TV Tuner Card 2.13.10.1016.zip';
386 my $url = "ftp://ftp.work.acer-euro.com/desktop/aspire_idea510/vista/Drivers/$archive";
387 my $fwfile = "dvb-cx18-mpc718-mt352.fw";
388 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
389
390 checkstandard();
391 wgetfile($archive, $url);
392 unzip($archive, $tmpdir);
393
394 my $sourcefile = "$tmpdir/Yuan MPC718 TV Tuner Card 2.13.10.1016/mpc718_32bit/yuanrap.sys";
395 my $found = 0;
396
397 open IN, '<', $sourcefile or die "Couldn't open $sourcefile to extract $fwfile data\n";
398 binmode IN;
399 open OUT, '>', $fwfile;
400 binmode OUT;
401 {
402 # Block scope because we change the line terminator variable $/
403 my $prevlen = 0;
404 my $currlen;
405
406 # Buried in the data segment are 3 runs of almost identical
407 # register-value pairs that end in 0x5d 0x01 which is a "TUNER GO"
408 # command for the MT352.
409 # Pull out the middle run (because it's easy) of register-value
410 # pairs to make the "firmware" file.
411
412 local $/ = "\x5d\x01"; # MT352 "TUNER GO"
413
414 while (<IN>) {
415 $currlen = length($_);
416 if ($prevlen == $currlen && $currlen <= 64) {
417 chop; chop; # Get rid of "TUNER GO"
418 s/^\0\0//; # get rid of leading 00 00 if it's there
419 printf OUT "$_";
420 $found = 1;
421 last;
422 }
423 $prevlen = $currlen;
424 }
425 }
426 close OUT;
427 close IN;
428 if (!$found) {
429 unlink $fwfile;
430 die "Couldn't find valid register-value sequence in $sourcefile for $fwfile\n";
431 }
432 $fwfile;
433}
434
384sub cx23885 { 435sub cx23885 {
385 my $url = "http://linuxtv.org/downloads/firmware/"; 436 my $url = "http://linuxtv.org/downloads/firmware/";
386 437
diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt
index 4bc374a14345..079305640790 100644
--- a/Documentation/fault-injection/fault-injection.txt
+++ b/Documentation/fault-injection/fault-injection.txt
@@ -29,16 +29,16 @@ o debugfs entries
29fault-inject-debugfs kernel module provides some debugfs entries for runtime 29fault-inject-debugfs kernel module provides some debugfs entries for runtime
30configuration of fault-injection capabilities. 30configuration of fault-injection capabilities.
31 31
32- /debug/fail*/probability: 32- /sys/kernel/debug/fail*/probability:
33 33
34 likelihood of failure injection, in percent. 34 likelihood of failure injection, in percent.
35 Format: <percent> 35 Format: <percent>
36 36
37 Note that one-failure-per-hundred is a very high error rate 37 Note that one-failure-per-hundred is a very high error rate
38 for some testcases. Consider setting probability=100 and configure 38 for some testcases. Consider setting probability=100 and configure
39 /debug/fail*/interval for such testcases. 39 /sys/kernel/debug/fail*/interval for such testcases.
40 40
41- /debug/fail*/interval: 41- /sys/kernel/debug/fail*/interval:
42 42
43 specifies the interval between failures, for calls to 43 specifies the interval between failures, for calls to
44 should_fail() that pass all the other tests. 44 should_fail() that pass all the other tests.
@@ -46,18 +46,18 @@ configuration of fault-injection capabilities.
46 Note that if you enable this, by setting interval>1, you will 46 Note that if you enable this, by setting interval>1, you will
47 probably want to set probability=100. 47 probably want to set probability=100.
48 48
49- /debug/fail*/times: 49- /sys/kernel/debug/fail*/times:
50 50
51 specifies how many times failures may happen at most. 51 specifies how many times failures may happen at most.
52 A value of -1 means "no limit". 52 A value of -1 means "no limit".
53 53
54- /debug/fail*/space: 54- /sys/kernel/debug/fail*/space:
55 55
56 specifies an initial resource "budget", decremented by "size" 56 specifies an initial resource "budget", decremented by "size"
57 on each call to should_fail(,size). Failure injection is 57 on each call to should_fail(,size). Failure injection is
58 suppressed until "space" reaches zero. 58 suppressed until "space" reaches zero.
59 59
60- /debug/fail*/verbose 60- /sys/kernel/debug/fail*/verbose
61 61
62 Format: { 0 | 1 | 2 } 62 Format: { 0 | 1 | 2 }
63 specifies the verbosity of the messages when failure is 63 specifies the verbosity of the messages when failure is
@@ -65,17 +65,17 @@ configuration of fault-injection capabilities.
65 log line per failure; '2' will print a call trace too -- useful 65 log line per failure; '2' will print a call trace too -- useful
66 to debug the problems revealed by fault injection. 66 to debug the problems revealed by fault injection.
67 67
68- /debug/fail*/task-filter: 68- /sys/kernel/debug/fail*/task-filter:
69 69
70 Format: { 'Y' | 'N' } 70 Format: { 'Y' | 'N' }
71 A value of 'N' disables filtering by process (default). 71 A value of 'N' disables filtering by process (default).
72 Any positive value limits failures to only processes indicated by 72 Any positive value limits failures to only processes indicated by
73 /proc/<pid>/make-it-fail==1. 73 /proc/<pid>/make-it-fail==1.
74 74
75- /debug/fail*/require-start: 75- /sys/kernel/debug/fail*/require-start:
76- /debug/fail*/require-end: 76- /sys/kernel/debug/fail*/require-end:
77- /debug/fail*/reject-start: 77- /sys/kernel/debug/fail*/reject-start:
78- /debug/fail*/reject-end: 78- /sys/kernel/debug/fail*/reject-end:
79 79
80 specifies the range of virtual addresses tested during 80 specifies the range of virtual addresses tested during
81 stacktrace walking. Failure is injected only if some caller 81 stacktrace walking. Failure is injected only if some caller
@@ -84,26 +84,26 @@ configuration of fault-injection capabilities.
84 Default required range is [0,ULONG_MAX) (whole of virtual address space). 84 Default required range is [0,ULONG_MAX) (whole of virtual address space).
85 Default rejected range is [0,0). 85 Default rejected range is [0,0).
86 86
87- /debug/fail*/stacktrace-depth: 87- /sys/kernel/debug/fail*/stacktrace-depth:
88 88
89 specifies the maximum stacktrace depth walked during search 89 specifies the maximum stacktrace depth walked during search
90 for a caller within [require-start,require-end) OR 90 for a caller within [require-start,require-end) OR
91 [reject-start,reject-end). 91 [reject-start,reject-end).
92 92
93- /debug/fail_page_alloc/ignore-gfp-highmem: 93- /sys/kernel/debug/fail_page_alloc/ignore-gfp-highmem:
94 94
95 Format: { 'Y' | 'N' } 95 Format: { 'Y' | 'N' }
96 default is 'N', setting it to 'Y' won't inject failures into 96 default is 'N', setting it to 'Y' won't inject failures into
97 highmem/user allocations. 97 highmem/user allocations.
98 98
99- /debug/failslab/ignore-gfp-wait: 99- /sys/kernel/debug/failslab/ignore-gfp-wait:
100- /debug/fail_page_alloc/ignore-gfp-wait: 100- /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait:
101 101
102 Format: { 'Y' | 'N' } 102 Format: { 'Y' | 'N' }
103 default is 'N', setting it to 'Y' will inject failures 103 default is 'N', setting it to 'Y' will inject failures
104 only into non-sleep allocations (GFP_ATOMIC allocations). 104 only into non-sleep allocations (GFP_ATOMIC allocations).
105 105
106- /debug/fail_page_alloc/min-order: 106- /sys/kernel/debug/fail_page_alloc/min-order:
107 107
108 specifies the minimum page allocation order to be injected 108 specifies the minimum page allocation order to be injected
109 failures. 109 failures.
@@ -166,13 +166,13 @@ o Inject slab allocation failures into module init/exit code
166#!/bin/bash 166#!/bin/bash
167 167
168FAILTYPE=failslab 168FAILTYPE=failslab
169echo Y > /debug/$FAILTYPE/task-filter 169echo Y > /sys/kernel/debug/$FAILTYPE/task-filter
170echo 10 > /debug/$FAILTYPE/probability 170echo 10 > /sys/kernel/debug/$FAILTYPE/probability
171echo 100 > /debug/$FAILTYPE/interval 171echo 100 > /sys/kernel/debug/$FAILTYPE/interval
172echo -1 > /debug/$FAILTYPE/times 172echo -1 > /sys/kernel/debug/$FAILTYPE/times
173echo 0 > /debug/$FAILTYPE/space 173echo 0 > /sys/kernel/debug/$FAILTYPE/space
174echo 2 > /debug/$FAILTYPE/verbose 174echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
175echo 1 > /debug/$FAILTYPE/ignore-gfp-wait 175echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
176 176
177faulty_system() 177faulty_system()
178{ 178{
@@ -217,20 +217,20 @@ then
217 exit 1 217 exit 1
218fi 218fi
219 219
220cat /sys/module/$module/sections/.text > /debug/$FAILTYPE/require-start 220cat /sys/module/$module/sections/.text > /sys/kernel/debug/$FAILTYPE/require-start
221cat /sys/module/$module/sections/.data > /debug/$FAILTYPE/require-end 221cat /sys/module/$module/sections/.data > /sys/kernel/debug/$FAILTYPE/require-end
222 222
223echo N > /debug/$FAILTYPE/task-filter 223echo N > /sys/kernel/debug/$FAILTYPE/task-filter
224echo 10 > /debug/$FAILTYPE/probability 224echo 10 > /sys/kernel/debug/$FAILTYPE/probability
225echo 100 > /debug/$FAILTYPE/interval 225echo 100 > /sys/kernel/debug/$FAILTYPE/interval
226echo -1 > /debug/$FAILTYPE/times 226echo -1 > /sys/kernel/debug/$FAILTYPE/times
227echo 0 > /debug/$FAILTYPE/space 227echo 0 > /sys/kernel/debug/$FAILTYPE/space
228echo 2 > /debug/$FAILTYPE/verbose 228echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
229echo 1 > /debug/$FAILTYPE/ignore-gfp-wait 229echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
230echo 1 > /debug/$FAILTYPE/ignore-gfp-highmem 230echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-highmem
231echo 10 > /debug/$FAILTYPE/stacktrace-depth 231echo 10 > /sys/kernel/debug/$FAILTYPE/stacktrace-depth
232 232
233trap "echo 0 > /debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT 233trap "echo 0 > /sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT
234 234
235echo "Injecting errors into the module $module... (interrupt to stop)" 235echo "Injecting errors into the module $module... (interrupt to stop)"
236sleep 1000000 236sleep 1000000
diff --git a/Documentation/fb/vesafb.txt b/Documentation/fb/vesafb.txt
index ee277dd204b0..950d5a658cb3 100644
--- a/Documentation/fb/vesafb.txt
+++ b/Documentation/fb/vesafb.txt
@@ -95,7 +95,7 @@ There is no way to change the vesafb video mode and/or timings after
95booting linux. If you are not happy with the 60 Hz refresh rate, you 95booting linux. If you are not happy with the 60 Hz refresh rate, you
96have these options: 96have these options:
97 97
98 * configure and load the DOS-Tools for your the graphics board (if 98 * configure and load the DOS-Tools for the graphics board (if
99 available) and boot linux with loadlin. 99 available) and boot linux with loadlin.
100 * use a native driver (matroxfb/atyfb) instead if vesafb. If none 100 * use a native driver (matroxfb/atyfb) instead if vesafb. If none
101 is available, write a new one! 101 is available, write a new one!
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index edb2f0b07616..fa75220f8d34 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,6 +6,49 @@ be removed from this file.
6 6
7--------------------------- 7---------------------------
8 8
9What: PRISM54
10When: 2.6.34
11
12Why: prism54 FullMAC PCI / Cardbus devices used to be supported only by the
13 prism54 wireless driver. After Intersil stopped selling these
14 devices in preference for the newer more flexible SoftMAC devices
15 a SoftMAC device driver was required and prism54 did not support
16 them. The p54pci driver now exists and has been present in the kernel for
17 a while. This driver supports both SoftMAC devices and FullMAC devices.
18 The main difference between these devices was the amount of memory which
19 could be used for the firmware. The SoftMAC devices support a smaller
20 amount of memory. Because of this the SoftMAC firmware fits into FullMAC
21 devices's memory. p54pci supports not only PCI / Cardbus but also USB
22 and SPI. Since p54pci supports all devices prism54 supports
23 you will have a conflict. I'm not quite sure how distributions are
24 handling this conflict right now. prism54 was kept around due to
25 claims users may experience issues when using the SoftMAC driver.
26 Time has passed users have not reported issues. If you use prism54
27 and for whatever reason you cannot use p54pci please let us know!
28 E-mail us at: linux-wireless@vger.kernel.org
29
30 For more information see the p54 wiki page:
31
32 http://wireless.kernel.org/en/users/Drivers/p54
33
34Who: Luis R. Rodriguez <lrodriguez@atheros.com>
35
36---------------------------
37
38What: IRQF_SAMPLE_RANDOM
39Check: IRQF_SAMPLE_RANDOM
40When: July 2009
41
42Why: Many of IRQF_SAMPLE_RANDOM users are technically bogus as entropy
43 sources in the kernel's current entropy model. To resolve this, every
44 input point to the kernel's entropy pool needs to better document the
45 type of entropy source it actually is. This will be replaced with
46 additional add_*_randomness functions in drivers/char/random.c
47
48Who: Robin Getz <rgetz@blackfin.uclinux.org> & Matt Mackall <mpm@selenic.com>
49
50---------------------------
51
9What: The ieee80211_regdom module parameter 52What: The ieee80211_regdom module parameter
10When: March 2010 / desktop catchup 53When: March 2010 / desktop catchup
11 54
@@ -192,24 +235,6 @@ Who: Len Brown <len.brown@intel.com>
192 235
193--------------------------- 236---------------------------
194 237
195What: libata spindown skipping and warning
196When: Dec 2008
197Why: Some halt(8) implementations synchronize caches for and spin
198 down libata disks because libata didn't use to spin down disk on
199 system halt (only synchronized caches).
200 Spin down on system halt is now implemented. sysfs node
201 /sys/class/scsi_disk/h:c:i:l/manage_start_stop is present if
202 spin down support is available.
203 Because issuing spin down command to an already spun down disk
204 makes some disks spin up just to spin down again, libata tracks
205 device spindown status to skip the extra spindown command and
206 warn about it.
207 This is to give userspace tools the time to get updated and will
208 be removed after userspace is reasonably updated.
209Who: Tejun Heo <htejun@gmail.com>
210
211---------------------------
212
213What: i386/x86_64 bzImage symlinks 238What: i386/x86_64 bzImage symlinks
214When: April 2010 239When: April 2010
215 240
@@ -221,31 +246,6 @@ Who: Thomas Gleixner <tglx@linutronix.de>
221--------------------------- 246---------------------------
222 247
223What (Why): 248What (Why):
224 - include/linux/netfilter_ipv4/ipt_TOS.h ipt_tos.h header files
225 (superseded by xt_TOS/xt_tos target & match)
226
227 - "forwarding" header files like ipt_mac.h in
228 include/linux/netfilter_ipv4/ and include/linux/netfilter_ipv6/
229
230 - xt_CONNMARK match revision 0
231 (superseded by xt_CONNMARK match revision 1)
232
233 - xt_MARK target revisions 0 and 1
234 (superseded by xt_MARK match revision 2)
235
236 - xt_connmark match revision 0
237 (superseded by xt_connmark match revision 1)
238
239 - xt_conntrack match revision 0
240 (superseded by xt_conntrack match revision 1)
241
242 - xt_iprange match revision 0,
243 include/linux/netfilter_ipv4/ipt_iprange.h
244 (superseded by xt_iprange match revision 1)
245
246 - xt_mark match revision 0
247 (superseded by xt_mark match revision 1)
248
249 - xt_recent: the old ipt_recent proc dir 249 - xt_recent: the old ipt_recent proc dir
250 (superseded by /proc/net/xt_recent) 250 (superseded by /proc/net/xt_recent)
251 251
@@ -354,16 +354,6 @@ Who: Krzysztof Piotr Oledzki <ole@ans.pl>
354 354
355--------------------------- 355---------------------------
356 356
357What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client(),
358 i2c_adapter->client_register(), i2c_adapter->client_unregister
359When: 2.6.30
360Check: i2c_attach_client i2c_detach_client
361Why: Deprecated by the new (standard) device driver binding model. Use
362 i2c_driver->probe() and ->remove() instead.
363Who: Jean Delvare <khali@linux-fr.org>
364
365---------------------------
366
367What: fscher and fscpos drivers 357What: fscher and fscpos drivers
368When: June 2009 358When: June 2009
369Why: Deprecated by the new fschmd driver. 359Why: Deprecated by the new fschmd driver.
@@ -390,15 +380,6 @@ Who: Thomas Gleixner <tglx@linutronix.de>
390 380
391----------------------------- 381-----------------------------
392 382
393What: obsolete generic irq defines and typedefs
394When: 2.6.30
395Why: The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t)
396 have been kept around for migration reasons. After more than two years
397 it's time to remove them finally
398Who: Thomas Gleixner <tglx@linutronix.de>
399
400---------------------------
401
402What: fakephp and associated sysfs files in /sys/bus/pci/slots/ 383What: fakephp and associated sysfs files in /sys/bus/pci/slots/
403When: 2011 384When: 2011
404Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to 385Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to
@@ -444,3 +425,37 @@ What: CONFIG_RFKILL_INPUT
444When: 2.6.33 425When: 2.6.33
445Why: Should be implemented in userspace, policy daemon. 426Why: Should be implemented in userspace, policy daemon.
446Who: Johannes Berg <johannes@sipsolutions.net> 427Who: Johannes Berg <johannes@sipsolutions.net>
428
429----------------------------
430
431What: lock_policy_rwsem_* and unlock_policy_rwsem_* will not be
432 exported interface anymore.
433When: 2.6.33
434Why: cpu_policy_rwsem has a new cleaner definition making it local to
435 cpufreq core and contained inside cpufreq.c. Other dependent
436 drivers should not use it in order to safely avoid lockdep issues.
437Who: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
438
439----------------------------
440
441What: sound-slot/service-* module aliases and related clutters in
442 sound/sound_core.c
443When: August 2010
444Why: OSS sound_core grabs all legacy minors (0-255) of SOUND_MAJOR
445 (14) and requests modules using custom sound-slot/service-*
446 module aliases. The only benefit of doing this is allowing
447 use of custom module aliases which might as well be considered
448 a bug at this point. This preemptive claiming prevents
449 alternative OSS implementations.
450
451 Till the feature is removed, the kernel will be requesting
452 both sound-slot/service-* and the standard char-major-* module
453 aliases and allow turning off the pre-claiming selectively via
454 CONFIG_SOUND_OSS_CORE_PRECLAIM and soundcore.preclaim_oss
455 kernel parameter.
456
457 After the transition phase is complete, both the custom module
458 aliases and switches to disable it will go away. This removal
459 will also allow making ALSA OSS emulation independent of
460 sound_core. The dependency will be broken then too.
461Who: Tejun Heo <tj@kernel.org>
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 8dd6db76171d..f15621ee5599 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -66,6 +66,10 @@ mandatory-locking.txt
66 - info on the Linux implementation of Sys V mandatory file locking. 66 - info on the Linux implementation of Sys V mandatory file locking.
67ncpfs.txt 67ncpfs.txt
68 - info on Novell Netware(tm) filesystem using NCP protocol. 68 - info on Novell Netware(tm) filesystem using NCP protocol.
69nfs41-server.txt
70 - info on the Linux server implementation of NFSv4 minor version 1.
71nfs-rdma.txt
72 - how to install and setup the Linux NFS/RDMA client and server software.
69nfsroot.txt 73nfsroot.txt
70 - short guide on setting up a diskless box with NFS root filesystem. 74 - short guide on setting up a diskless box with NFS root filesystem.
71nilfs2.txt 75nilfs2.txt
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt
index bf8080640eba..6208f55c44c3 100644
--- a/Documentation/filesystems/9p.txt
+++ b/Documentation/filesystems/9p.txt
@@ -123,6 +123,9 @@ available from the same CVS repository.
123There are user and developer mailing lists available through the v9fs project 123There are user and developer mailing lists available through the v9fs project
124on sourceforge (http://sourceforge.net/projects/v9fs). 124on sourceforge (http://sourceforge.net/projects/v9fs).
125 125
126A stand-alone version of the module (which should build for any 2.6 kernel)
127is available via (http://github.com/ericvh/9p-sac/tree/master)
128
126News and other information is maintained on SWiK (http://swik.net/v9fs). 129News and other information is maintained on SWiK (http://swik.net/v9fs).
127 130
128Bug reports may be issued through the kernel.org bugzilla 131Bug reports may be issued through the kernel.org bugzilla
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 3120f8dd2c31..18b9d0ca0630 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -109,27 +109,28 @@ prototypes:
109 109
110locking rules: 110locking rules:
111 All may block. 111 All may block.
112 BKL s_lock s_umount 112 None have BKL
113alloc_inode: no no no 113 s_umount
114destroy_inode: no 114alloc_inode:
115dirty_inode: no (must not sleep) 115destroy_inode:
116write_inode: no 116dirty_inode: (must not sleep)
117drop_inode: no !!!inode_lock!!! 117write_inode:
118delete_inode: no 118drop_inode: !!!inode_lock!!!
119put_super: yes yes no 119delete_inode:
120write_super: no yes read 120put_super: write
121sync_fs: no no read 121write_super: read
122freeze_fs: ? 122sync_fs: read
123unfreeze_fs: ? 123freeze_fs: read
124statfs: no no no 124unfreeze_fs: read
125remount_fs: yes yes maybe (see below) 125statfs: no
126clear_inode: no 126remount_fs: maybe (see below)
127umount_begin: yes no no 127clear_inode:
128show_options: no (vfsmount->sem) 128umount_begin: no
129quota_read: no no no (see below) 129show_options: no (namespace_sem)
130quota_write: no no no (see below) 130quota_read: no (see below)
131 131quota_write: no (see below)
132->remount_fs() will have the s_umount lock if it's already mounted. 132
133->remount_fs() will have the s_umount exclusive lock if it's already mounted.
133When called from get_sb_single, it does NOT have the s_umount lock. 134When called from get_sb_single, it does NOT have the s_umount lock.
134->quota_read() and ->quota_write() functions are both guaranteed to 135->quota_read() and ->quota_write() functions are both guaranteed to
135be the only ones operating on the quota file by the quota code (via 136be the only ones operating on the quota file by the quota code (via
@@ -187,7 +188,7 @@ readpages: no
187write_begin: no locks the page yes 188write_begin: no locks the page yes
188write_end: no yes, unlocks yes 189write_end: no yes, unlocks yes
189perform_write: no n/a yes 190perform_write: no n/a yes
190bmap: yes 191bmap: no
191invalidatepage: no yes 192invalidatepage: no yes
192releasepage: no yes 193releasepage: no yes
193direct_IO: no 194direct_IO: no
diff --git a/Documentation/filesystems/afs.txt b/Documentation/filesystems/afs.txt
index 12ad6c7f4e50..ffef91c4e0d6 100644
--- a/Documentation/filesystems/afs.txt
+++ b/Documentation/filesystems/afs.txt
@@ -23,15 +23,13 @@ it does support include:
23 23
24 (*) Security (currently only AFS kaserver and KerberosIV tickets). 24 (*) Security (currently only AFS kaserver and KerberosIV tickets).
25 25
26 (*) File reading. 26 (*) File reading and writing.
27 27
28 (*) Automounting. 28 (*) Automounting.
29 29
30It does not yet support the following AFS features: 30 (*) Local caching (via fscache).
31
32 (*) Write support.
33 31
34 (*) Local caching. 32It does not yet support the following AFS features:
35 33
36 (*) pioctl() system call. 34 (*) pioctl() system call.
37 35
@@ -56,7 +54,7 @@ They permit the debugging messages to be turned on dynamically by manipulating
56the masks in the following files: 54the masks in the following files:
57 55
58 /sys/module/af_rxrpc/parameters/debug 56 /sys/module/af_rxrpc/parameters/debug
59 /sys/module/afs/parameters/debug 57 /sys/module/kafs/parameters/debug
60 58
61 59
62===== 60=====
@@ -66,9 +64,9 @@ USAGE
66When inserting the driver modules the root cell must be specified along with a 64When inserting the driver modules the root cell must be specified along with a
67list of volume location server IP addresses: 65list of volume location server IP addresses:
68 66
69 insmod af_rxrpc.o 67 modprobe af_rxrpc
70 insmod rxkad.o 68 modprobe rxkad
71 insmod kafs.o rootcell=cambridge.redhat.com:172.16.18.73:172.16.18.91 69 modprobe kafs rootcell=cambridge.redhat.com:172.16.18.73:172.16.18.91
72 70
73The first module is the AF_RXRPC network protocol driver. This provides the 71The first module is the AF_RXRPC network protocol driver. This provides the
74RxRPC remote operation protocol and may also be accessed from userspace. See: 72RxRPC remote operation protocol and may also be accessed from userspace. See:
@@ -81,7 +79,7 @@ is the actual filesystem driver for the AFS filesystem.
81Once the module has been loaded, more modules can be added by the following 79Once the module has been loaded, more modules can be added by the following
82procedure: 80procedure:
83 81
84 echo add grand.central.org 18.7.14.88:128.2.191.224 >/proc/fs/afs/cells 82 echo add grand.central.org 18.9.48.14:128.2.203.61:130.237.48.87 >/proc/fs/afs/cells
85 83
86Where the parameters to the "add" command are the name of a cell and a list of 84Where the parameters to the "add" command are the name of a cell and a list of
87volume location servers within that cell, with the latter separated by colons. 85volume location servers within that cell, with the latter separated by colons.
@@ -101,7 +99,7 @@ The name of the volume can be suffixes with ".backup" or ".readonly" to
101specify connection to only volumes of those types. 99specify connection to only volumes of those types.
102 100
103The name of the cell is optional, and if not given during a mount, then the 101The name of the cell is optional, and if not given during a mount, then the
104named volume will be looked up in the cell specified during insmod. 102named volume will be looked up in the cell specified during modprobe.
105 103
106Additional cells can be added through /proc (see later section). 104Additional cells can be added through /proc (see later section).
107 105
@@ -163,14 +161,14 @@ THE CELL DATABASE
163 161
164The filesystem maintains an internal database of all the cells it knows and the 162The filesystem maintains an internal database of all the cells it knows and the
165IP addresses of the volume location servers for those cells. The cell to which 163IP addresses of the volume location servers for those cells. The cell to which
166the system belongs is added to the database when insmod is performed by the 164the system belongs is added to the database when modprobe is performed by the
167"rootcell=" argument or, if compiled in, using a "kafs.rootcell=" argument on 165"rootcell=" argument or, if compiled in, using a "kafs.rootcell=" argument on
168the kernel command line. 166the kernel command line.
169 167
170Further cells can be added by commands similar to the following: 168Further cells can be added by commands similar to the following:
171 169
172 echo add CELLNAME VLADDR[:VLADDR][:VLADDR]... >/proc/fs/afs/cells 170 echo add CELLNAME VLADDR[:VLADDR][:VLADDR]... >/proc/fs/afs/cells
173 echo add grand.central.org 18.7.14.88:128.2.191.224 >/proc/fs/afs/cells 171 echo add grand.central.org 18.9.48.14:128.2.203.61:130.237.48.87 >/proc/fs/afs/cells
174 172
175No other cell database operations are available at this time. 173No other cell database operations are available at this time.
176 174
@@ -233,7 +231,7 @@ insmod /tmp/kafs.o rootcell=cambridge.redhat.com:172.16.18.91
233mount -t afs \%root.afs. /afs 231mount -t afs \%root.afs. /afs
234mount -t afs \%cambridge.redhat.com:root.cell. /afs/cambridge.redhat.com/ 232mount -t afs \%cambridge.redhat.com:root.cell. /afs/cambridge.redhat.com/
235 233
236echo add grand.central.org 18.7.14.88:128.2.191.224 > /proc/fs/afs/cells 234echo add grand.central.org 18.9.48.14:128.2.203.61:130.237.48.87 > /proc/fs/afs/cells
237mount -t afs "#grand.central.org:root.cell." /afs/grand.central.org/ 235mount -t afs "#grand.central.org:root.cell." /afs/grand.central.org/
238mount -t afs "#grand.central.org:root.archive." /afs/grand.central.org/archive 236mount -t afs "#grand.central.org:root.archive." /afs/grand.central.org/archive
239mount -t afs "#grand.central.org:root.contrib." /afs/grand.central.org/contrib 237mount -t afs "#grand.central.org:root.contrib." /afs/grand.central.org/contrib
diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt
index e055acb6b2d4..67639f905f10 100644
--- a/Documentation/filesystems/ext2.txt
+++ b/Documentation/filesystems/ext2.txt
@@ -322,7 +322,7 @@ an upper limit on the block size imposed by the page size of the kernel,
322so 8kB blocks are only allowed on Alpha systems (and other architectures 322so 8kB blocks are only allowed on Alpha systems (and other architectures
323which support larger pages). 323which support larger pages).
324 324
325There is an upper limit of 32768 subdirectories in a single directory. 325There is an upper limit of 32000 subdirectories in a single directory.
326 326
327There is a "soft" upper limit of about 10-15k files in a single directory 327There is a "soft" upper limit of about 10-15k files in a single directory
328with the current linear linked-list directory implementation. This limit 328with the current linear linked-list directory implementation. This limit
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 608fdba97b72..7be02ac5fa36 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -235,6 +235,10 @@ minixdf Make 'df' act like Minix.
235 235
236debug Extra debugging information is sent to syslog. 236debug Extra debugging information is sent to syslog.
237 237
238abort Simulate the effects of calling ext4_abort() for
239 debugging purposes. This is normally used while
240 remounting a filesystem which is already mounted.
241
238errors=remount-ro Remount the filesystem read-only on an error. 242errors=remount-ro Remount the filesystem read-only on an error.
239errors=continue Keep going on a filesystem error. 243errors=continue Keep going on a filesystem error.
240errors=panic Panic and halt the machine if an error occurs. 244errors=panic Panic and halt the machine if an error occurs.
diff --git a/Documentation/filesystems/gfs2-uevents.txt b/Documentation/filesystems/gfs2-uevents.txt
new file mode 100644
index 000000000000..fd966dc9979a
--- /dev/null
+++ b/Documentation/filesystems/gfs2-uevents.txt
@@ -0,0 +1,100 @@
1 uevents and GFS2
2 ==================
3
4During the lifetime of a GFS2 mount, a number of uevents are generated.
5This document explains what the events are and what they are used
6for (by gfs_controld in gfs2-utils).
7
8A list of GFS2 uevents
9-----------------------
10
111. ADD
12
13The ADD event occurs at mount time. It will always be the first
14uevent generated by the newly created filesystem. If the mount
15is successful, an ONLINE uevent will follow. If it is not successful
16then a REMOVE uevent will follow.
17
18The ADD uevent has two environment variables: SPECTATOR=[0|1]
19and RDONLY=[0|1] that specify the spectator status (a read-only mount
20with no journal assigned), and read-only (with journal assigned) status
21of the filesystem respectively.
22
232. ONLINE
24
25The ONLINE uevent is generated after a successful mount or remount. It
26has the same environment variables as the ADD uevent. The ONLINE
27uevent, along with the two environment variables for spectator and
28RDONLY are a relatively recent addition (2.6.32-rc+) and will not
29be generated by older kernels.
30
313. CHANGE
32
33The CHANGE uevent is used in two places. One is when reporting the
34successful mount of the filesystem by the first node (FIRSTMOUNT=Done).
35This is used as a signal by gfs_controld that it is then ok for other
36nodes in the cluster to mount the filesystem.
37
38The other CHANGE uevent is used to inform of the completion
39of journal recovery for one of the filesystems journals. It has
40two environment variables, JID= which specifies the journal id which
41has just been recovered, and RECOVERY=[Done|Failed] to indicate the
42success (or otherwise) of the operation. These uevents are generated
43for every journal recovered, whether it is during the initial mount
44process or as the result of gfs_controld requesting a specific journal
45recovery via the /sys/fs/gfs2/<fsname>/lock_module/recovery file.
46
47Because the CHANGE uevent was used (in early versions of gfs_controld)
48without checking the environment variables to discover the state, we
49cannot add any more functions to it without running the risk of
50someone using an older version of the user tools and breaking their
51cluster. For this reason the ONLINE uevent was used when adding a new
52uevent for a successful mount or remount.
53
544. OFFLINE
55
56The OFFLINE uevent is only generated due to filesystem errors and is used
57as part of the "withdraw" mechanism. Currently this doesn't give any
58information about what the error is, which is something that needs to
59be fixed.
60
615. REMOVE
62
63The REMOVE uevent is generated at the end of an unsuccessful mount
64or at the end of a umount of the filesystem. All REMOVE uevents will
65have been preceeded by at least an ADD uevent for the same fileystem,
66and unlike the other uevents is generated automatically by the kernel's
67kobject subsystem.
68
69
70Information common to all GFS2 uevents (uevent environment variables)
71----------------------------------------------------------------------
72
731. LOCKTABLE=
74
75The LOCKTABLE is a string, as supplied on the mount command
76line (locktable=) or via fstab. It is used as a filesystem label
77as well as providing the information for a lock_dlm mount to be
78able to join the cluster.
79
802. LOCKPROTO=
81
82The LOCKPROTO is a string, and its value depends on what is set
83on the mount command line, or via fstab. It will be either
84lock_nolock or lock_dlm. In the future other lock managers
85may be supported.
86
873. JOURNALID=
88
89If a journal is in use by the filesystem (journals are not
90assigned for spectator mounts) then this will give the
91numeric journal id in all GFS2 uevents.
92
934. UUID=
94
95With recent versions of gfs2-utils, mkfs.gfs2 writes a UUID
96into the filesystem superblock. If it exists, this will
97be included in every uevent relating to the filesystem.
98
99
100
diff --git a/Documentation/filesystems/isofs.txt b/Documentation/filesystems/isofs.txt
index 6973b980ca2a..3c367c3b3608 100644
--- a/Documentation/filesystems/isofs.txt
+++ b/Documentation/filesystems/isofs.txt
@@ -23,8 +23,13 @@ Mount options unique to the isofs filesystem.
23 map=off Do not map non-Rock Ridge filenames to lower case 23 map=off Do not map non-Rock Ridge filenames to lower case
24 map=normal Map non-Rock Ridge filenames to lower case 24 map=normal Map non-Rock Ridge filenames to lower case
25 map=acorn As map=normal but also apply Acorn extensions if present 25 map=acorn As map=normal but also apply Acorn extensions if present
26 mode=xxx Sets the permissions on files to xxx 26 mode=xxx Sets the permissions on files to xxx unless Rock Ridge
27 dmode=xxx Sets the permissions on directories to xxx 27 extensions set the permissions otherwise
28 dmode=xxx Sets the permissions on directories to xxx unless Rock Ridge
29 extensions set the permissions otherwise
30 overriderockperm Set permissions on files and directories according to
31 'mode' and 'dmode' even though Rock Ridge extensions are
32 present.
28 nojoliet Ignore Joliet extensions if they are present. 33 nojoliet Ignore Joliet extensions if they are present.
29 norock Ignore Rock Ridge extensions if they are present. 34 norock Ignore Rock Ridge extensions if they are present.
30 hide Completely strip hidden files from the file system. 35 hide Completely strip hidden files from the file system.
diff --git a/Documentation/filesystems/nfs.txt b/Documentation/filesystems/nfs.txt
new file mode 100644
index 000000000000..f50f26ce6cd0
--- /dev/null
+++ b/Documentation/filesystems/nfs.txt
@@ -0,0 +1,98 @@
1
2The NFS client
3==============
4
5The NFS version 2 protocol was first documented in RFC1094 (March 1989).
6Since then two more major releases of NFS have been published, with NFSv3
7being documented in RFC1813 (June 1995), and NFSv4 in RFC3530 (April
82003).
9
10The Linux NFS client currently supports all the above published versions,
11and work is in progress on adding support for minor version 1 of the NFSv4
12protocol.
13
14The purpose of this document is to provide information on some of the
15upcall interfaces that are used in order to provide the NFS client with
16some of the information that it requires in order to fully comply with
17the NFS spec.
18
19The DNS resolver
20================
21
22NFSv4 allows for one server to refer the NFS client to data that has been
23migrated onto another server by means of the special "fs_locations"
24attribute. See
25 http://tools.ietf.org/html/rfc3530#section-6
26and
27 http://tools.ietf.org/html/draft-ietf-nfsv4-referrals-00
28
29The fs_locations information can take the form of either an ip address and
30a path, or a DNS hostname and a path. The latter requires the NFS client to
31do a DNS lookup in order to mount the new volume, and hence the need for an
32upcall to allow userland to provide this service.
33
34Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual
35/var/lib/nfs/rpc_pipefs, the upcall consists of the following steps:
36
37 (1) The process checks the dns_resolve cache to see if it contains a
38 valid entry. If so, it returns that entry and exits.
39
40 (2) If no valid entry exists, the helper script '/sbin/nfs_cache_getent'
41 (may be changed using the 'nfs.cache_getent' kernel boot parameter)
42 is run, with two arguments:
43 - the cache name, "dns_resolve"
44 - the hostname to resolve
45
46 (3) After looking up the corresponding ip address, the helper script
47 writes the result into the rpc_pipefs pseudo-file
48 '/var/lib/nfs/rpc_pipefs/cache/dns_resolve/channel'
49 in the following (text) format:
50
51 "<ip address> <hostname> <ttl>\n"
52
53 Where <ip address> is in the usual IPv4 (123.456.78.90) or IPv6
54 (ffee:ddcc:bbaa:9988:7766:5544:3322:1100, ffee::1100, ...) format.
55 <hostname> is identical to the second argument of the helper
56 script, and <ttl> is the 'time to live' of this cache entry (in
57 units of seconds).
58
59 Note: If <ip address> is invalid, say the string "0", then a negative
60 entry is created, which will cause the kernel to treat the hostname
61 as having no valid DNS translation.
62
63
64
65
66A basic sample /sbin/nfs_cache_getent
67=====================================
68
69#!/bin/bash
70#
71ttl=600
72#
73cut=/usr/bin/cut
74getent=/usr/bin/getent
75rpc_pipefs=/var/lib/nfs/rpc_pipefs
76#
77die()
78{
79 echo "Usage: $0 cache_name entry_name"
80 exit 1
81}
82
83[ $# -lt 2 ] && die
84cachename="$1"
85cache_path=${rpc_pipefs}/cache/${cachename}/channel
86
87case "${cachename}" in
88 dns_resolve)
89 name="$2"
90 result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )"
91 [ -z "${result}" ] && result="0"
92 ;;
93 *)
94 die
95 ;;
96esac
97echo "${result} ${name} ${ttl}" >${cache_path}
98
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index cd8717a36271..ffead13f9443 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -5,11 +5,12 @@
5 Bodo Bauer <bb@ricochet.net> 5 Bodo Bauer <bb@ricochet.net>
6 6
72.4.x update Jorge Nerin <comandante@zaralinux.com> November 14 2000 72.4.x update Jorge Nerin <comandante@zaralinux.com> November 14 2000
8move /proc/sys Shen Feng <shen@cn.fujitsu.com> April 1 2009 8move /proc/sys Shen Feng <shen@cn.fujitsu.com> April 1 2009
9------------------------------------------------------------------------------ 9------------------------------------------------------------------------------
10Version 1.3 Kernel version 2.2.12 10Version 1.3 Kernel version 2.2.12
11 Kernel version 2.4.0-test11-pre4 11 Kernel version 2.4.0-test11-pre4
12------------------------------------------------------------------------------ 12------------------------------------------------------------------------------
13fixes/update part 1.1 Stefani Seibold <stefani@seibold.net> June 9 2009
13 14
14Table of Contents 15Table of Contents
15----------------- 16-----------------
@@ -116,7 +117,7 @@ The link self points to the process reading the file system. Each process
116subdirectory has the entries listed in Table 1-1. 117subdirectory has the entries listed in Table 1-1.
117 118
118 119
119Table 1-1: Process specific entries in /proc 120Table 1-1: Process specific entries in /proc
120.............................................................................. 121..............................................................................
121 File Content 122 File Content
122 clear_refs Clears page referenced bits shown in smaps output 123 clear_refs Clears page referenced bits shown in smaps output
@@ -134,46 +135,103 @@ Table 1-1: Process specific entries in /proc
134 status Process status in human readable form 135 status Process status in human readable form
135 wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan 136 wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan
136 stack Report full stack trace, enable via CONFIG_STACKTRACE 137 stack Report full stack trace, enable via CONFIG_STACKTRACE
137 smaps Extension based on maps, the rss size for each mapped file 138 smaps a extension based on maps, showing the memory consumption of
139 each mapping
138.............................................................................. 140..............................................................................
139 141
140For example, to get the status information of a process, all you have to do is 142For example, to get the status information of a process, all you have to do is
141read the file /proc/PID/status: 143read the file /proc/PID/status:
142 144
143 >cat /proc/self/status 145 >cat /proc/self/status
144 Name: cat 146 Name: cat
145 State: R (running) 147 State: R (running)
146 Pid: 5452 148 Tgid: 5452
147 PPid: 743 149 Pid: 5452
150 PPid: 743
148 TracerPid: 0 (2.4) 151 TracerPid: 0 (2.4)
149 Uid: 501 501 501 501 152 Uid: 501 501 501 501
150 Gid: 100 100 100 100 153 Gid: 100 100 100 100
151 Groups: 100 14 16 154 FDSize: 256
152 VmSize: 1112 kB 155 Groups: 100 14 16
153 VmLck: 0 kB 156 VmPeak: 5004 kB
154 VmRSS: 348 kB 157 VmSize: 5004 kB
155 VmData: 24 kB 158 VmLck: 0 kB
156 VmStk: 12 kB 159 VmHWM: 476 kB
157 VmExe: 8 kB 160 VmRSS: 476 kB
158 VmLib: 1044 kB 161 VmData: 156 kB
159 SigPnd: 0000000000000000 162 VmStk: 88 kB
160 SigBlk: 0000000000000000 163 VmExe: 68 kB
161 SigIgn: 0000000000000000 164 VmLib: 1412 kB
162 SigCgt: 0000000000000000 165 VmPTE: 20 kb
163 CapInh: 00000000fffffeff 166 Threads: 1
164 CapPrm: 0000000000000000 167 SigQ: 0/28578
165 CapEff: 0000000000000000 168 SigPnd: 0000000000000000
166 169 ShdPnd: 0000000000000000
170 SigBlk: 0000000000000000
171 SigIgn: 0000000000000000
172 SigCgt: 0000000000000000
173 CapInh: 00000000fffffeff
174 CapPrm: 0000000000000000
175 CapEff: 0000000000000000
176 CapBnd: ffffffffffffffff
177 voluntary_ctxt_switches: 0
178 nonvoluntary_ctxt_switches: 1
167 179
168This shows you nearly the same information you would get if you viewed it with 180This shows you nearly the same information you would get if you viewed it with
169the ps command. In fact, ps uses the proc file system to obtain its 181the ps command. In fact, ps uses the proc file system to obtain its
170information. The statm file contains more detailed information about the 182information. But you get a more detailed view of the process by reading the
171process memory usage. Its seven fields are explained in Table 1-2. The stat 183file /proc/PID/status. It fields are described in table 1-2.
172file contains details information about the process itself. Its fields are 184
173explained in Table 1-3. 185The statm file contains more detailed information about the process
186memory usage. Its seven fields are explained in Table 1-3. The stat file
187contains details information about the process itself. Its fields are
188explained in Table 1-4.
174 189
190Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
191..............................................................................
192 Field Content
193 Name filename of the executable
194 State state (R is running, S is sleeping, D is sleeping
195 in an uninterruptible wait, Z is zombie,
196 T is traced or stopped)
197 Tgid thread group ID
198 Pid process id
199 PPid process id of the parent process
200 TracerPid PID of process tracing this process (0 if not)
201 Uid Real, effective, saved set, and file system UIDs
202 Gid Real, effective, saved set, and file system GIDs
203 FDSize number of file descriptor slots currently allocated
204 Groups supplementary group list
205 VmPeak peak virtual memory size
206 VmSize total program size
207 VmLck locked memory size
208 VmHWM peak resident set size ("high water mark")
209 VmRSS size of memory portions
210 VmData size of data, stack, and text segments
211 VmStk size of data, stack, and text segments
212 VmExe size of text segment
213 VmLib size of shared library code
214 VmPTE size of page table entries
215 Threads number of threads
216 SigQ number of signals queued/max. number for queue
217 SigPnd bitmap of pending signals for the thread
218 ShdPnd bitmap of shared pending signals for the process
219 SigBlk bitmap of blocked signals
220 SigIgn bitmap of ignored signals
221 SigCgt bitmap of catched signals
222 CapInh bitmap of inheritable capabilities
223 CapPrm bitmap of permitted capabilities
224 CapEff bitmap of effective capabilities
225 CapBnd bitmap of capabilities bounding set
226 Cpus_allowed mask of CPUs on which this process may run
227 Cpus_allowed_list Same as previous, but in "list format"
228 Mems_allowed mask of memory nodes allowed to this process
229 Mems_allowed_list Same as previous, but in "list format"
230 voluntary_ctxt_switches number of voluntary context switches
231 nonvoluntary_ctxt_switches number of non voluntary context switches
232..............................................................................
175 233
176Table 1-2: Contents of the statm files (as of 2.6.8-rc3) 234Table 1-3: Contents of the statm files (as of 2.6.8-rc3)
177.............................................................................. 235..............................................................................
178 Field Content 236 Field Content
179 size total program size (pages) (same as VmSize in status) 237 size total program size (pages) (same as VmSize in status)
@@ -188,7 +246,7 @@ Table 1-2: Contents of the statm files (as of 2.6.8-rc3)
188.............................................................................. 246..............................................................................
189 247
190 248
191Table 1-3: Contents of the stat files (as of 2.6.22-rc3) 249Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
192.............................................................................. 250..............................................................................
193 Field Content 251 Field Content
194 pid process id 252 pid process id
@@ -222,10 +280,10 @@ Table 1-3: Contents of the stat files (as of 2.6.22-rc3)
222 start_stack address of the start of the stack 280 start_stack address of the start of the stack
223 esp current value of ESP 281 esp current value of ESP
224 eip current value of EIP 282 eip current value of EIP
225 pending bitmap of pending signals (obsolete) 283 pending bitmap of pending signals
226 blocked bitmap of blocked signals (obsolete) 284 blocked bitmap of blocked signals
227 sigign bitmap of ignored signals (obsolete) 285 sigign bitmap of ignored signals
228 sigcatch bitmap of catched signals (obsolete) 286 sigcatch bitmap of catched signals
229 wchan address where process went to sleep 287 wchan address where process went to sleep
230 0 (place holder) 288 0 (place holder)
231 0 (place holder) 289 0 (place holder)
@@ -234,19 +292,99 @@ Table 1-3: Contents of the stat files (as of 2.6.22-rc3)
234 rt_priority realtime priority 292 rt_priority realtime priority
235 policy scheduling policy (man sched_setscheduler) 293 policy scheduling policy (man sched_setscheduler)
236 blkio_ticks time spent waiting for block IO 294 blkio_ticks time spent waiting for block IO
295 gtime guest time of the task in jiffies
296 cgtime guest time of the task children in jiffies
237.............................................................................. 297..............................................................................
238 298
299The /proc/PID/map file containing the currently mapped memory regions and
300their access permissions.
301
302The format is:
303
304address perms offset dev inode pathname
305
30608048000-08049000 r-xp 00000000 03:00 8312 /opt/test
30708049000-0804a000 rw-p 00001000 03:00 8312 /opt/test
3080804a000-0806b000 rw-p 00000000 00:00 0 [heap]
309a7cb1000-a7cb2000 ---p 00000000 00:00 0
310a7cb2000-a7eb2000 rw-p 00000000 00:00 0
311a7eb2000-a7eb3000 ---p 00000000 00:00 0
312a7eb3000-a7ed5000 rw-p 00000000 00:00 0
313a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6
314a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6
315a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6
316a800b000-a800e000 rw-p 00000000 00:00 0
317a800e000-a8022000 r-xp 00000000 03:00 14462 /lib/libpthread.so.0
318a8022000-a8023000 r--p 00013000 03:00 14462 /lib/libpthread.so.0
319a8023000-a8024000 rw-p 00014000 03:00 14462 /lib/libpthread.so.0
320a8024000-a8027000 rw-p 00000000 00:00 0
321a8027000-a8043000 r-xp 00000000 03:00 8317 /lib/ld-linux.so.2
322a8043000-a8044000 r--p 0001b000 03:00 8317 /lib/ld-linux.so.2
323a8044000-a8045000 rw-p 0001c000 03:00 8317 /lib/ld-linux.so.2
324aff35000-aff4a000 rw-p 00000000 00:00 0 [stack]
325ffffe000-fffff000 r-xp 00000000 00:00 0 [vdso]
326
327where "address" is the address space in the process that it occupies, "perms"
328is a set of permissions:
329
330 r = read
331 w = write
332 x = execute
333 s = shared
334 p = private (copy on write)
335
336"offset" is the offset into the mapping, "dev" is the device (major:minor), and
337"inode" is the inode on that device. 0 indicates that no inode is associated
338with the memory region, as the case would be with BSS (uninitialized data).
339The "pathname" shows the name associated file for this mapping. If the mapping
340is not associated with a file:
341
342 [heap] = the heap of the program
343 [stack] = the stack of the main process
344 [vdso] = the "virtual dynamic shared object",
345 the kernel system call handler
346
347 or if empty, the mapping is anonymous.
348
349
350The /proc/PID/smaps is an extension based on maps, showing the memory
351consumption for each of the process's mappings. For each of mappings there
352is a series of lines such as the following:
353
35408048000-080bc000 r-xp 00000000 03:02 13130 /bin/bash
355Size: 1084 kB
356Rss: 892 kB
357Pss: 374 kB
358Shared_Clean: 892 kB
359Shared_Dirty: 0 kB
360Private_Clean: 0 kB
361Private_Dirty: 0 kB
362Referenced: 892 kB
363Swap: 0 kB
364KernelPageSize: 4 kB
365MMUPageSize: 4 kB
366
367The first of these lines shows the same information as is displayed for the
368mapping in /proc/PID/maps. The remaining lines show the size of the mapping,
369the amount of the mapping that is currently resident in RAM, the "proportional
370set size†(divide each shared page by the number of processes sharing it), the
371number of clean and dirty shared pages in the mapping, and the number of clean
372and dirty private pages in the mapping. The "Referenced" indicates the amount
373of memory currently marked as referenced or accessed.
374
375This file is only present if the CONFIG_MMU kernel configuration option is
376enabled.
239 377
2401.2 Kernel data 3781.2 Kernel data
241--------------- 379---------------
242 380
243Similar to the process entries, the kernel data files give information about 381Similar to the process entries, the kernel data files give information about
244the running kernel. The files used to obtain this information are contained in 382the running kernel. The files used to obtain this information are contained in
245/proc and are listed in Table 1-4. Not all of these will be present in your 383/proc and are listed in Table 1-5. Not all of these will be present in your
246system. It depends on the kernel configuration and the loaded modules, which 384system. It depends on the kernel configuration and the loaded modules, which
247files are there, and which are missing. 385files are there, and which are missing.
248 386
249Table 1-4: Kernel info in /proc 387Table 1-5: Kernel info in /proc
250.............................................................................. 388..............................................................................
251 File Content 389 File Content
252 apm Advanced power management info 390 apm Advanced power management info
@@ -283,6 +421,7 @@ Table 1-4: Kernel info in /proc
283 rtc Real time clock 421 rtc Real time clock
284 scsi SCSI info (see text) 422 scsi SCSI info (see text)
285 slabinfo Slab pool info 423 slabinfo Slab pool info
424 softirqs softirq usage
286 stat Overall statistics 425 stat Overall statistics
287 swaps Swap space utilization 426 swaps Swap space utilization
288 sys See chapter 2 427 sys See chapter 2
@@ -597,6 +736,25 @@ on the kind of area :
5970xffffffffa0017000-0xffffffffa0022000 45056 sys_init_module+0xc27/0x1d00 ... 7360xffffffffa0017000-0xffffffffa0022000 45056 sys_init_module+0xc27/0x1d00 ...
598 pages=10 vmalloc N0=10 737 pages=10 vmalloc N0=10
599 738
739..............................................................................
740
741softirqs:
742
743Provides counts of softirq handlers serviced since boot time, for each cpu.
744
745> cat /proc/softirqs
746 CPU0 CPU1 CPU2 CPU3
747 HI: 0 0 0 0
748 TIMER: 27166 27120 27097 27034
749 NET_TX: 0 0 0 17
750 NET_RX: 42 0 0 39
751 BLOCK: 0 0 107 1121
752 TASKLET: 0 0 0 290
753 SCHED: 27035 26983 26971 26746
754 HRTIMER: 0 0 0 0
755 RCU: 1678 1769 2178 2250
756
757
6001.3 IDE devices in /proc/ide 7581.3 IDE devices in /proc/ide
601---------------------------- 759----------------------------
602 760
@@ -614,10 +772,10 @@ IDE devices:
614 772
615More detailed information can be found in the controller specific 773More detailed information can be found in the controller specific
616subdirectories. These are named ide0, ide1 and so on. Each of these 774subdirectories. These are named ide0, ide1 and so on. Each of these
617directories contains the files shown in table 1-5. 775directories contains the files shown in table 1-6.
618 776
619 777
620Table 1-5: IDE controller info in /proc/ide/ide? 778Table 1-6: IDE controller info in /proc/ide/ide?
621.............................................................................. 779..............................................................................
622 File Content 780 File Content
623 channel IDE channel (0 or 1) 781 channel IDE channel (0 or 1)
@@ -627,11 +785,11 @@ Table 1-5: IDE controller info in /proc/ide/ide?
627.............................................................................. 785..............................................................................
628 786
629Each device connected to a controller has a separate subdirectory in the 787Each device connected to a controller has a separate subdirectory in the
630controllers directory. The files listed in table 1-6 are contained in these 788controllers directory. The files listed in table 1-7 are contained in these
631directories. 789directories.
632 790
633 791
634Table 1-6: IDE device information 792Table 1-7: IDE device information
635.............................................................................. 793..............................................................................
636 File Content 794 File Content
637 cache The cache 795 cache The cache
@@ -673,12 +831,12 @@ the drive parameters:
6731.4 Networking info in /proc/net 8311.4 Networking info in /proc/net
674-------------------------------- 832--------------------------------
675 833
676The subdirectory /proc/net follows the usual pattern. Table 1-6 shows the 834The subdirectory /proc/net follows the usual pattern. Table 1-8 shows the
677additional values you get for IP version 6 if you configure the kernel to 835additional values you get for IP version 6 if you configure the kernel to
678support this. Table 1-7 lists the files and their meaning. 836support this. Table 1-9 lists the files and their meaning.
679 837
680 838
681Table 1-6: IPv6 info in /proc/net 839Table 1-8: IPv6 info in /proc/net
682.............................................................................. 840..............................................................................
683 File Content 841 File Content
684 udp6 UDP sockets (IPv6) 842 udp6 UDP sockets (IPv6)
@@ -693,7 +851,7 @@ Table 1-6: IPv6 info in /proc/net
693.............................................................................. 851..............................................................................
694 852
695 853
696Table 1-7: Network info in /proc/net 854Table 1-9: Network info in /proc/net
697.............................................................................. 855..............................................................................
698 File Content 856 File Content
699 arp Kernel ARP table 857 arp Kernel ARP table
@@ -817,10 +975,10 @@ The directory /proc/parport contains information about the parallel ports of
817your system. It has one subdirectory for each port, named after the port 975your system. It has one subdirectory for each port, named after the port
818number (0,1,2,...). 976number (0,1,2,...).
819 977
820These directories contain the four files shown in Table 1-8. 978These directories contain the four files shown in Table 1-10.
821 979
822 980
823Table 1-8: Files in /proc/parport 981Table 1-10: Files in /proc/parport
824.............................................................................. 982..............................................................................
825 File Content 983 File Content
826 autoprobe Any IEEE-1284 device ID information that has been acquired. 984 autoprobe Any IEEE-1284 device ID information that has been acquired.
@@ -838,10 +996,10 @@ Table 1-8: Files in /proc/parport
838 996
839Information about the available and actually used tty's can be found in the 997Information about the available and actually used tty's can be found in the
840directory /proc/tty.You'll find entries for drivers and line disciplines in 998directory /proc/tty.You'll find entries for drivers and line disciplines in
841this directory, as shown in Table 1-9. 999this directory, as shown in Table 1-11.
842 1000
843 1001
844Table 1-9: Files in /proc/tty 1002Table 1-11: Files in /proc/tty
845.............................................................................. 1003..............................................................................
846 File Content 1004 File Content
847 drivers list of drivers and their usage 1005 drivers list of drivers and their usage
@@ -883,6 +1041,7 @@ since the system first booted. For a quick look, simply cat the file:
883 processes 2915 1041 processes 2915
884 procs_running 1 1042 procs_running 1
885 procs_blocked 0 1043 procs_blocked 0
1044 softirq 183433 0 21755 12 39 1137 231 21459 2263
886 1045
887The very first "cpu" line aggregates the numbers in all of the other "cpuN" 1046The very first "cpu" line aggregates the numbers in all of the other "cpuN"
888lines. These numbers identify the amount of time the CPU has spent performing 1047lines. These numbers identify the amount of time the CPU has spent performing
@@ -918,6 +1077,11 @@ CPUs.
918The "procs_blocked" line gives the number of processes currently blocked, 1077The "procs_blocked" line gives the number of processes currently blocked,
919waiting for I/O to complete. 1078waiting for I/O to complete.
920 1079
1080The "softirq" line gives counts of softirqs serviced since boot time, for each
1081of the possible system softirqs. The first column is the total of all
1082softirqs serviced; each subsequent column is the total for that particular
1083softirq.
1084
921 1085
9221.9 Ext4 file system parameters 10861.9 Ext4 file system parameters
923------------------------------ 1087------------------------------
@@ -926,9 +1090,9 @@ Information about mounted ext4 file systems can be found in
926/proc/fs/ext4. Each mounted filesystem will have a directory in 1090/proc/fs/ext4. Each mounted filesystem will have a directory in
927/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or 1091/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or
928/proc/fs/ext4/dm-0). The files in each per-device directory are shown 1092/proc/fs/ext4/dm-0). The files in each per-device directory are shown
929in Table 1-10, below. 1093in Table 1-12, below.
930 1094
931Table 1-10: Files in /proc/fs/ext4/<devname> 1095Table 1-12: Files in /proc/fs/ext4/<devname>
932.............................................................................. 1096..............................................................................
933 File Content 1097 File Content
934 mb_groups details of multiblock allocator buddy cache of free blocks 1098 mb_groups details of multiblock allocator buddy cache of free blocks
diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt
index b843743aa0b5..0d15ebccf5b0 100644
--- a/Documentation/filesystems/seq_file.txt
+++ b/Documentation/filesystems/seq_file.txt
@@ -46,7 +46,7 @@ better to do. The file is seekable, in that one can do something like the
46following: 46following:
47 47
48 dd if=/proc/sequence of=out1 count=1 48 dd if=/proc/sequence of=out1 count=1
49 dd if=/proc/sequence skip=1 out=out2 count=1 49 dd if=/proc/sequence skip=1 of=out2 count=1
50 50
51Then concatenate the output files out1 and out2 and get the right 51Then concatenate the output files out1 and out2 and get the right
52result. Yes, it is a thoroughly useless module, but the point is to show 52result. Yes, it is a thoroughly useless module, but the point is to show
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index 7e81e37c0b1e..b245d524d568 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -23,7 +23,8 @@ interface.
23Using sysfs 23Using sysfs
24~~~~~~~~~~~ 24~~~~~~~~~~~
25 25
26sysfs is always compiled in. You can access it by doing: 26sysfs is always compiled in if CONFIG_SYSFS is defined. You can access
27it by doing:
27 28
28 mount -t sysfs sysfs /sys 29 mount -t sysfs sysfs /sys
29 30
diff --git a/Documentation/firmware_class/README b/Documentation/firmware_class/README
index c3480aa66ba8..7eceaff63f5f 100644
--- a/Documentation/firmware_class/README
+++ b/Documentation/firmware_class/README
@@ -77,7 +77,8 @@
77 seconds for the whole load operation. 77 seconds for the whole load operation.
78 78
79 - request_firmware_nowait() is also provided for convenience in 79 - request_firmware_nowait() is also provided for convenience in
80 non-user contexts. 80 user contexts to request firmware asynchronously, but can't be called
81 in atomic contexts.
81 82
82 83
83 about in-kernel persistence: 84 about in-kernel persistence:
diff --git a/Documentation/flexible-arrays.txt b/Documentation/flexible-arrays.txt
new file mode 100644
index 000000000000..84eb26808dee
--- /dev/null
+++ b/Documentation/flexible-arrays.txt
@@ -0,0 +1,99 @@
1Using flexible arrays in the kernel
2Last updated for 2.6.31
3Jonathan Corbet <corbet@lwn.net>
4
5Large contiguous memory allocations can be unreliable in the Linux kernel.
6Kernel programmers will sometimes respond to this problem by allocating
7pages with vmalloc(). This solution not ideal, though. On 32-bit systems,
8memory from vmalloc() must be mapped into a relatively small address space;
9it's easy to run out. On SMP systems, the page table changes required by
10vmalloc() allocations can require expensive cross-processor interrupts on
11all CPUs. And, on all systems, use of space in the vmalloc() range
12increases pressure on the translation lookaside buffer (TLB), reducing the
13performance of the system.
14
15In many cases, the need for memory from vmalloc() can be eliminated by
16piecing together an array from smaller parts; the flexible array library
17exists to make this task easier.
18
19A flexible array holds an arbitrary (within limits) number of fixed-sized
20objects, accessed via an integer index. Sparse arrays are handled
21reasonably well. Only single-page allocations are made, so memory
22allocation failures should be relatively rare. The down sides are that the
23arrays cannot be indexed directly, individual object size cannot exceed the
24system page size, and putting data into a flexible array requires a copy
25operation. It's also worth noting that flexible arrays do no internal
26locking at all; if concurrent access to an array is possible, then the
27caller must arrange for appropriate mutual exclusion.
28
29The creation of a flexible array is done with:
30
31 #include <linux/flex_array.h>
32
33 struct flex_array *flex_array_alloc(int element_size,
34 unsigned int total,
35 gfp_t flags);
36
37The individual object size is provided by element_size, while total is the
38maximum number of objects which can be stored in the array. The flags
39argument is passed directly to the internal memory allocation calls. With
40the current code, using flags to ask for high memory is likely to lead to
41notably unpleasant side effects.
42
43Storing data into a flexible array is accomplished with a call to:
44
45 int flex_array_put(struct flex_array *array, unsigned int element_nr,
46 void *src, gfp_t flags);
47
48This call will copy the data from src into the array, in the position
49indicated by element_nr (which must be less than the maximum specified when
50the array was created). If any memory allocations must be performed, flags
51will be used. The return value is zero on success, a negative error code
52otherwise.
53
54There might possibly be a need to store data into a flexible array while
55running in some sort of atomic context; in this situation, sleeping in the
56memory allocator would be a bad thing. That can be avoided by using
57GFP_ATOMIC for the flags value, but, often, there is a better way. The
58trick is to ensure that any needed memory allocations are done before
59entering atomic context, using:
60
61 int flex_array_prealloc(struct flex_array *array, unsigned int start,
62 unsigned int end, gfp_t flags);
63
64This function will ensure that memory for the elements indexed in the range
65defined by start and end has been allocated. Thereafter, a
66flex_array_put() call on an element in that range is guaranteed not to
67block.
68
69Getting data back out of the array is done with:
70
71 void *flex_array_get(struct flex_array *fa, unsigned int element_nr);
72
73The return value is a pointer to the data element, or NULL if that
74particular element has never been allocated.
75
76Note that it is possible to get back a valid pointer for an element which
77has never been stored in the array. Memory for array elements is allocated
78one page at a time; a single allocation could provide memory for several
79adjacent elements. The flexible array code does not know if a specific
80element has been written; it only knows if the associated memory is
81present. So a flex_array_get() call on an element which was never stored
82in the array has the potential to return a pointer to random data. If the
83caller does not have a separate way to know which elements were actually
84stored, it might be wise, at least, to add GFP_ZERO to the flags argument
85to ensure that all elements are zeroed.
86
87There is no way to remove a single element from the array. It is possible,
88though, to remove all elements with a call to:
89
90 void flex_array_free_parts(struct flex_array *array);
91
92This call frees all elements, but leaves the array itself in place.
93Freeing the entire array is done with:
94
95 void flex_array_free(struct flex_array *array);
96
97As of this writing, there are no users of flexible arrays in the mainline
98kernel. The functions described here are also not exported to modules;
99that will probably be fixed when somebody comes up with a need for it.
diff --git a/Documentation/gcov.txt b/Documentation/gcov.txt
new file mode 100644
index 000000000000..40ec63352760
--- /dev/null
+++ b/Documentation/gcov.txt
@@ -0,0 +1,253 @@
1Using gcov with the Linux kernel
2================================
3
41. Introduction
52. Preparation
63. Customization
74. Files
85. Modules
96. Separated build and test machines
107. Troubleshooting
11Appendix A: sample script: gather_on_build.sh
12Appendix B: sample script: gather_on_test.sh
13
14
151. Introduction
16===============
17
18gcov profiling kernel support enables the use of GCC's coverage testing
19tool gcov [1] with the Linux kernel. Coverage data of a running kernel
20is exported in gcov-compatible format via the "gcov" debugfs directory.
21To get coverage data for a specific file, change to the kernel build
22directory and use gcov with the -o option as follows (requires root):
23
24# cd /tmp/linux-out
25# gcov -o /sys/kernel/debug/gcov/tmp/linux-out/kernel spinlock.c
26
27This will create source code files annotated with execution counts
28in the current directory. In addition, graphical gcov front-ends such
29as lcov [2] can be used to automate the process of collecting data
30for the entire kernel and provide coverage overviews in HTML format.
31
32Possible uses:
33
34* debugging (has this line been reached at all?)
35* test improvement (how do I change my test to cover these lines?)
36* minimizing kernel configurations (do I need this option if the
37 associated code is never run?)
38
39--
40
41[1] http://gcc.gnu.org/onlinedocs/gcc/Gcov.html
42[2] http://ltp.sourceforge.net/coverage/lcov.php
43
44
452. Preparation
46==============
47
48Configure the kernel with:
49
50 CONFIG_DEBUGFS=y
51 CONFIG_GCOV_KERNEL=y
52
53and to get coverage data for the entire kernel:
54
55 CONFIG_GCOV_PROFILE_ALL=y
56
57Note that kernels compiled with profiling flags will be significantly
58larger and run slower. Also CONFIG_GCOV_PROFILE_ALL may not be supported
59on all architectures.
60
61Profiling data will only become accessible once debugfs has been
62mounted:
63
64 mount -t debugfs none /sys/kernel/debug
65
66
673. Customization
68================
69
70To enable profiling for specific files or directories, add a line
71similar to the following to the respective kernel Makefile:
72
73 For a single file (e.g. main.o):
74 GCOV_PROFILE_main.o := y
75
76 For all files in one directory:
77 GCOV_PROFILE := y
78
79To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL
80is specified, use:
81
82 GCOV_PROFILE_main.o := n
83 and:
84 GCOV_PROFILE := n
85
86Only files which are linked to the main kernel image or are compiled as
87kernel modules are supported by this mechanism.
88
89
904. Files
91========
92
93The gcov kernel support creates the following files in debugfs:
94
95 /sys/kernel/debug/gcov
96 Parent directory for all gcov-related files.
97
98 /sys/kernel/debug/gcov/reset
99 Global reset file: resets all coverage data to zero when
100 written to.
101
102 /sys/kernel/debug/gcov/path/to/compile/dir/file.gcda
103 The actual gcov data file as understood by the gcov
104 tool. Resets file coverage data to zero when written to.
105
106 /sys/kernel/debug/gcov/path/to/compile/dir/file.gcno
107 Symbolic link to a static data file required by the gcov
108 tool. This file is generated by gcc when compiling with
109 option -ftest-coverage.
110
111
1125. Modules
113==========
114
115Kernel modules may contain cleanup code which is only run during
116module unload time. The gcov mechanism provides a means to collect
117coverage data for such code by keeping a copy of the data associated
118with the unloaded module. This data remains available through debugfs.
119Once the module is loaded again, the associated coverage counters are
120initialized with the data from its previous instantiation.
121
122This behavior can be deactivated by specifying the gcov_persist kernel
123parameter:
124
125 gcov_persist=0
126
127At run-time, a user can also choose to discard data for an unloaded
128module by writing to its data file or the global reset file.
129
130
1316. Separated build and test machines
132====================================
133
134The gcov kernel profiling infrastructure is designed to work out-of-the
135box for setups where kernels are built and run on the same machine. In
136cases where the kernel runs on a separate machine, special preparations
137must be made, depending on where the gcov tool is used:
138
139a) gcov is run on the TEST machine
140
141The gcov tool version on the test machine must be compatible with the
142gcc version used for kernel build. Also the following files need to be
143copied from build to test machine:
144
145from the source tree:
146 - all C source files + headers
147
148from the build tree:
149 - all C source files + headers
150 - all .gcda and .gcno files
151 - all links to directories
152
153It is important to note that these files need to be placed into the
154exact same file system location on the test machine as on the build
155machine. If any of the path components is symbolic link, the actual
156directory needs to be used instead (due to make's CURDIR handling).
157
158b) gcov is run on the BUILD machine
159
160The following files need to be copied after each test case from test
161to build machine:
162
163from the gcov directory in sysfs:
164 - all .gcda files
165 - all links to .gcno files
166
167These files can be copied to any location on the build machine. gcov
168must then be called with the -o option pointing to that directory.
169
170Example directory setup on the build machine:
171
172 /tmp/linux: kernel source tree
173 /tmp/out: kernel build directory as specified by make O=
174 /tmp/coverage: location of the files copied from the test machine
175
176 [user@build] cd /tmp/out
177 [user@build] gcov -o /tmp/coverage/tmp/out/init main.c
178
179
1807. Troubleshooting
181==================
182
183Problem: Compilation aborts during linker step.
184Cause: Profiling flags are specified for source files which are not
185 linked to the main kernel or which are linked by a custom
186 linker procedure.
187Solution: Exclude affected source files from profiling by specifying
188 GCOV_PROFILE := n or GCOV_PROFILE_basename.o := n in the
189 corresponding Makefile.
190
191Problem: Files copied from sysfs appear empty or incomplete.
192Cause: Due to the way seq_file works, some tools such as cp or tar
193 may not correctly copy files from sysfs.
194Solution: Use 'cat' to read .gcda files and 'cp -d' to copy links.
195 Alternatively use the mechanism shown in Appendix B.
196
197
198Appendix A: gather_on_build.sh
199==============================
200
201Sample script to gather coverage meta files on the build machine
202(see 6a):
203#!/bin/bash
204
205KSRC=$1
206KOBJ=$2
207DEST=$3
208
209if [ -z "$KSRC" ] || [ -z "$KOBJ" ] || [ -z "$DEST" ]; then
210 echo "Usage: $0 <ksrc directory> <kobj directory> <output.tar.gz>" >&2
211 exit 1
212fi
213
214KSRC=$(cd $KSRC; printf "all:\n\t@echo \${CURDIR}\n" | make -f -)
215KOBJ=$(cd $KOBJ; printf "all:\n\t@echo \${CURDIR}\n" | make -f -)
216
217find $KSRC $KOBJ \( -name '*.gcno' -o -name '*.[ch]' -o -type l \) -a \
218 -perm /u+r,g+r | tar cfz $DEST -P -T -
219
220if [ $? -eq 0 ] ; then
221 echo "$DEST successfully created, copy to test system and unpack with:"
222 echo " tar xfz $DEST -P"
223else
224 echo "Could not create file $DEST"
225fi
226
227
228Appendix B: gather_on_test.sh
229=============================
230
231Sample script to gather coverage data files on the test machine
232(see 6b):
233
234#!/bin/bash -e
235
236DEST=$1
237GCDA=/sys/kernel/debug/gcov
238
239if [ -z "$DEST" ] ; then
240 echo "Usage: $0 <output.tar.gz>" >&2
241 exit 1
242fi
243
244TEMPDIR=$(mktemp -d)
245echo Collecting data..
246find $GCDA -type d -exec mkdir -p $TEMPDIR/\{\} \;
247find $GCDA -name '*.gcda' -exec sh -c 'cat < $0 > '$TEMPDIR'/$0' {} \;
248find $GCDA -name '*.gcno' -exec sh -c 'cp -d $0 '$TEMPDIR'/$0' {} \;
249tar czf $DEST -C $TEMPDIR sys
250rm -rf $TEMPDIR
251
252echo "$DEST successfully created, copy to build system and unpack with:"
253echo " tar xfz $DEST"
diff --git a/Documentation/hwmon/pcf8591 b/Documentation/hwmon/pcf8591
index 5628fcf4207f..e76a7892f68e 100644
--- a/Documentation/hwmon/pcf8591
+++ b/Documentation/hwmon/pcf8591
@@ -2,11 +2,11 @@ Kernel driver pcf8591
2===================== 2=====================
3 3
4Supported chips: 4Supported chips:
5 * Philips PCF8591 5 * Philips/NXP PCF8591
6 Prefix: 'pcf8591' 6 Prefix: 'pcf8591'
7 Addresses scanned: I2C 0x48 - 0x4f 7 Addresses scanned: I2C 0x48 - 0x4f
8 Datasheet: Publicly available at the Philips Semiconductor website 8 Datasheet: Publicly available at the NXP website
9 http://www.semiconductors.philips.com/pip/PCF8591P.html 9 http://www.nxp.com/pip/PCF8591_6.html
10 10
11Authors: 11Authors:
12 Aurelien Jarno <aurelien@aurel32.net> 12 Aurelien Jarno <aurelien@aurel32.net>
@@ -16,9 +16,10 @@ Authors:
16 16
17Description 17Description
18----------- 18-----------
19
19The PCF8591 is an 8-bit A/D and D/A converter (4 analog inputs and one 20The PCF8591 is an 8-bit A/D and D/A converter (4 analog inputs and one
20analog output) for the I2C bus produced by Philips Semiconductors. It 21analog output) for the I2C bus produced by Philips Semiconductors (now NXP).
21is designed to provide a byte I2C interface to up to 4 separate devices. 22It is designed to provide a byte I2C interface to up to 4 separate devices.
22 23
23The PCF8591 has 4 analog inputs programmable as single-ended or 24The PCF8591 has 4 analog inputs programmable as single-ended or
24differential inputs : 25differential inputs :
@@ -58,8 +59,8 @@ Accessing PCF8591 via /sys interface
58------------------------------------- 59-------------------------------------
59 60
60! Be careful ! 61! Be careful !
61The PCF8591 is plainly impossible to detect ! Stupid chip. 62The PCF8591 is plainly impossible to detect! Stupid chip.
62So every chip with address in the interval [48..4f] is 63So every chip with address in the interval [0x48..0x4f] is
63detected as PCF8591. If you have other chips in this address 64detected as PCF8591. If you have other chips in this address
64range, the workaround is to load this module after the one 65range, the workaround is to load this module after the one
65for your others chips. 66for your others chips.
@@ -67,19 +68,20 @@ for your others chips.
67On detection (i.e. insmod, modprobe et al.), directories are being 68On detection (i.e. insmod, modprobe et al.), directories are being
68created for each detected PCF8591: 69created for each detected PCF8591:
69 70
70/sys/bus/devices/<0>-<1>/ 71/sys/bus/i2c/devices/<0>-<1>/
71where <0> is the bus the chip was detected on (e. g. i2c-0) 72where <0> is the bus the chip was detected on (e. g. i2c-0)
72and <1> the chip address ([48..4f]) 73and <1> the chip address ([48..4f])
73 74
74Inside these directories, there are such files: 75Inside these directories, there are such files:
75in0, in1, in2, in3, out0_enable, out0_output, name 76in0_input, in1_input, in2_input, in3_input, out0_enable, out0_output, name
76 77
77Name contains chip name. 78Name contains chip name.
78 79
79The in0, in1, in2 and in3 files are RO. Reading gives the value of the 80The in0_input, in1_input, in2_input and in3_input files are RO. Reading gives
80corresponding channel. Depending on the current analog inputs configuration, 81the value of the corresponding channel. Depending on the current analog inputs
81files in2 and/or in3 do not exist. Values range are from 0 to 255 for single 82configuration, files in2_input and in3_input may not exist. Values range
82ended inputs and -128 to +127 for differential inputs (8-bit ADC). 83from 0 to 255 for single ended inputs and -128 to +127 for differential inputs
84(8-bit ADC).
83 85
84The out0_enable file is RW. Reading gives "1" for analog output enabled and 86The out0_enable file is RW. Reading gives "1" for analog output enabled and
85"0" for analog output disabled. Writing accepts "0" and "1" accordingly. 87"0" for analog output disabled. Writing accepts "0" and "1" accordingly.
diff --git a/Documentation/hwmon/tmp421 b/Documentation/hwmon/tmp421
new file mode 100644
index 000000000000..0cf07f824741
--- /dev/null
+++ b/Documentation/hwmon/tmp421
@@ -0,0 +1,36 @@
1Kernel driver tmp421
2====================
3
4Supported chips:
5 * Texas Instruments TMP421
6 Prefix: 'tmp421'
7 Addresses scanned: I2C 0x2a, 0x4c, 0x4d, 0x4e and 0x4f
8 Datasheet: http://focus.ti.com/docs/prod/folders/print/tmp421.html
9 * Texas Instruments TMP422
10 Prefix: 'tmp422'
11 Addresses scanned: I2C 0x2a, 0x4c, 0x4d, 0x4e and 0x4f
12 Datasheet: http://focus.ti.com/docs/prod/folders/print/tmp421.html
13 * Texas Instruments TMP423
14 Prefix: 'tmp423'
15 Addresses scanned: I2C 0x2a, 0x4c, 0x4d, 0x4e and 0x4f
16 Datasheet: http://focus.ti.com/docs/prod/folders/print/tmp421.html
17
18Authors:
19 Andre Prendel <andre.prendel@gmx.de>
20
21Description
22-----------
23
24This driver implements support for Texas Instruments TMP421, TMP422
25and TMP423 temperature sensor chips. These chips implement one local
26and up to one (TMP421), up to two (TMP422) or up to three (TMP423)
27remote sensors. Temperature is measured in degrees Celsius. The chips
28are wired over I2C/SMBus and specified over a temperature range of -40
29to +125 degrees Celsius. Resolution for both the local and remote
30channels is 0.0625 degree C.
31
32The chips support only temperature measurement. The driver exports
33the temperature values via the following sysfs files:
34
35temp[1-4]_input
36temp[2-4]_fault
diff --git a/Documentation/i2c/instantiating-devices b/Documentation/i2c/instantiating-devices
index b55ce57a84db..c740b7b41088 100644
--- a/Documentation/i2c/instantiating-devices
+++ b/Documentation/i2c/instantiating-devices
@@ -165,3 +165,47 @@ was done there. Two significant differences are:
165Once again, method 3 should be avoided wherever possible. Explicit device 165Once again, method 3 should be avoided wherever possible. Explicit device
166instantiation (methods 1 and 2) is much preferred for it is safer and 166instantiation (methods 1 and 2) is much preferred for it is safer and
167faster. 167faster.
168
169
170Method 4: Instantiate from user-space
171-------------------------------------
172
173In general, the kernel should know which I2C devices are connected and
174what addresses they live at. However, in certain cases, it does not, so a
175sysfs interface was added to let the user provide the information. This
176interface is made of 2 attribute files which are created in every I2C bus
177directory: new_device and delete_device. Both files are write only and you
178must write the right parameters to them in order to properly instantiate,
179respectively delete, an I2C device.
180
181File new_device takes 2 parameters: the name of the I2C device (a string)
182and the address of the I2C device (a number, typically expressed in
183hexadecimal starting with 0x, but can also be expressed in decimal.)
184
185File delete_device takes a single parameter: the address of the I2C
186device. As no two devices can live at the same address on a given I2C
187segment, the address is sufficient to uniquely identify the device to be
188deleted.
189
190Example:
191# echo eeprom 0x50 > /sys/class/i2c-adapter/i2c-3/new_device
192
193While this interface should only be used when in-kernel device declaration
194can't be done, there is a variety of cases where it can be helpful:
195* The I2C driver usually detects devices (method 3 above) but the bus
196 segment your device lives on doesn't have the proper class bit set and
197 thus detection doesn't trigger.
198* The I2C driver usually detects devices, but your device lives at an
199 unexpected address.
200* The I2C driver usually detects devices, but your device is not detected,
201 either because the detection routine is too strict, or because your
202 device is not officially supported yet but you know it is compatible.
203* You are developing a driver on a test board, where you soldered the I2C
204 device yourself.
205
206This interface is a replacement for the force_* module parameters some I2C
207drivers implement. Being implemented in i2c-core rather than in each
208device driver individually, it is much more efficient, and also has the
209advantage that you do not have to reload the driver to change a setting.
210You can also instantiate the device before the driver is loaded or even
211available, and you don't need to know what driver the device needs.
diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients
index c1a06f989cf7..7860aafb483d 100644
--- a/Documentation/i2c/writing-clients
+++ b/Documentation/i2c/writing-clients
@@ -126,19 +126,9 @@ different) configuration information, as do drivers handling chip variants
126that can't be distinguished by protocol probing, or which need some board 126that can't be distinguished by protocol probing, or which need some board
127specific information to operate correctly. 127specific information to operate correctly.
128 128
129Accordingly, the I2C stack now has two models for associating I2C devices
130with their drivers: the original "legacy" model, and a newer one that's
131fully compatible with the Linux 2.6 driver model. These models do not mix,
132since the "legacy" model requires drivers to create "i2c_client" device
133objects after SMBus style probing, while the Linux driver model expects
134drivers to be given such device objects in their probe() routines.
135 129
136The legacy model is deprecated now and will soon be removed, so we no 130Device/Driver Binding
137longer document it here. 131---------------------
138
139
140Standard Driver Model Binding ("New Style")
141-------------------------------------------
142 132
143System infrastructure, typically board-specific initialization code or 133System infrastructure, typically board-specific initialization code or
144boot firmware, reports what I2C devices exist. For example, there may be 134boot firmware, reports what I2C devices exist. For example, there may be
@@ -201,7 +191,7 @@ a given I2C bus. This is for example the case of hardware monitoring
201devices on a PC's SMBus. In that case, you may want to let your driver 191devices on a PC's SMBus. In that case, you may want to let your driver
202detect supported devices automatically. This is how the legacy model 192detect supported devices automatically. This is how the legacy model
203was working, and is now available as an extension to the standard 193was working, and is now available as an extension to the standard
204driver model (so that we can finally get rid of the legacy model.) 194driver model.
205 195
206You simply have to define a detect callback which will attempt to 196You simply have to define a detect callback which will attempt to
207identify supported devices (returning 0 for supported ones and -ENODEV 197identify supported devices (returning 0 for supported ones and -ENODEV
diff --git a/Documentation/input/input.txt b/Documentation/input/input.txt
index 686ee9932dff..b93c08442e3c 100644
--- a/Documentation/input/input.txt
+++ b/Documentation/input/input.txt
@@ -278,7 +278,7 @@ struct input_event {
278}; 278};
279 279
280 'time' is the timestamp, it returns the time at which the event happened. 280 'time' is the timestamp, it returns the time at which the event happened.
281Type is for example EV_REL for relative moment, REL_KEY for a keypress or 281Type is for example EV_REL for relative moment, EV_KEY for a keypress or
282release. More types are defined in include/linux/input.h. 282release. More types are defined in include/linux/input.h.
283 283
284 'code' is event code, for example REL_X or KEY_BACKSPACE, again a complete 284 'code' is event code, for example REL_X or KEY_BACKSPACE, again a complete
diff --git a/Documentation/input/rotary-encoder.txt b/Documentation/input/rotary-encoder.txt
index 435102a26d96..3a6aec40c0b0 100644
--- a/Documentation/input/rotary-encoder.txt
+++ b/Documentation/input/rotary-encoder.txt
@@ -67,7 +67,12 @@ data with it.
67struct rotary_encoder_platform_data is declared in 67struct rotary_encoder_platform_data is declared in
68include/linux/rotary-encoder.h and needs to be filled with the number of 68include/linux/rotary-encoder.h and needs to be filled with the number of
69steps the encoder has and can carry information about externally inverted 69steps the encoder has and can carry information about externally inverted
70signals (because of used invertig buffer or other reasons). 70signals (because of an inverting buffer or other reasons). The encoder
71can be set up to deliver input information as either an absolute or relative
72axes. For relative axes the input event returns +/-1 for each step. For
73absolute axes the position of the encoder can either roll over between zero
74and the number of steps or will clamp at the maximum and zero depending on
75the configuration.
71 76
72Because GPIO to IRQ mapping is platform specific, this information must 77Because GPIO to IRQ mapping is platform specific, this information must
73be given in seperately to the driver. See the example below. 78be given in seperately to the driver. See the example below.
@@ -85,6 +90,8 @@ be given in seperately to the driver. See the example below.
85static struct rotary_encoder_platform_data my_rotary_encoder_info = { 90static struct rotary_encoder_platform_data my_rotary_encoder_info = {
86 .steps = 24, 91 .steps = 24,
87 .axis = ABS_X, 92 .axis = ABS_X,
93 .relative_axis = false,
94 .rollover = false,
88 .gpio_a = GPIO_ROTARY_A, 95 .gpio_a = GPIO_ROTARY_A,
89 .gpio_b = GPIO_ROTARY_B, 96 .gpio_b = GPIO_ROTARY_B,
90 .inverted_a = 0, 97 .inverted_a = 0,
diff --git a/Documentation/input/sentelic.txt b/Documentation/input/sentelic.txt
new file mode 100644
index 000000000000..f7160a2fb6a2
--- /dev/null
+++ b/Documentation/input/sentelic.txt
@@ -0,0 +1,475 @@
1Copyright (C) 2002-2008 Sentelic Corporation.
2Last update: Oct-31-2008
3
4==============================================================================
5* Finger Sensing Pad Intellimouse Mode(scrolling wheel, 4th and 5th buttons)
6==============================================================================
7A) MSID 4: Scrolling wheel mode plus Forward page(4th button) and Backward
8 page (5th button)
9@1. Set sample rate to 200;
10@2. Set sample rate to 200;
11@3. Set sample rate to 80;
12@4. Issuing the "Get device ID" command (0xF2) and waits for the response;
13@5. FSP will respond 0x04.
14
15Packet 1
16 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
17BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
18 1 |Y|X|y|x|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 | | |B|F|W|W|W|W|
19 |---------------| |---------------| |---------------| |---------------|
20
21Byte 1: Bit7 => Y overflow
22 Bit6 => X overflow
23 Bit5 => Y sign bit
24 Bit4 => X sign bit
25 Bit3 => 1
26 Bit2 => Middle Button, 1 is pressed, 0 is not pressed.
27 Bit1 => Right Button, 1 is pressed, 0 is not pressed.
28 Bit0 => Left Button, 1 is pressed, 0 is not pressed.
29Byte 2: X Movement(9-bit 2's complement integers)
30Byte 3: Y Movement(9-bit 2's complement integers)
31Byte 4: Bit3~Bit0 => the scrolling wheel's movement since the last data report.
32 valid values, -8 ~ +7
33 Bit4 => 1 = 4th mouse button is pressed, Forward one page.
34 0 = 4th mouse button is not pressed.
35 Bit5 => 1 = 5th mouse button is pressed, Backward one page.
36 0 = 5th mouse button is not pressed.
37
38B) MSID 6: Horizontal and Vertical scrolling.
39@ Set bit 1 in register 0x40 to 1
40
41# FSP replaces scrolling wheel's movement as 4 bits to show horizontal and
42 vertical scrolling.
43
44Packet 1
45 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
46BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
47 1 |Y|X|y|x|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 | | |B|F|l|r|u|d|
48 |---------------| |---------------| |---------------| |---------------|
49
50Byte 1: Bit7 => Y overflow
51 Bit6 => X overflow
52 Bit5 => Y sign bit
53 Bit4 => X sign bit
54 Bit3 => 1
55 Bit2 => Middle Button, 1 is pressed, 0 is not pressed.
56 Bit1 => Right Button, 1 is pressed, 0 is not pressed.
57 Bit0 => Left Button, 1 is pressed, 0 is not pressed.
58Byte 2: X Movement(9-bit 2's complement integers)
59Byte 3: Y Movement(9-bit 2's complement integers)
60Byte 4: Bit0 => the Vertical scrolling movement downward.
61 Bit1 => the Vertical scrolling movement upward.
62 Bit2 => the Vertical scrolling movement rightward.
63 Bit3 => the Vertical scrolling movement leftward.
64 Bit4 => 1 = 4th mouse button is pressed, Forward one page.
65 0 = 4th mouse button is not pressed.
66 Bit5 => 1 = 5th mouse button is pressed, Backward one page.
67 0 = 5th mouse button is not pressed.
68
69C) MSID 7:
70# FSP uses 2 packets(8 Bytes) data to represent Absolute Position
71 so we have PACKET NUMBER to identify packets.
72 If PACKET NUMBER is 0, the packet is Packet 1.
73 If PACKET NUMBER is 1, the packet is Packet 2.
74 Please count this number in program.
75
76# MSID6 special packet will be enable at the same time when enable MSID 7.
77
78==============================================================================
79* Absolute position for STL3886-G0.
80==============================================================================
81@ Set bit 2 or 3 in register 0x40 to 1
82@ Set bit 6 in register 0x40 to 1
83
84Packet 1 (ABSOLUTE POSITION)
85 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
86BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
87 1 |0|1|V|1|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|d|u|X|X|Y|Y|
88 |---------------| |---------------| |---------------| |---------------|
89
90Byte 1: Bit7~Bit6 => 00, Normal data packet
91 => 01, Absolute coordination packet
92 => 10, Notify packet
93 Bit5 => valid bit
94 Bit4 => 1
95 Bit3 => 1
96 Bit2 => Middle Button, 1 is pressed, 0 is not pressed.
97 Bit1 => Right Button, 1 is pressed, 0 is not pressed.
98 Bit0 => Left Button, 1 is pressed, 0 is not pressed.
99Byte 2: X coordinate (xpos[9:2])
100Byte 3: Y coordinate (ypos[9:2])
101Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0])
102 Bit3~Bit2 => X coordinate (ypos[1:0])
103 Bit4 => scroll up
104 Bit5 => scroll down
105 Bit6 => scroll left
106 Bit7 => scroll right
107
108Notify Packet for G0
109 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
110BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
111 1 |1|0|0|1|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |M|M|M|M|M|M|M|M| 4 |0|0|0|0|0|0|0|0|
112 |---------------| |---------------| |---------------| |---------------|
113
114Byte 1: Bit7~Bit6 => 00, Normal data packet
115 => 01, Absolute coordination packet
116 => 10, Notify packet
117 Bit5 => 0
118 Bit4 => 1
119 Bit3 => 1
120 Bit2 => Middle Button, 1 is pressed, 0 is not pressed.
121 Bit1 => Right Button, 1 is pressed, 0 is not pressed.
122 Bit0 => Left Button, 1 is pressed, 0 is not pressed.
123Byte 2: Message Type => 0x5A (Enable/Disable status packet)
124 Mode Type => 0xA5 (Normal/Icon mode status)
125Byte 3: Message Type => 0x00 (Disabled)
126 => 0x01 (Enabled)
127 Mode Type => 0x00 (Normal)
128 => 0x01 (Icon)
129Byte 4: Bit7~Bit0 => Don't Care
130
131==============================================================================
132* Absolute position for STL3888-A0.
133==============================================================================
134Packet 1 (ABSOLUTE POSITION)
135 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
136BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
137 1 |0|1|V|A|1|L|0|1| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |x|x|y|y|X|X|Y|Y|
138 |---------------| |---------------| |---------------| |---------------|
139
140Byte 1: Bit7~Bit6 => 00, Normal data packet
141 => 01, Absolute coordination packet
142 => 10, Notify packet
143 Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up.
144 When both fingers are up, the last two reports have zero valid
145 bit.
146 Bit4 => arc
147 Bit3 => 1
148 Bit2 => Left Button, 1 is pressed, 0 is released.
149 Bit1 => 0
150 Bit0 => 1
151Byte 2: X coordinate (xpos[9:2])
152Byte 3: Y coordinate (ypos[9:2])
153Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0])
154 Bit3~Bit2 => X coordinate (ypos[1:0])
155 Bit5~Bit4 => y1_g
156 Bit7~Bit6 => x1_g
157
158Packet 2 (ABSOLUTE POSITION)
159 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
160BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
161 1 |0|1|V|A|1|R|1|0| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |x|x|y|y|X|X|Y|Y|
162 |---------------| |---------------| |---------------| |---------------|
163
164Byte 1: Bit7~Bit6 => 00, Normal data packet
165 => 01, Absolute coordinates packet
166 => 10, Notify packet
167 Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up.
168 When both fingers are up, the last two reports have zero valid
169 bit.
170 Bit4 => arc
171 Bit3 => 1
172 Bit2 => Right Button, 1 is pressed, 0 is released.
173 Bit1 => 1
174 Bit0 => 0
175Byte 2: X coordinate (xpos[9:2])
176Byte 3: Y coordinate (ypos[9:2])
177Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0])
178 Bit3~Bit2 => X coordinate (ypos[1:0])
179 Bit5~Bit4 => y2_g
180 Bit7~Bit6 => x2_g
181
182Notify Packet for STL3888-A0
183 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
184BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
185 1 |1|0|1|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|d|u|0|0|0|0|
186 |---------------| |---------------| |---------------| |---------------|
187
188Byte 1: Bit7~Bit6 => 00, Normal data packet
189 => 01, Absolute coordination packet
190 => 10, Notify packet
191 Bit5 => 1
192 Bit4 => when in absolute coordinates mode (valid when EN_PKT_GO is 1):
193 0: left button is generated by the on-pad command
194 1: left button is generated by the external button
195 Bit3 => 1
196 Bit2 => Middle Button, 1 is pressed, 0 is not pressed.
197 Bit1 => Right Button, 1 is pressed, 0 is not pressed.
198 Bit0 => Left Button, 1 is pressed, 0 is not pressed.
199Byte 2: Message Type => 0xB7 (Multi Finger, Multi Coordinate mode)
200Byte 3: Bit7~Bit6 => Don't care
201 Bit5~Bit4 => Number of fingers
202 Bit3~Bit1 => Reserved
203 Bit0 => 1: enter gesture mode; 0: leaving gesture mode
204Byte 4: Bit7 => scroll right button
205 Bit6 => scroll left button
206 Bit5 => scroll down button
207 Bit4 => scroll up button
208 * Note that if gesture and additional button (Bit4~Bit7)
209 happen at the same time, the button information will not
210 be sent.
211 Bit3~Bit0 => Reserved
212
213Sample sequence of Multi-finger, Multi-coordinate mode:
214
215 notify packet (valid bit == 1), abs pkt 1, abs pkt 2, abs pkt 1,
216 abs pkt 2, ..., notify packet(valid bit == 0)
217
218==============================================================================
219* FSP Enable/Disable packet
220==============================================================================
221 Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0
222BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------|
223 1 |Y|X|0|0|1|M|R|L| 2 |0|1|0|1|1|0|1|E| 3 | | | | | | | | | 4 | | | | | | | | |
224 |---------------| |---------------| |---------------| |---------------|
225
226FSP will send out enable/disable packet when FSP receive PS/2 enable/disable
227command. Host will receive the packet which Middle, Right, Left button will
228be set. The packet only use byte 0 and byte 1 as a pattern of original packet.
229Ignore the other bytes of the packet.
230
231Byte 1: Bit7 => 0, Y overflow
232 Bit6 => 0, X overflow
233 Bit5 => 0, Y sign bit
234 Bit4 => 0, X sign bit
235 Bit3 => 1
236 Bit2 => 1, Middle Button
237 Bit1 => 1, Right Button
238 Bit0 => 1, Left Button
239Byte 2: Bit7~1 => (0101101b)
240 Bit0 => 1 = Enable
241 0 = Disable
242Byte 3: Don't care
243Byte 4: Don't care (MOUSE ID 3, 4)
244Byte 5~8: Don't care (Absolute packet)
245
246==============================================================================
247* PS/2 Command Set
248==============================================================================
249
250FSP supports basic PS/2 commanding set and modes, refer to following URL for
251details about PS/2 commands:
252
253http://www.computer-engineering.org/index.php?title=PS/2_Mouse_Interface
254
255==============================================================================
256* Programming Sequence for Determining Packet Parsing Flow
257==============================================================================
2581. Identify FSP by reading device ID(0x00) and version(0x01) register
259
2602. Determine number of buttons by reading status2 (0x0b) register
261
262 buttons = reg[0x0b] & 0x30
263
264 if buttons == 0x30 or buttons == 0x20:
265 # two/four buttons
266 Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse'
267 section A for packet parsing detail(ignore byte 4, bit ~ 7)
268 elif buttons == 0x10:
269 # 6 buttons
270 Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse'
271 section B for packet parsing detail
272 elif buttons == 0x00:
273 # 6 buttons
274 Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse'
275 section A for packet parsing detail
276
277==============================================================================
278* Programming Sequence for Register Reading/Writing
279==============================================================================
280
281Register inversion requirement:
282
283 Following values needed to be inverted(the '~' operator in C) before being
284sent to FSP:
285
286 0xe9, 0xee, 0xf2 and 0xff.
287
288Register swapping requirement:
289
290 Following values needed to have their higher 4 bits and lower 4 bits being
291swapped before being sent to FSP:
292
293 10, 20, 40, 60, 80, 100 and 200.
294
295Register reading sequence:
296
297 1. send 0xf3 PS/2 command to FSP;
298
299 2. send 0x66 PS/2 command to FSP;
300
301 3. send 0x88 PS/2 command to FSP;
302
303 4. send 0xf3 PS/2 command to FSP;
304
305 5. if the register address being to read is not required to be
306 inverted(refer to the 'Register inversion requirement' section),
307 goto step 6
308
309 5a. send 0x68 PS/2 command to FSP;
310
311 5b. send the inverted register address to FSP and goto step 8;
312
313 6. if the register address being to read is not required to be
314 swapped(refer to the 'Register swapping requirement' section),
315 goto step 7
316
317 6a. send 0xcc PS/2 command to FSP;
318
319 6b. send the swapped register address to FSP and goto step 8;
320
321 7. send 0x66 PS/2 command to FSP;
322
323 7a. send the original register address to FSP and goto step 8;
324
325 8. send 0xe9(status request) PS/2 command to FSP;
326
327 9. the response read from FSP should be the requested register value.
328
329Register writing sequence:
330
331 1. send 0xf3 PS/2 command to FSP;
332
333 2. if the register address being to write is not required to be
334 inverted(refer to the 'Register inversion requirement' section),
335 goto step 3
336
337 2a. send 0x74 PS/2 command to FSP;
338
339 2b. send the inverted register address to FSP and goto step 5;
340
341 3. if the register address being to write is not required to be
342 swapped(refer to the 'Register swapping requirement' section),
343 goto step 4
344
345 3a. send 0x77 PS/2 command to FSP;
346
347 3b. send the swapped register address to FSP and goto step 5;
348
349 4. send 0x55 PS/2 command to FSP;
350
351 4a. send the register address to FSP and goto step 5;
352
353 5. send 0xf3 PS/2 command to FSP;
354
355 6. if the register value being to write is not required to be
356 inverted(refer to the 'Register inversion requirement' section),
357 goto step 7
358
359 6a. send 0x47 PS/2 command to FSP;
360
361 6b. send the inverted register value to FSP and goto step 9;
362
363 7. if the register value being to write is not required to be
364 swapped(refer to the 'Register swapping requirement' section),
365 goto step 8
366
367 7a. send 0x44 PS/2 command to FSP;
368
369 7b. send the swapped register value to FSP and goto step 9;
370
371 8. send 0x33 PS/2 command to FSP;
372
373 8a. send the register value to FSP;
374
375 9. the register writing sequence is completed.
376
377==============================================================================
378* Register Listing
379==============================================================================
380
381offset width default r/w name
3820x00 bit7~bit0 0x01 RO device ID
383
3840x01 bit7~bit0 0xc0 RW version ID
385
3860x02 bit7~bit0 0x01 RO vendor ID
387
3880x03 bit7~bit0 0x01 RO product ID
389
3900x04 bit3~bit0 0x01 RW revision ID
391
3920x0b RO test mode status 1
393 bit3 1 RO 0: rotate 180 degree, 1: no rotation
394
395 bit5~bit4 RO number of buttons
396 11 => 2, lbtn/rbtn
397 10 => 4, lbtn/rbtn/scru/scrd
398 01 => 6, lbtn/rbtn/scru/scrd/scrl/scrr
399 00 => 6, lbtn/rbtn/scru/scrd/fbtn/bbtn
400
4010x0f RW register file page control
402 bit0 0 RW 1 to enable page 1 register files
403
4040x10 RW system control 1
405 bit0 1 RW Reserved, must be 1
406 bit1 0 RW Reserved, must be 0
407 bit4 1 RW Reserved, must be 0
408 bit5 0 RW register clock gating enable
409 0: read only, 1: read/write enable
410 (Note that following registers does not require clock gating being
411 enabled prior to write: 05 06 07 08 09 0c 0f 10 11 12 16 17 18 23 2e
412 40 41 42 43.)
413
4140x31 RW on-pad command detection
415 bit7 0 RW on-pad command left button down tag
416 enable
417 0: disable, 1: enable
418
4190x34 RW on-pad command control 5
420 bit4~bit0 0x05 RW XLO in 0s/4/1, so 03h = 0010.1b = 2.5
421 (Note that position unit is in 0.5 scanline)
422
423 bit7 0 RW on-pad tap zone enable
424 0: disable, 1: enable
425
4260x35 RW on-pad command control 6
427 bit4~bit0 0x1d RW XHI in 0s/4/1, so 19h = 1100.1b = 12.5
428 (Note that position unit is in 0.5 scanline)
429
4300x36 RW on-pad command control 7
431 bit4~bit0 0x04 RW YLO in 0s/4/1, so 03h = 0010.1b = 2.5
432 (Note that position unit is in 0.5 scanline)
433
4340x37 RW on-pad command control 8
435 bit4~bit0 0x13 RW YHI in 0s/4/1, so 11h = 1000.1b = 8.5
436 (Note that position unit is in 0.5 scanline)
437
4380x40 RW system control 5
439 bit1 0 RW FSP Intellimouse mode enable
440 0: disable, 1: enable
441
442 bit2 0 RW movement + abs. coordinate mode enable
443 0: disable, 1: enable
444 (Note that this function has the functionality of bit 1 even when
445 bit 1 is not set. However, the format is different from that of bit 1.
446 In addition, when bit 1 and bit 2 are set at the same time, bit 2 will
447 override bit 1.)
448
449 bit3 0 RW abs. coordinate only mode enable
450 0: disable, 1: enable
451 (Note that this function has the functionality of bit 1 even when
452 bit 1 is not set. However, the format is different from that of bit 1.
453 In addition, when bit 1, bit 2 and bit 3 are set at the same time,
454 bit 3 will override bit 1 and 2.)
455
456 bit5 0 RW auto switch enable
457 0: disable, 1: enable
458
459 bit6 0 RW G0 abs. + notify packet format enable
460 0: disable, 1: enable
461 (Note that the absolute/relative coordinate output still depends on
462 bit 2 and 3. That is, if any of those bit is 1, host will receive
463 absolute coordinates; otherwise, host only receives packets with
464 relative coordinate.)
465
4660x43 RW on-pad control
467 bit0 0 RW on-pad control enable
468 0: disable, 1: enable
469 (Note that if this bit is cleared, bit 3/5 will be ineffective)
470
471 bit3 0 RW on-pad fix vertical scrolling enable
472 0: disable, 1: enable
473
474 bit5 0 RW on-pad fix horizontal scrolling enable
475 0: disable, 1: enable
diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt
new file mode 100644
index 000000000000..f40a1f030019
--- /dev/null
+++ b/Documentation/intel_txt.txt
@@ -0,0 +1,210 @@
1Intel(R) TXT Overview:
2=====================
3
4Intel's technology for safer computing, Intel(R) Trusted Execution
5Technology (Intel(R) TXT), defines platform-level enhancements that
6provide the building blocks for creating trusted platforms.
7
8Intel TXT was formerly known by the code name LaGrande Technology (LT).
9
10Intel TXT in Brief:
11o Provides dynamic root of trust for measurement (DRTM)
12o Data protection in case of improper shutdown
13o Measurement and verification of launched environment
14
15Intel TXT is part of the vPro(TM) brand and is also available some
16non-vPro systems. It is currently available on desktop systems
17based on the Q35, X38, Q45, and Q43 Express chipsets (e.g. Dell
18Optiplex 755, HP dc7800, etc.) and mobile systems based on the GM45,
19PM45, and GS45 Express chipsets.
20
21For more information, see http://www.intel.com/technology/security/.
22This site also has a link to the Intel TXT MLE Developers Manual,
23which has been updated for the new released platforms.
24
25Intel TXT has been presented at various events over the past few
26years, some of which are:
27 LinuxTAG 2008:
28 http://www.linuxtag.org/2008/en/conf/events/vp-donnerstag/
29 details.html?talkid=110
30 TRUST2008:
31 http://www.trust2008.eu/downloads/Keynote-Speakers/
32 3_David-Grawrock_The-Front-Door-of-Trusted-Computing.pdf
33 IDF 2008, Shanghai:
34 http://inteldeveloperforum.com.edgesuite.net/shanghai_2008/
35 aep/PROS003/index.html
36 IDFs 2006, 2007 (I'm not sure if/where they are online)
37
38Trusted Boot Project Overview:
39=============================
40
41Trusted Boot (tboot) is an open source, pre- kernel/VMM module that
42uses Intel TXT to perform a measured and verified launch of an OS
43kernel/VMM.
44
45It is hosted on SourceForge at http://sourceforge.net/projects/tboot.
46The mercurial source repo is available at http://www.bughost.org/
47repos.hg/tboot.hg.
48
49Tboot currently supports launching Xen (open source VMM/hypervisor
50w/ TXT support since v3.2), and now Linux kernels.
51
52
53Value Proposition for Linux or "Why should you care?"
54=====================================================
55
56While there are many products and technologies that attempt to
57measure or protect the integrity of a running kernel, they all
58assume the kernel is "good" to begin with. The Integrity
59Measurement Architecture (IMA) and Linux Integrity Module interface
60are examples of such solutions.
61
62To get trust in the initial kernel without using Intel TXT, a
63static root of trust must be used. This bases trust in BIOS
64starting at system reset and requires measurement of all code
65executed between system reset through the completion of the kernel
66boot as well as data objects used by that code. In the case of a
67Linux kernel, this means all of BIOS, any option ROMs, the
68bootloader and the boot config. In practice, this is a lot of
69code/data, much of which is subject to change from boot to boot
70(e.g. changing NICs may change option ROMs). Without reference
71hashes, these measurement changes are difficult to assess or
72confirm as benign. This process also does not provide DMA
73protection, memory configuration/alias checks and locks, crash
74protection, or policy support.
75
76By using the hardware-based root of trust that Intel TXT provides,
77many of these issues can be mitigated. Specifically: many
78pre-launch components can be removed from the trust chain, DMA
79protection is provided to all launched components, a large number
80of platform configuration checks are performed and values locked,
81protection is provided for any data in the event of an improper
82shutdown, and there is support for policy-based execution/verification.
83This provides a more stable measurement and a higher assurance of
84system configuration and initial state than would be otherwise
85possible. Since the tboot project is open source, source code for
86almost all parts of the trust chain is available (excepting SMM and
87Intel-provided firmware).
88
89How Does it Work?
90=================
91
92o Tboot is an executable that is launched by the bootloader as
93 the "kernel" (the binary the bootloader executes).
94o It performs all of the work necessary to determine if the
95 platform supports Intel TXT and, if so, executes the GETSEC[SENTER]
96 processor instruction that initiates the dynamic root of trust.
97 - If tboot determines that the system does not support Intel TXT
98 or is not configured correctly (e.g. the SINIT AC Module was
99 incorrect), it will directly launch the kernel with no changes
100 to any state.
101 - Tboot will output various information about its progress to the
102 terminal, serial port, and/or an in-memory log; the output
103 locations can be configured with a command line switch.
104o The GETSEC[SENTER] instruction will return control to tboot and
105 tboot then verifies certain aspects of the environment (e.g. TPM NV
106 lock, e820 table does not have invalid entries, etc.).
107o It will wake the APs from the special sleep state the GETSEC[SENTER]
108 instruction had put them in and place them into a wait-for-SIPI
109 state.
110 - Because the processors will not respond to an INIT or SIPI when
111 in the TXT environment, it is necessary to create a small VT-x
112 guest for the APs. When they run in this guest, they will
113 simply wait for the INIT-SIPI-SIPI sequence, which will cause
114 VMEXITs, and then disable VT and jump to the SIPI vector. This
115 approach seemed like a better choice than having to insert
116 special code into the kernel's MP wakeup sequence.
117o Tboot then applies an (optional) user-defined launch policy to
118 verify the kernel and initrd.
119 - This policy is rooted in TPM NV and is described in the tboot
120 project. The tboot project also contains code for tools to
121 create and provision the policy.
122 - Policies are completely under user control and if not present
123 then any kernel will be launched.
124 - Policy action is flexible and can include halting on failures
125 or simply logging them and continuing.
126o Tboot adjusts the e820 table provided by the bootloader to reserve
127 its own location in memory as well as to reserve certain other
128 TXT-related regions.
129o As part of it's launch, tboot DMA protects all of RAM (using the
130 VT-d PMRs). Thus, the kernel must be booted with 'intel_iommu=on'
131 in order to remove this blanket protection and use VT-d's
132 page-level protection.
133o Tboot will populate a shared page with some data about itself and
134 pass this to the Linux kernel as it transfers control.
135 - The location of the shared page is passed via the boot_params
136 struct as a physical address.
137o The kernel will look for the tboot shared page address and, if it
138 exists, map it.
139o As one of the checks/protections provided by TXT, it makes a copy
140 of the VT-d DMARs in a DMA-protected region of memory and verifies
141 them for correctness. The VT-d code will detect if the kernel was
142 launched with tboot and use this copy instead of the one in the
143 ACPI table.
144o At this point, tboot and TXT are out of the picture until a
145 shutdown (S<n>)
146o In order to put a system into any of the sleep states after a TXT
147 launch, TXT must first be exited. This is to prevent attacks that
148 attempt to crash the system to gain control on reboot and steal
149 data left in memory.
150 - The kernel will perform all of its sleep preparation and
151 populate the shared page with the ACPI data needed to put the
152 platform in the desired sleep state.
153 - Then the kernel jumps into tboot via the vector specified in the
154 shared page.
155 - Tboot will clean up the environment and disable TXT, then use the
156 kernel-provided ACPI information to actually place the platform
157 into the desired sleep state.
158 - In the case of S3, tboot will also register itself as the resume
159 vector. This is necessary because it must re-establish the
160 measured environment upon resume. Once the TXT environment
161 has been restored, it will restore the TPM PCRs and then
162 transfer control back to the kernel's S3 resume vector.
163 In order to preserve system integrity across S3, the kernel
164 provides tboot with a set of memory ranges (kernel
165 code/data/bss, S3 resume code, and AP trampoline) that tboot
166 will calculate a MAC (message authentication code) over and then
167 seal with the TPM. On resume and once the measured environment
168 has been re-established, tboot will re-calculate the MAC and
169 verify it against the sealed value. Tboot's policy determines
170 what happens if the verification fails.
171
172That's pretty much it for TXT support.
173
174
175Configuring the System:
176======================
177
178This code works with 32bit, 32bit PAE, and 64bit (x86_64) kernels.
179
180In BIOS, the user must enable: TPM, TXT, VT-x, VT-d. Not all BIOSes
181allow these to be individually enabled/disabled and the screens in
182which to find them are BIOS-specific.
183
184grub.conf needs to be modified as follows:
185 title Linux 2.6.29-tip w/ tboot
186 root (hd0,0)
187 kernel /tboot.gz logging=serial,vga,memory
188 module /vmlinuz-2.6.29-tip intel_iommu=on ro
189 root=LABEL=/ rhgb console=ttyS0,115200 3
190 module /initrd-2.6.29-tip.img
191 module /Q35_SINIT_17.BIN
192
193The kernel option for enabling Intel TXT support is found under the
194Security top-level menu and is called "Enable Intel(R) Trusted
195Execution Technology (TXT)". It is marked as EXPERIMENTAL and
196depends on the generic x86 support (to allow maximum flexibility in
197kernel build options), since the tboot code will detect whether the
198platform actually supports Intel TXT and thus whether any of the
199kernel code is executed.
200
201The Q35_SINIT_17.BIN file is what Intel TXT refers to as an
202Authenticated Code Module. It is specific to the chipset in the
203system and can also be found on the Trusted Boot site. It is an
204(unencrypted) module signed by Intel that is used as part of the
205DRTM process to verify and configure the system. It is signed
206because it operates at a higher privilege level in the system than
207any other macrocode and its correct operation is critical to the
208establishment of the DRTM. The process for determining the correct
209SINIT ACM for a system is documented in the SINIT-guide.txt file
210that is on the tboot SourceForge site under the SINIT ACM downloads.
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 1f779a25c703..aafca0a8f66a 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -121,6 +121,7 @@ Code Seq# Include File Comments
121'c' 00-7F linux/comstats.h conflict! 121'c' 00-7F linux/comstats.h conflict!
122'c' 00-7F linux/coda.h conflict! 122'c' 00-7F linux/coda.h conflict!
123'c' 80-9F arch/s390/include/asm/chsc.h 123'c' 80-9F arch/s390/include/asm/chsc.h
124'c' A0-AF arch/x86/include/asm/msr.h
124'd' 00-FF linux/char/drm/drm/h conflict! 125'd' 00-FF linux/char/drm/drm/h conflict!
125'd' F0-FF linux/digi1.h 126'd' F0-FF linux/digi1.h
126'e' all linux/digi1.h conflict! 127'e' all linux/digi1.h conflict!
@@ -139,6 +140,7 @@ Code Seq# Include File Comments
139'm' all linux/synclink.h conflict! 140'm' all linux/synclink.h conflict!
140'm' 00-1F net/irda/irmod.h conflict! 141'm' 00-1F net/irda/irmod.h conflict!
141'n' 00-7F linux/ncp_fs.h 142'n' 00-7F linux/ncp_fs.h
143'n' 80-8F linux/nilfs2_fs.h NILFS2
142'n' E0-FF video/matrox.h matroxfb 144'n' E0-FF video/matrox.h matroxfb
143'o' 00-1F fs/ocfs2/ocfs2_fs.h OCFS2 145'o' 00-1F fs/ocfs2/ocfs2_fs.h OCFS2
144'o' 00-03 include/mtd/ubi-user.h conflict! (OCFS2 and UBI overlaps) 146'o' 00-03 include/mtd/ubi-user.h conflict! (OCFS2 and UBI overlaps)
@@ -149,6 +151,8 @@ Code Seq# Include File Comments
149'p' 40-7F linux/nvram.h 151'p' 40-7F linux/nvram.h
150'p' 80-9F user-space parport 152'p' 80-9F user-space parport
151 <mailto:tim@cyberelk.net> 153 <mailto:tim@cyberelk.net>
154'p' a1-a4 linux/pps.h LinuxPPS
155 <mailto:giometti@linux.it>
152'q' 00-1F linux/serio.h 156'q' 00-1F linux/serio.h
153'q' 80-FF Internet PhoneJACK, Internet LineJACK 157'q' 80-FF Internet PhoneJACK, Internet LineJACK
154 <http://www.quicknet.net> 158 <http://www.quicknet.net>
@@ -189,7 +193,7 @@ Code Seq# Include File Comments
1890xAD 00 Netfilter device in development: 1930xAD 00 Netfilter device in development:
190 <mailto:rusty@rustcorp.com.au> 194 <mailto:rusty@rustcorp.com.au>
1910xAE all linux/kvm.h Kernel-based Virtual Machine 1950xAE all linux/kvm.h Kernel-based Virtual Machine
192 <mailto:kvm-devel@lists.sourceforge.net> 196 <mailto:kvm@vger.kernel.org>
1930xB0 all RATIO devices in development: 1970xB0 all RATIO devices in development:
194 <mailto:vgo@ratio.de> 198 <mailto:vgo@ratio.de>
1950xB1 00-1F PPPoX <mailto:mostrows@styx.uwaterloo.ca> 1990xB1 00-1F PPPoX <mailto:mostrows@styx.uwaterloo.ca>
diff --git a/Documentation/isdn/00-INDEX b/Documentation/isdn/00-INDEX
index f6010a536590..e87e336f590e 100644
--- a/Documentation/isdn/00-INDEX
+++ b/Documentation/isdn/00-INDEX
@@ -14,25 +14,14 @@ README
14 - general info on what you need and what to do for Linux ISDN. 14 - general info on what you need and what to do for Linux ISDN.
15README.FAQ 15README.FAQ
16 - general info for FAQ. 16 - general info for FAQ.
17README.audio
18 - info for running audio over ISDN.
19README.fax
20 - info for using Fax over ISDN.
21README.gigaset
22 - info on the drivers for Siemens Gigaset ISDN adapters.
23README.icn
24 - info on the ICN-ISDN-card and its driver.
25>>>>>>> 93af7aca44f0e82e67bda10a0fb73d383edcc8bd:Documentation/isdn/00-INDEX
26README.HiSax 17README.HiSax
27 - info on the HiSax driver which replaces the old teles. 18 - info on the HiSax driver which replaces the old teles.
19README.act2000
20 - info on driver for IBM ACT-2000 card.
28README.audio 21README.audio
29 - info for running audio over ISDN. 22 - info for running audio over ISDN.
30README.avmb1 23README.avmb1
31 - info on driver for AVM-B1 ISDN card. 24 - info on driver for AVM-B1 ISDN card.
32README.act2000
33 - info on driver for IBM ACT-2000 card.
34README.eicon
35 - info on driver for Eicon active cards.
36README.concap 25README.concap
37 - info on "CONCAP" encapsulation protocol interface used for X.25. 26 - info on "CONCAP" encapsulation protocol interface used for X.25.
38README.diversion 27README.diversion
@@ -59,7 +48,3 @@ README.x25
59 - info for running X.25 over ISDN. 48 - info for running X.25 over ISDN.
60syncPPP.FAQ 49syncPPP.FAQ
61 - frequently asked questions about running PPP over ISDN. 50 - frequently asked questions about running PPP over ISDN.
62README.hysdn
63 - info on driver for Hypercope active HYSDN cards
64README.mISDN
65 - info on the Modular ISDN subsystem (mISDN).
diff --git a/Documentation/ja_JP/SubmitChecklist b/Documentation/ja_JP/SubmitChecklist
index 6c42e071d723..2df4576f1173 100644
--- a/Documentation/ja_JP/SubmitChecklist
+++ b/Documentation/ja_JP/SubmitChecklist
@@ -75,7 +75,7 @@ Linux カーãƒãƒ«ãƒ‘ッãƒæŠ•ç¨¿è€…å‘ã‘ãƒã‚§ãƒƒã‚¯ãƒªã‚¹ãƒˆ
75 ビルドã—ãŸä¸Šã€å‹•ä½œç¢ºèªã‚’è¡Œã£ã¦ãã ã•ã„。 75 ビルドã—ãŸä¸Šã€å‹•ä½œç¢ºèªã‚’è¡Œã£ã¦ãã ã•ã„。
76 76
7714: ã‚‚ã—パッãƒãŒãƒ‡ã‚£ã‚¹ã‚¯ã®I/O性能ãªã©ã«å½±éŸ¿ã‚’与ãˆã‚‹ã‚ˆã†ã§ã‚ã‚Œã°ã€ 7714: ã‚‚ã—パッãƒãŒãƒ‡ã‚£ã‚¹ã‚¯ã®I/O性能ãªã©ã«å½±éŸ¿ã‚’与ãˆã‚‹ã‚ˆã†ã§ã‚ã‚Œã°ã€
78 'CONFIG_LBD'オプションを有効ã«ã—ãŸå ´åˆã¨ç„¡åŠ¹ã«ã—ãŸå ´åˆã®ä¸¡æ–¹ã§ 78 'CONFIG_LBDAF'オプションを有効ã«ã—ãŸå ´åˆã¨ç„¡åŠ¹ã«ã—ãŸå ´åˆã®ä¸¡æ–¹ã§
79 テストを実施ã—ã¦ã¿ã¦ãã ã•ã„。 79 テストを実施ã—ã¦ã¿ã¦ãã ã•ã„。
80 80
8115: lockdepã®æ©Ÿèƒ½ã‚’å…¨ã¦æœ‰åŠ¹ã«ã—ãŸä¸Šã§ã€å…¨ã¦ã®ã‚³ãƒ¼ãƒ‰ãƒ‘スを評価ã—ã¦ãã ã•ã„。 8115: lockdepã®æ©Ÿèƒ½ã‚’å…¨ã¦æœ‰åŠ¹ã«ã—ãŸä¸Šã§ã€å…¨ã¦ã®ã‚³ãƒ¼ãƒ‰ãƒ‘スを評価ã—ã¦ãã ã•ã„。
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index ad3800630772..f45d0d8e71d8 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -48,6 +48,7 @@ parameter is applicable:
48 EFI EFI Partitioning (GPT) is enabled 48 EFI EFI Partitioning (GPT) is enabled
49 EIDE EIDE/ATAPI support is enabled. 49 EIDE EIDE/ATAPI support is enabled.
50 FB The frame buffer device is enabled. 50 FB The frame buffer device is enabled.
51 GCOV GCOV profiling is enabled.
51 HW Appropriate hardware is enabled. 52 HW Appropriate hardware is enabled.
52 IA-64 IA-64 architecture is enabled. 53 IA-64 IA-64 architecture is enabled.
53 IMA Integrity measurement architecture is enabled. 54 IMA Integrity measurement architecture is enabled.
@@ -56,6 +57,7 @@ parameter is applicable:
56 ISAPNP ISA PnP code is enabled. 57 ISAPNP ISA PnP code is enabled.
57 ISDN Appropriate ISDN support is enabled. 58 ISDN Appropriate ISDN support is enabled.
58 JOY Appropriate joystick support is enabled. 59 JOY Appropriate joystick support is enabled.
60 KVM Kernel Virtual Machine support is enabled.
59 LIBATA Libata driver is enabled 61 LIBATA Libata driver is enabled
60 LP Printer support is enabled. 62 LP Printer support is enabled.
61 LOOP Loopback device support is enabled. 63 LOOP Loopback device support is enabled.
@@ -228,14 +230,6 @@ and is between 256 and 4096 characters. It is defined in the file
228 to assume that this machine's pmtimer latches its value 230 to assume that this machine's pmtimer latches its value
229 and always returns good values. 231 and always returns good values.
230 232
231 acpi.power_nocheck= [HW,ACPI]
232 Format: 1/0 enable/disable the check of power state.
233 On some bogus BIOS the _PSC object/_STA object of
234 power resource can't return the correct device power
235 state. In such case it is unneccessary to check its
236 power state again in power transition.
237 1 : disable the power state check
238
239 acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode 233 acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode
240 Format: { level | edge | high | low } 234 Format: { level | edge | high | low }
241 235
@@ -546,6 +540,10 @@ and is between 256 and 4096 characters. It is defined in the file
546 console=brl,ttyS0 540 console=brl,ttyS0
547 For now, only VisioBraille is supported. 541 For now, only VisioBraille is supported.
548 542
543 consoleblank= [KNL] The console blank (screen saver) timeout in
544 seconds. Defaults to 10*60 = 10mins. A value of 0
545 disables the blank timer.
546
549 coredump_filter= 547 coredump_filter=
550 [KNL] Change the default value for 548 [KNL] Change the default value for
551 /proc/<pid>/coredump_filter. 549 /proc/<pid>/coredump_filter.
@@ -792,6 +790,12 @@ and is between 256 and 4096 characters. It is defined in the file
792 Format: off | on 790 Format: off | on
793 default: on 791 default: on
794 792
793 gcov_persist= [GCOV] When non-zero (default), profiling data for
794 kernel modules is saved and remains accessible via
795 debugfs, even when the module is unloaded/reloaded.
796 When zero, profiling data is discarded and associated
797 debugfs files are removed at module unload time.
798
795 gdth= [HW,SCSI] 799 gdth= [HW,SCSI]
796 See header of drivers/scsi/gdth.c. 800 See header of drivers/scsi/gdth.c.
797 801
@@ -995,6 +999,7 @@ and is between 256 and 4096 characters. It is defined in the file
995 nomerge 999 nomerge
996 forcesac 1000 forcesac
997 soft 1001 soft
1002 pt [x86, IA64]
998 1003
999 io7= [HW] IO7 for Marvel based alpha systems 1004 io7= [HW] IO7 for Marvel based alpha systems
1000 See comment before marvel_specify_io7 in 1005 See comment before marvel_specify_io7 in
@@ -1094,6 +1099,44 @@ and is between 256 and 4096 characters. It is defined in the file
1094 kstack=N [X86] Print N words from the kernel stack 1099 kstack=N [X86] Print N words from the kernel stack
1095 in oops dumps. 1100 in oops dumps.
1096 1101
1102 kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
1103 Default is 0 (don't ignore, but inject #GP)
1104
1105 kvm.oos_shadow= [KVM] Disable out-of-sync shadow paging.
1106 Default is 1 (enabled)
1107
1108 kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
1109 Default is 0 (off)
1110
1111 kvm-amd.npt= [KVM,AMD] Disable nested paging (virtualized MMU)
1112 for all guests.
1113 Default is 1 (enabled) if in 64bit or 32bit-PAE mode
1114
1115 kvm-intel.bypass_guest_pf=
1116 [KVM,Intel] Disables bypassing of guest page faults
1117 on Intel chips. Default is 1 (enabled)
1118
1119 kvm-intel.ept= [KVM,Intel] Disable extended page tables
1120 (virtualized MMU) support on capable Intel chips.
1121 Default is 1 (enabled)
1122
1123 kvm-intel.emulate_invalid_guest_state=
1124 [KVM,Intel] Enable emulation of invalid guest states
1125 Default is 0 (disabled)
1126
1127 kvm-intel.flexpriority=
1128 [KVM,Intel] Disable FlexPriority feature (TPR shadow).
1129 Default is 1 (enabled)
1130
1131 kvm-intel.unrestricted_guest=
1132 [KVM,Intel] Disable unrestricted guest feature
1133 (virtualized real and unpaged mode) on capable
1134 Intel chips. Default is 1 (enabled)
1135
1136 kvm-intel.vpid= [KVM,Intel] Disable Virtual Processor Identification
1137 feature (tagged TLBs) on capable Intel chips.
1138 Default is 1 (enabled)
1139
1097 l2cr= [PPC] 1140 l2cr= [PPC]
1098 1141
1099 l3cr= [PPC] 1142 l3cr= [PPC]
@@ -1111,6 +1154,10 @@ and is between 256 and 4096 characters. It is defined in the file
1111 libata.dma=4 Compact Flash DMA only 1154 libata.dma=4 Compact Flash DMA only
1112 Combinations also work, so libata.dma=3 enables DMA 1155 Combinations also work, so libata.dma=3 enables DMA
1113 for disks and CDROMs, but not CFs. 1156 for disks and CDROMs, but not CFs.
1157
1158 libata.ignore_hpa= [LIBATA] Ignore HPA limit
1159 libata.ignore_hpa=0 keep BIOS limits (default)
1160 libata.ignore_hpa=1 ignore limits, using full disk
1114 1161
1115 libata.noacpi [LIBATA] Disables use of ACPI in libata suspend/resume 1162 libata.noacpi [LIBATA] Disables use of ACPI in libata suspend/resume
1116 when set. 1163 when set.
@@ -1239,6 +1286,10 @@ and is between 256 and 4096 characters. It is defined in the file
1239 (machvec) in a generic kernel. 1286 (machvec) in a generic kernel.
1240 Example: machvec=hpzx1_swiotlb 1287 Example: machvec=hpzx1_swiotlb
1241 1288
1289 machtype= [Loongson] Share the same kernel image file between different
1290 yeeloong laptop.
1291 Example: machtype=lemote-yeeloong-2f-7inch
1292
1242 max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater 1293 max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater
1243 than or equal to this physical address is ignored. 1294 than or equal to this physical address is ignored.
1244 1295
@@ -1358,6 +1409,27 @@ and is between 256 and 4096 characters. It is defined in the file
1358 min_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory below this 1409 min_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory below this
1359 physical address is ignored. 1410 physical address is ignored.
1360 1411
1412 mini2440= [ARM,HW,KNL]
1413 Format:[0..2][b][c][t]
1414 Default: "0tb"
1415 MINI2440 configuration specification:
1416 0 - The attached screen is the 3.5" TFT
1417 1 - The attached screen is the 7" TFT
1418 2 - The VGA Shield is attached (1024x768)
1419 Leaving out the screen size parameter will not load
1420 the TFT driver, and the framebuffer will be left
1421 unconfigured.
1422 b - Enable backlight. The TFT backlight pin will be
1423 linked to the kernel VESA blanking code and a GPIO
1424 LED. This parameter is not necessary when using the
1425 VGA shield.
1426 c - Enable the s3c camera interface.
1427 t - Reserved for enabling touchscreen support. The
1428 touchscreen support is not enabled in the mainstream
1429 kernel as of 2.6.30, a preliminary port can be found
1430 in the "bleeding edge" mini2440 support kernel at
1431 http://repo.or.cz/w/linux-2.6/mini2440.git
1432
1361 mminit_loglevel= 1433 mminit_loglevel=
1362 [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this 1434 [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
1363 parameter allows control of the logging verbosity for 1435 parameter allows control of the logging verbosity for
@@ -1399,6 +1471,16 @@ and is between 256 and 4096 characters. It is defined in the file
1399 mtdparts= [MTD] 1471 mtdparts= [MTD]
1400 See drivers/mtd/cmdlinepart.c. 1472 See drivers/mtd/cmdlinepart.c.
1401 1473
1474 onenand.bdry= [HW,MTD] Flex-OneNAND Boundary Configuration
1475
1476 Format: [die0_boundary][,die0_lock][,die1_boundary][,die1_lock]
1477
1478 boundary - index of last SLC block on Flex-OneNAND.
1479 The remaining blocks are configured as MLC blocks.
1480 lock - Configure if Flex-OneNAND boundary should be locked.
1481 Once locked, the boundary cannot be changed.
1482 1 indicates lock status, 0 indicates unlock status.
1483
1402 mtdset= [ARM] 1484 mtdset= [ARM]
1403 ARM/S3C2412 JIVE boot control 1485 ARM/S3C2412 JIVE boot control
1404 1486
@@ -1464,6 +1546,14 @@ and is between 256 and 4096 characters. It is defined in the file
1464 [NFS] set the TCP port on which the NFSv4 callback 1546 [NFS] set the TCP port on which the NFSv4 callback
1465 channel should listen. 1547 channel should listen.
1466 1548
1549 nfs.cache_getent=
1550 [NFS] sets the pathname to the program which is used
1551 to update the NFS client cache entries.
1552
1553 nfs.cache_getent_timeout=
1554 [NFS] sets the timeout after which an attempt to
1555 update a cache entry is deemed to have failed.
1556
1467 nfs.idmap_cache_timeout= 1557 nfs.idmap_cache_timeout=
1468 [NFS] set the maximum lifetime for idmapper cache 1558 [NFS] set the maximum lifetime for idmapper cache
1469 entries. 1559 entries.
@@ -1496,6 +1586,11 @@ and is between 256 and 4096 characters. It is defined in the file
1496 symbolic names: lapic and ioapic 1586 symbolic names: lapic and ioapic
1497 Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic 1587 Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic
1498 1588
1589 netpoll.carrier_timeout=
1590 [NET] Specifies amount of time (in seconds) that
1591 netpoll should wait for a carrier. By default netpoll
1592 waits 4 seconds.
1593
1499 no387 [BUGS=X86-32] Tells the kernel to use the 387 maths 1594 no387 [BUGS=X86-32] Tells the kernel to use the 387 maths
1500 emulation library even if a 387 maths coprocessor 1595 emulation library even if a 387 maths coprocessor
1501 is present. 1596 is present.
@@ -1685,8 +1780,8 @@ and is between 256 and 4096 characters. It is defined in the file
1685 oprofile.cpu_type= Force an oprofile cpu type 1780 oprofile.cpu_type= Force an oprofile cpu type
1686 This might be useful if you have an older oprofile 1781 This might be useful if you have an older oprofile
1687 userland or if you want common events. 1782 userland or if you want common events.
1688 Format: { archperfmon } 1783 Format: { arch_perfmon }
1689 archperfmon: [X86] Force use of architectural 1784 arch_perfmon: [X86] Force use of architectural
1690 perfmon on Intel CPUs instead of the 1785 perfmon on Intel CPUs instead of the
1691 CPU specific event set. 1786 CPU specific event set.
1692 1787
@@ -1765,6 +1860,9 @@ and is between 256 and 4096 characters. It is defined in the file
1765 root domains (aka PCI segments, in ACPI-speak). 1860 root domains (aka PCI segments, in ACPI-speak).
1766 nommconf [X86] Disable use of MMCONFIG for PCI 1861 nommconf [X86] Disable use of MMCONFIG for PCI
1767 Configuration 1862 Configuration
1863 check_enable_amd_mmconf [X86] check for and enable
1864 properly configured MMIO access to PCI
1865 config space on AMD family 10h CPU
1768 nomsi [MSI] If the PCI_MSI kernel config parameter is 1866 nomsi [MSI] If the PCI_MSI kernel config parameter is
1769 enabled, this kernel boot option can be used to 1867 enabled, this kernel boot option can be used to
1770 disable the use of MSI interrupts system-wide. 1868 disable the use of MSI interrupts system-wide.
@@ -1854,6 +1952,12 @@ and is between 256 and 4096 characters. It is defined in the file
1854 PAGE_SIZE is used as alignment. 1952 PAGE_SIZE is used as alignment.
1855 PCI-PCI bridge can be specified, if resource 1953 PCI-PCI bridge can be specified, if resource
1856 windows need to be expanded. 1954 windows need to be expanded.
1955 ecrc= Enable/disable PCIe ECRC (transaction layer
1956 end-to-end CRC checking).
1957 bios: Use BIOS/firmware settings. This is the
1958 the default.
1959 off: Turn ECRC off
1960 on: Turn ECRC on.
1857 1961
1858 pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power 1962 pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power
1859 Management. 1963 Management.
@@ -1871,6 +1975,13 @@ and is between 256 and 4096 characters. It is defined in the file
1871 Format: { 0 | 1 } 1975 Format: { 0 | 1 }
1872 See arch/parisc/kernel/pdc_chassis.c 1976 See arch/parisc/kernel/pdc_chassis.c
1873 1977
1978 percpu_alloc= Select which percpu first chunk allocator to use.
1979 Currently supported values are "embed" and "page".
1980 Archs may support subset or none of the selections.
1981 See comments in mm/percpu.c for details on each
1982 allocator. This parameter is primarily for debugging
1983 and performance comparison.
1984
1874 pf. [PARIDE] 1985 pf. [PARIDE]
1875 See Documentation/blockdev/paride.txt. 1986 See Documentation/blockdev/paride.txt.
1876 1987
@@ -2341,6 +2452,18 @@ and is between 256 and 4096 characters. It is defined in the file
2341 stifb= [HW] 2452 stifb= [HW]
2342 Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]] 2453 Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]]
2343 2454
2455 sunrpc.min_resvport=
2456 sunrpc.max_resvport=
2457 [NFS,SUNRPC]
2458 SunRPC servers often require that client requests
2459 originate from a privileged port (i.e. a port in the
2460 range 0 < portnr < 1024).
2461 An administrator who wishes to reserve some of these
2462 ports for other uses may adjust the range that the
2463 kernel's sunrpc client considers to be privileged
2464 using these two parameters to set the minimum and
2465 maximum port values.
2466
2344 sunrpc.pool_mode= 2467 sunrpc.pool_mode=
2345 [NFS] 2468 [NFS]
2346 Control how the NFS server code allocates CPUs to 2469 Control how the NFS server code allocates CPUs to
@@ -2357,6 +2480,15 @@ and is between 256 and 4096 characters. It is defined in the file
2357 pernode one pool for each NUMA node (equivalent 2480 pernode one pool for each NUMA node (equivalent
2358 to global on non-NUMA machines) 2481 to global on non-NUMA machines)
2359 2482
2483 sunrpc.tcp_slot_table_entries=
2484 sunrpc.udp_slot_table_entries=
2485 [NFS,SUNRPC]
2486 Sets the upper limit on the number of simultaneous
2487 RPC calls that can be sent from the client to a
2488 server. Increasing these values may allow you to
2489 improve throughput, but will also increase the
2490 amount of memory reserved for use by the client.
2491
2360 swiotlb= [IA-64] Number of I/O TLB slabs 2492 swiotlb= [IA-64] Number of I/O TLB slabs
2361 2493
2362 switches= [HW,M68k] 2494 switches= [HW,M68k]
@@ -2423,7 +2555,13 @@ and is between 256 and 4096 characters. It is defined in the file
2423 2555
2424 tp720= [HW,PS2] 2556 tp720= [HW,PS2]
2425 2557
2426 trace_buf_size=nn[KMG] [ftrace] will set tracing buffer size. 2558 trace_buf_size=nn[KMG]
2559 [FTRACE] will set tracing buffer size.
2560
2561 trace_event=[event-list]
2562 [FTRACE] Set and start specified trace events in order
2563 to facilitate early boot debugging.
2564 See also Documentation/trace/events.txt
2427 2565
2428 trix= [HW,OSS] MediaTrix AudioTrix Pro 2566 trix= [HW,OSS] MediaTrix AudioTrix Pro
2429 Format: 2567 Format:
diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index b56aacc1fff8..e4dbbdb1bd96 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -26,7 +26,7 @@ This document has the following sections:
26 - Notes on accessing payload contents 26 - Notes on accessing payload contents
27 - Defining a key type 27 - Defining a key type
28 - Request-key callback service 28 - Request-key callback service
29 - Key access filesystem 29 - Garbage collection
30 30
31 31
32============ 32============
@@ -113,6 +113,9 @@ Each key has a number of attributes:
113 113
114 (*) Dead. The key's type was unregistered, and so the key is now useless. 114 (*) Dead. The key's type was unregistered, and so the key is now useless.
115 115
116Keys in the last three states are subject to garbage collection. See the
117section on "Garbage collection".
118
116 119
117==================== 120====================
118KEY SERVICE OVERVIEW 121KEY SERVICE OVERVIEW
@@ -754,6 +757,26 @@ The keyctl syscall functions are:
754 successful. 757 successful.
755 758
756 759
760 (*) Install the calling process's session keyring on its parent.
761
762 long keyctl(KEYCTL_SESSION_TO_PARENT);
763
764 This functions attempts to install the calling process's session keyring
765 on to the calling process's parent, replacing the parent's current session
766 keyring.
767
768 The calling process must have the same ownership as its parent, the
769 keyring must have the same ownership as the calling process, the calling
770 process must have LINK permission on the keyring and the active LSM module
771 mustn't deny permission, otherwise error EPERM will be returned.
772
773 Error ENOMEM will be returned if there was insufficient memory to complete
774 the operation, otherwise 0 will be returned to indicate success.
775
776 The keyring will be replaced next time the parent process leaves the
777 kernel and resumes executing userspace.
778
779
757=============== 780===============
758KERNEL SERVICES 781KERNEL SERVICES
759=============== 782===============
@@ -1231,3 +1254,17 @@ by executing:
1231 1254
1232In this case, the program isn't required to actually attach the key to a ring; 1255In this case, the program isn't required to actually attach the key to a ring;
1233the rings are provided for reference. 1256the rings are provided for reference.
1257
1258
1259==================
1260GARBAGE COLLECTION
1261==================
1262
1263Dead keys (for which the type has been removed) will be automatically unlinked
1264from those keyrings that point to them and deleted as soon as possible by a
1265background garbage collector.
1266
1267Similarly, revoked and expired keys will be garbage collected, but only after a
1268certain amount of time has passed. This time is set as a number of seconds in:
1269
1270 /proc/sys/kernel/keys/gc_delay
diff --git a/Documentation/kmemcheck.txt b/Documentation/kmemcheck.txt
new file mode 100644
index 000000000000..363044609dad
--- /dev/null
+++ b/Documentation/kmemcheck.txt
@@ -0,0 +1,773 @@
1GETTING STARTED WITH KMEMCHECK
2==============================
3
4Vegard Nossum <vegardno@ifi.uio.no>
5
6
7Contents
8========
90. Introduction
101. Downloading
112. Configuring and compiling
123. How to use
133.1. Booting
143.2. Run-time enable/disable
153.3. Debugging
163.4. Annotating false positives
174. Reporting errors
185. Technical description
19
20
210. Introduction
22===============
23
24kmemcheck is a debugging feature for the Linux Kernel. More specifically, it
25is a dynamic checker that detects and warns about some uses of uninitialized
26memory.
27
28Userspace programmers might be familiar with Valgrind's memcheck. The main
29difference between memcheck and kmemcheck is that memcheck works for userspace
30programs only, and kmemcheck works for the kernel only. The implementations
31are of course vastly different. Because of this, kmemcheck is not as accurate
32as memcheck, but it turns out to be good enough in practice to discover real
33programmer errors that the compiler is not able to find through static
34analysis.
35
36Enabling kmemcheck on a kernel will probably slow it down to the extent that
37the machine will not be usable for normal workloads such as e.g. an
38interactive desktop. kmemcheck will also cause the kernel to use about twice
39as much memory as normal. For this reason, kmemcheck is strictly a debugging
40feature.
41
42
431. Downloading
44==============
45
46kmemcheck can only be downloaded using git. If you want to write patches
47against the current code, you should use the kmemcheck development branch of
48the tip tree. It is also possible to use the linux-next tree, which also
49includes the latest version of kmemcheck.
50
51Assuming that you've already cloned the linux-2.6.git repository, all you
52have to do is add the -tip tree as a remote, like this:
53
54 $ git remote add tip git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git
55
56To actually download the tree, fetch the remote:
57
58 $ git fetch tip
59
60And to check out a new local branch with the kmemcheck code:
61
62 $ git checkout -b kmemcheck tip/kmemcheck
63
64General instructions for the -tip tree can be found here:
65http://people.redhat.com/mingo/tip.git/readme.txt
66
67
682. Configuring and compiling
69============================
70
71kmemcheck only works for the x86 (both 32- and 64-bit) platform. A number of
72configuration variables must have specific settings in order for the kmemcheck
73menu to even appear in "menuconfig". These are:
74
75 o CONFIG_CC_OPTIMIZE_FOR_SIZE=n
76
77 This option is located under "General setup" / "Optimize for size".
78
79 Without this, gcc will use certain optimizations that usually lead to
80 false positive warnings from kmemcheck. An example of this is a 16-bit
81 field in a struct, where gcc may load 32 bits, then discard the upper
82 16 bits. kmemcheck sees only the 32-bit load, and may trigger a
83 warning for the upper 16 bits (if they're uninitialized).
84
85 o CONFIG_SLAB=y or CONFIG_SLUB=y
86
87 This option is located under "General setup" / "Choose SLAB
88 allocator".
89
90 o CONFIG_FUNCTION_TRACER=n
91
92 This option is located under "Kernel hacking" / "Tracers" / "Kernel
93 Function Tracer"
94
95 When function tracing is compiled in, gcc emits a call to another
96 function at the beginning of every function. This means that when the
97 page fault handler is called, the ftrace framework will be called
98 before kmemcheck has had a chance to handle the fault. If ftrace then
99 modifies memory that was tracked by kmemcheck, the result is an
100 endless recursive page fault.
101
102 o CONFIG_DEBUG_PAGEALLOC=n
103
104 This option is located under "Kernel hacking" / "Debug page memory
105 allocations".
106
107In addition, I highly recommend turning on CONFIG_DEBUG_INFO=y. This is also
108located under "Kernel hacking". With this, you will be able to get line number
109information from the kmemcheck warnings, which is extremely valuable in
110debugging a problem. This option is not mandatory, however, because it slows
111down the compilation process and produces a much bigger kernel image.
112
113Now the kmemcheck menu should be visible (under "Kernel hacking" / "kmemcheck:
114trap use of uninitialized memory"). Here follows a description of the
115kmemcheck configuration variables:
116
117 o CONFIG_KMEMCHECK
118
119 This must be enabled in order to use kmemcheck at all...
120
121 o CONFIG_KMEMCHECK_[DISABLED | ENABLED | ONESHOT]_BY_DEFAULT
122
123 This option controls the status of kmemcheck at boot-time. "Enabled"
124 will enable kmemcheck right from the start, "disabled" will boot the
125 kernel as normal (but with the kmemcheck code compiled in, so it can
126 be enabled at run-time after the kernel has booted), and "one-shot" is
127 a special mode which will turn kmemcheck off automatically after
128 detecting the first use of uninitialized memory.
129
130 If you are using kmemcheck to actively debug a problem, then you
131 probably want to choose "enabled" here.
132
133 The one-shot mode is mostly useful in automated test setups because it
134 can prevent floods of warnings and increase the chances of the machine
135 surviving in case something is really wrong. In other cases, the one-
136 shot mode could actually be counter-productive because it would turn
137 itself off at the very first error -- in the case of a false positive
138 too -- and this would come in the way of debugging the specific
139 problem you were interested in.
140
141 If you would like to use your kernel as normal, but with a chance to
142 enable kmemcheck in case of some problem, it might be a good idea to
143 choose "disabled" here. When kmemcheck is disabled, most of the run-
144 time overhead is not incurred, and the kernel will be almost as fast
145 as normal.
146
147 o CONFIG_KMEMCHECK_QUEUE_SIZE
148
149 Select the maximum number of error reports to store in an internal
150 (fixed-size) buffer. Since errors can occur virtually anywhere and in
151 any context, we need a temporary storage area which is guaranteed not
152 to generate any other page faults when accessed. The queue will be
153 emptied as soon as a tasklet may be scheduled. If the queue is full,
154 new error reports will be lost.
155
156 The default value of 64 is probably fine. If some code produces more
157 than 64 errors within an irqs-off section, then the code is likely to
158 produce many, many more, too, and these additional reports seldom give
159 any more information (the first report is usually the most valuable
160 anyway).
161
162 This number might have to be adjusted if you are not using serial
163 console or similar to capture the kernel log. If you are using the
164 "dmesg" command to save the log, then getting a lot of kmemcheck
165 warnings might overflow the kernel log itself, and the earlier reports
166 will get lost in that way instead. Try setting this to 10 or so on
167 such a setup.
168
169 o CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT
170
171 Select the number of shadow bytes to save along with each entry of the
172 error-report queue. These bytes indicate what parts of an allocation
173 are initialized, uninitialized, etc. and will be displayed when an
174 error is detected to help the debugging of a particular problem.
175
176 The number entered here is actually the logarithm of the number of
177 bytes that will be saved. So if you pick for example 5 here, kmemcheck
178 will save 2^5 = 32 bytes.
179
180 The default value should be fine for debugging most problems. It also
181 fits nicely within 80 columns.
182
183 o CONFIG_KMEMCHECK_PARTIAL_OK
184
185 This option (when enabled) works around certain GCC optimizations that
186 produce 32-bit reads from 16-bit variables where the upper 16 bits are
187 thrown away afterwards.
188
189 The default value (enabled) is recommended. This may of course hide
190 some real errors, but disabling it would probably produce a lot of
191 false positives.
192
193 o CONFIG_KMEMCHECK_BITOPS_OK
194
195 This option silences warnings that would be generated for bit-field
196 accesses where not all the bits are initialized at the same time. This
197 may also hide some real bugs.
198
199 This option is probably obsolete, or it should be replaced with
200 the kmemcheck-/bitfield-annotations for the code in question. The
201 default value is therefore fine.
202
203Now compile the kernel as usual.
204
205
2063. How to use
207=============
208
2093.1. Booting
210============
211
212First some information about the command-line options. There is only one
213option specific to kmemcheck, and this is called "kmemcheck". It can be used
214to override the default mode as chosen by the CONFIG_KMEMCHECK_*_BY_DEFAULT
215option. Its possible settings are:
216
217 o kmemcheck=0 (disabled)
218 o kmemcheck=1 (enabled)
219 o kmemcheck=2 (one-shot mode)
220
221If SLUB debugging has been enabled in the kernel, it may take precedence over
222kmemcheck in such a way that the slab caches which are under SLUB debugging
223will not be tracked by kmemcheck. In order to ensure that this doesn't happen
224(even though it shouldn't by default), use SLUB's boot option "slub_debug",
225like this: slub_debug=-
226
227In fact, this option may also be used for fine-grained control over SLUB vs.
228kmemcheck. For example, if the command line includes "kmemcheck=1
229slub_debug=,dentry", then SLUB debugging will be used only for the "dentry"
230slab cache, and with kmemcheck tracking all the other caches. This is advanced
231usage, however, and is not generally recommended.
232
233
2343.2. Run-time enable/disable
235============================
236
237When the kernel has booted, it is possible to enable or disable kmemcheck at
238run-time. WARNING: This feature is still experimental and may cause false
239positive warnings to appear. Therefore, try not to use this. If you find that
240it doesn't work properly (e.g. you see an unreasonable amount of warnings), I
241will be happy to take bug reports.
242
243Use the file /proc/sys/kernel/kmemcheck for this purpose, e.g.:
244
245 $ echo 0 > /proc/sys/kernel/kmemcheck # disables kmemcheck
246
247The numbers are the same as for the kmemcheck= command-line option.
248
249
2503.3. Debugging
251==============
252
253A typical report will look something like this:
254
255WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
25680000000000000000000000000000000000000000088ffff0000000000000000
257 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
258 ^
259
260Pid: 1856, comm: ntpdate Not tainted 2.6.29-rc5 #264 945P-A
261RIP: 0010:[<ffffffff8104ede8>] [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190
262RSP: 0018:ffff88003cdf7d98 EFLAGS: 00210002
263RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009
264RDX: ffff88003e5d6018 RSI: ffff88003e5d6024 RDI: ffff88003cdf7e84
265RBP: ffff88003cdf7db8 R08: ffff88003e5d6000 R09: 0000000000000000
266R10: 0000000000000080 R11: 0000000000000000 R12: 000000000000000e
267R13: ffff88003cdf7e78 R14: ffff88003d530710 R15: ffff88003d5a98c8
268FS: 0000000000000000(0000) GS:ffff880001982000(0063) knlGS:00000
269CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033
270CR2: ffff88003f806ea0 CR3: 000000003c036000 CR4: 00000000000006a0
271DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
272DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400
273 [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
274 [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
275 [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
276 [<ffffffff8100c7b5>] int_signal+0x12/0x17
277 [<ffffffffffffffff>] 0xffffffffffffffff
278
279The single most valuable information in this report is the RIP (or EIP on 32-
280bit) value. This will help us pinpoint exactly which instruction that caused
281the warning.
282
283If your kernel was compiled with CONFIG_DEBUG_INFO=y, then all we have to do
284is give this address to the addr2line program, like this:
285
286 $ addr2line -e vmlinux -i ffffffff8104ede8
287 arch/x86/include/asm/string_64.h:12
288 include/asm-generic/siginfo.h:287
289 kernel/signal.c:380
290 kernel/signal.c:410
291
292The "-e vmlinux" tells addr2line which file to look in. IMPORTANT: This must
293be the vmlinux of the kernel that produced the warning in the first place! If
294not, the line number information will almost certainly be wrong.
295
296The "-i" tells addr2line to also print the line numbers of inlined functions.
297In this case, the flag was very important, because otherwise, it would only
298have printed the first line, which is just a call to memcpy(), which could be
299called from a thousand places in the kernel, and is therefore not very useful.
300These inlined functions would not show up in the stack trace above, simply
301because the kernel doesn't load the extra debugging information. This
302technique can of course be used with ordinary kernel oopses as well.
303
304In this case, it's the caller of memcpy() that is interesting, and it can be
305found in include/asm-generic/siginfo.h, line 287:
306
307281 static inline void copy_siginfo(struct siginfo *to, struct siginfo *from)
308282 {
309283 if (from->si_code < 0)
310284 memcpy(to, from, sizeof(*to));
311285 else
312286 /* _sigchld is currently the largest know union member */
313287 memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld));
314288 }
315
316Since this was a read (kmemcheck usually warns about reads only, though it can
317warn about writes to unallocated or freed memory as well), it was probably the
318"from" argument which contained some uninitialized bytes. Following the chain
319of calls, we move upwards to see where "from" was allocated or initialized,
320kernel/signal.c, line 380:
321
322359 static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
323360 {
324...
325367 list_for_each_entry(q, &list->list, list) {
326368 if (q->info.si_signo == sig) {
327369 if (first)
328370 goto still_pending;
329371 first = q;
330...
331377 if (first) {
332378 still_pending:
333379 list_del_init(&first->list);
334380 copy_siginfo(info, &first->info);
335381 __sigqueue_free(first);
336...
337392 }
338393 }
339
340Here, it is &first->info that is being passed on to copy_siginfo(). The
341variable "first" was found on a list -- passed in as the second argument to
342collect_signal(). We continue our journey through the stack, to figure out
343where the item on "list" was allocated or initialized. We move to line 410:
344
345395 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
346396 siginfo_t *info)
347397 {
348...
349410 collect_signal(sig, pending, info);
350...
351414 }
352
353Now we need to follow the "pending" pointer, since that is being passed on to
354collect_signal() as "list". At this point, we've run out of lines from the
355"addr2line" output. Not to worry, we just paste the next addresses from the
356kmemcheck stack dump, i.e.:
357
358 [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
359 [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
360 [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
361 [<ffffffff8100c7b5>] int_signal+0x12/0x17
362
363 $ addr2line -e vmlinux -i ffffffff8104f04e ffffffff81050bd8 \
364 ffffffff8100b87d ffffffff8100c7b5
365 kernel/signal.c:446
366 kernel/signal.c:1806
367 arch/x86/kernel/signal.c:805
368 arch/x86/kernel/signal.c:871
369 arch/x86/kernel/entry_64.S:694
370
371Remember that since these addresses were found on the stack and not as the
372RIP value, they actually point to the _next_ instruction (they are return
373addresses). This becomes obvious when we look at the code for line 446:
374
375422 int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
376423 {
377...
378431 signr = __dequeue_signal(&tsk->signal->shared_pending,
379432 mask, info);
380433 /*
381434 * itimer signal ?
382435 *
383436 * itimers are process shared and we restart periodic
384437 * itimers in the signal delivery path to prevent DoS
385438 * attacks in the high resolution timer case. This is
386439 * compliant with the old way of self restarting
387440 * itimers, as the SIGALRM is a legacy signal and only
388441 * queued once. Changing the restart behaviour to
389442 * restart the timer in the signal dequeue path is
390443 * reducing the timer noise on heavy loaded !highres
391444 * systems too.
392445 */
393446 if (unlikely(signr == SIGALRM)) {
394...
395489 }
396
397So instead of looking at 446, we should be looking at 431, which is the line
398that executes just before 446. Here we see that what we are looking for is
399&tsk->signal->shared_pending.
400
401Our next task is now to figure out which function that puts items on this
402"shared_pending" list. A crude, but efficient tool, is git grep:
403
404 $ git grep -n 'shared_pending' kernel/
405 ...
406 kernel/signal.c:828: pending = group ? &t->signal->shared_pending : &t->pending;
407 kernel/signal.c:1339: pending = group ? &t->signal->shared_pending : &t->pending;
408 ...
409
410There were more results, but none of them were related to list operations,
411and these were the only assignments. We inspect the line numbers more closely
412and find that this is indeed where items are being added to the list:
413
414816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
415817 int group)
416818 {
417...
418828 pending = group ? &t->signal->shared_pending : &t->pending;
419...
420851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
421852 (is_si_special(info) ||
422853 info->si_code >= 0)));
423854 if (q) {
424855 list_add_tail(&q->list, &pending->list);
425...
426890 }
427
428and:
429
4301309 int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
4311310 {
432....
4331339 pending = group ? &t->signal->shared_pending : &t->pending;
4341340 list_add_tail(&q->list, &pending->list);
435....
4361347 }
437
438In the first case, the list element we are looking for, "q", is being returned
439from the function __sigqueue_alloc(), which looks like an allocation function.
440Let's take a look at it:
441
442187 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
443188 int override_rlimit)
444189 {
445190 struct sigqueue *q = NULL;
446191 struct user_struct *user;
447192
448193 /*
449194 * We won't get problems with the target's UID changing under us
450195 * because changing it requires RCU be used, and if t != current, the
451196 * caller must be holding the RCU readlock (by way of a spinlock) and
452197 * we use RCU protection here
453198 */
454199 user = get_uid(__task_cred(t)->user);
455200 atomic_inc(&user->sigpending);
456201 if (override_rlimit ||
457202 atomic_read(&user->sigpending) <=
458203 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
459204 q = kmem_cache_alloc(sigqueue_cachep, flags);
460205 if (unlikely(q == NULL)) {
461206 atomic_dec(&user->sigpending);
462207 free_uid(user);
463208 } else {
464209 INIT_LIST_HEAD(&q->list);
465210 q->flags = 0;
466211 q->user = user;
467212 }
468213
469214 return q;
470215 }
471
472We see that this function initializes q->list, q->flags, and q->user. It seems
473that now is the time to look at the definition of "struct sigqueue", e.g.:
474
47514 struct sigqueue {
47615 struct list_head list;
47716 int flags;
47817 siginfo_t info;
47918 struct user_struct *user;
48019 };
481
482And, you might remember, it was a memcpy() on &first->info that caused the
483warning, so this makes perfect sense. It also seems reasonable to assume that
484it is the caller of __sigqueue_alloc() that has the responsibility of filling
485out (initializing) this member.
486
487But just which fields of the struct were uninitialized? Let's look at
488kmemcheck's report again:
489
490WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
49180000000000000000000000000000000000000000088ffff0000000000000000
492 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
493 ^
494
495These first two lines are the memory dump of the memory object itself, and the
496shadow bytemap, respectively. The memory object itself is in this case
497&first->info. Just beware that the start of this dump is NOT the start of the
498object itself! The position of the caret (^) corresponds with the address of
499the read (ffff88003e4a2024).
500
501The shadow bytemap dump legend is as follows:
502
503 i - initialized
504 u - uninitialized
505 a - unallocated (memory has been allocated by the slab layer, but has not
506 yet been handed off to anybody)
507 f - freed (memory has been allocated by the slab layer, but has been freed
508 by the previous owner)
509
510In order to figure out where (relative to the start of the object) the
511uninitialized memory was located, we have to look at the disassembly. For
512that, we'll need the RIP address again:
513
514RIP: 0010:[<ffffffff8104ede8>] [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190
515
516 $ objdump -d --no-show-raw-insn vmlinux | grep -C 8 ffffffff8104ede8:
517 ffffffff8104edc8: mov %r8,0x8(%r8)
518 ffffffff8104edcc: test %r10d,%r10d
519 ffffffff8104edcf: js ffffffff8104ee88 <__dequeue_signal+0x168>
520 ffffffff8104edd5: mov %rax,%rdx
521 ffffffff8104edd8: mov $0xc,%ecx
522 ffffffff8104eddd: mov %r13,%rdi
523 ffffffff8104ede0: mov $0x30,%eax
524 ffffffff8104ede5: mov %rdx,%rsi
525 ffffffff8104ede8: rep movsl %ds:(%rsi),%es:(%rdi)
526 ffffffff8104edea: test $0x2,%al
527 ffffffff8104edec: je ffffffff8104edf0 <__dequeue_signal+0xd0>
528 ffffffff8104edee: movsw %ds:(%rsi),%es:(%rdi)
529 ffffffff8104edf0: test $0x1,%al
530 ffffffff8104edf2: je ffffffff8104edf5 <__dequeue_signal+0xd5>
531 ffffffff8104edf4: movsb %ds:(%rsi),%es:(%rdi)
532 ffffffff8104edf5: mov %r8,%rdi
533 ffffffff8104edf8: callq ffffffff8104de60 <__sigqueue_free>
534
535As expected, it's the "rep movsl" instruction from the memcpy() that causes
536the warning. We know about REP MOVSL that it uses the register RCX to count
537the number of remaining iterations. By taking a look at the register dump
538again (from the kmemcheck report), we can figure out how many bytes were left
539to copy:
540
541RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009
542
543By looking at the disassembly, we also see that %ecx is being loaded with the
544value $0xc just before (ffffffff8104edd8), so we are very lucky. Keep in mind
545that this is the number of iterations, not bytes. And since this is a "long"
546operation, we need to multiply by 4 to get the number of bytes. So this means
547that the uninitialized value was encountered at 4 * (0xc - 0x9) = 12 bytes
548from the start of the object.
549
550We can now try to figure out which field of the "struct siginfo" that was not
551initialized. This is the beginning of the struct:
552
55340 typedef struct siginfo {
55441 int si_signo;
55542 int si_errno;
55643 int si_code;
55744
55845 union {
559..
56092 } _sifields;
56193 } siginfo_t;
562
563On 64-bit, the int is 4 bytes long, so it must the the union member that has
564not been initialized. We can verify this using gdb:
565
566 $ gdb vmlinux
567 ...
568 (gdb) p &((struct siginfo *) 0)->_sifields
569 $1 = (union {...} *) 0x10
570
571Actually, it seems that the union member is located at offset 0x10 -- which
572means that gcc has inserted 4 bytes of padding between the members si_code
573and _sifields. We can now get a fuller picture of the memory dump:
574
575 _----------------------------=> si_code
576 / _--------------------=> (padding)
577 | / _------------=> _sifields(._kill._pid)
578 | | / _----=> _sifields(._kill._uid)
579 | | | /
580-------|-------|-------|-------|
58180000000000000000000000000000000000000000088ffff0000000000000000
582 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
583
584This allows us to realize another important fact: si_code contains the value
5850x80. Remember that x86 is little endian, so the first 4 bytes "80000000" are
586really the number 0x00000080. With a bit of research, we find that this is
587actually the constant SI_KERNEL defined in include/asm-generic/siginfo.h:
588
589144 #define SI_KERNEL 0x80 /* sent by the kernel from somewhere */
590
591This macro is used in exactly one place in the x86 kernel: In send_signal()
592in kernel/signal.c:
593
594816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
595817 int group)
596818 {
597...
598828 pending = group ? &t->signal->shared_pending : &t->pending;
599...
600851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
601852 (is_si_special(info) ||
602853 info->si_code >= 0)));
603854 if (q) {
604855 list_add_tail(&q->list, &pending->list);
605856 switch ((unsigned long) info) {
606...
607865 case (unsigned long) SEND_SIG_PRIV:
608866 q->info.si_signo = sig;
609867 q->info.si_errno = 0;
610868 q->info.si_code = SI_KERNEL;
611869 q->info.si_pid = 0;
612870 q->info.si_uid = 0;
613871 break;
614...
615890 }
616
617Not only does this match with the .si_code member, it also matches the place
618we found earlier when looking for where siginfo_t objects are enqueued on the
619"shared_pending" list.
620
621So to sum up: It seems that it is the padding introduced by the compiler
622between two struct fields that is uninitialized, and this gets reported when
623we do a memcpy() on the struct. This means that we have identified a false
624positive warning.
625
626Normally, kmemcheck will not report uninitialized accesses in memcpy() calls
627when both the source and destination addresses are tracked. (Instead, we copy
628the shadow bytemap as well). In this case, the destination address clearly
629was not tracked. We can dig a little deeper into the stack trace from above:
630
631 arch/x86/kernel/signal.c:805
632 arch/x86/kernel/signal.c:871
633 arch/x86/kernel/entry_64.S:694
634
635And we clearly see that the destination siginfo object is located on the
636stack:
637
638782 static void do_signal(struct pt_regs *regs)
639783 {
640784 struct k_sigaction ka;
641785 siginfo_t info;
642...
643804 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
644...
645854 }
646
647And this &info is what eventually gets passed to copy_siginfo() as the
648destination argument.
649
650Now, even though we didn't find an actual error here, the example is still a
651good one, because it shows how one would go about to find out what the report
652was all about.
653
654
6553.4. Annotating false positives
656===============================
657
658There are a few different ways to make annotations in the source code that
659will keep kmemcheck from checking and reporting certain allocations. Here
660they are:
661
662 o __GFP_NOTRACK_FALSE_POSITIVE
663
664 This flag can be passed to kmalloc() or kmem_cache_alloc() (therefore
665 also to other functions that end up calling one of these) to indicate
666 that the allocation should not be tracked because it would lead to
667 a false positive report. This is a "big hammer" way of silencing
668 kmemcheck; after all, even if the false positive pertains to
669 particular field in a struct, for example, we will now lose the
670 ability to find (real) errors in other parts of the same struct.
671
672 Example:
673
674 /* No warnings will ever trigger on accessing any part of x */
675 x = kmalloc(sizeof *x, GFP_KERNEL | __GFP_NOTRACK_FALSE_POSITIVE);
676
677 o kmemcheck_bitfield_begin(name)/kmemcheck_bitfield_end(name) and
678 kmemcheck_annotate_bitfield(ptr, name)
679
680 The first two of these three macros can be used inside struct
681 definitions to signal, respectively, the beginning and end of a
682 bitfield. Additionally, this will assign the bitfield a name, which
683 is given as an argument to the macros.
684
685 Having used these markers, one can later use
686 kmemcheck_annotate_bitfield() at the point of allocation, to indicate
687 which parts of the allocation is part of a bitfield.
688
689 Example:
690
691 struct foo {
692 int x;
693
694 kmemcheck_bitfield_begin(flags);
695 int flag_a:1;
696 int flag_b:1;
697 kmemcheck_bitfield_end(flags);
698
699 int y;
700 };
701
702 struct foo *x = kmalloc(sizeof *x);
703
704 /* No warnings will trigger on accessing the bitfield of x */
705 kmemcheck_annotate_bitfield(x, flags);
706
707 Note that kmemcheck_annotate_bitfield() can be used even before the
708 return value of kmalloc() is checked -- in other words, passing NULL
709 as the first argument is legal (and will do nothing).
710
711
7124. Reporting errors
713===================
714
715As we have seen, kmemcheck will produce false positive reports. Therefore, it
716is not very wise to blindly post kmemcheck warnings to mailing lists and
717maintainers. Instead, I encourage maintainers and developers to find errors
718in their own code. If you get a warning, you can try to work around it, try
719to figure out if it's a real error or not, or simply ignore it. Most
720developers know their own code and will quickly and efficiently determine the
721root cause of a kmemcheck report. This is therefore also the most efficient
722way to work with kmemcheck.
723
724That said, we (the kmemcheck maintainers) will always be on the lookout for
725false positives that we can annotate and silence. So whatever you find,
726please drop us a note privately! Kernel configs and steps to reproduce (if
727available) are of course a great help too.
728
729Happy hacking!
730
731
7325. Technical description
733========================
734
735kmemcheck works by marking memory pages non-present. This means that whenever
736somebody attempts to access the page, a page fault is generated. The page
737fault handler notices that the page was in fact only hidden, and so it calls
738on the kmemcheck code to make further investigations.
739
740When the investigations are completed, kmemcheck "shows" the page by marking
741it present (as it would be under normal circumstances). This way, the
742interrupted code can continue as usual.
743
744But after the instruction has been executed, we should hide the page again, so
745that we can catch the next access too! Now kmemcheck makes use of a debugging
746feature of the processor, namely single-stepping. When the processor has
747finished the one instruction that generated the memory access, a debug
748exception is raised. From here, we simply hide the page again and continue
749execution, this time with the single-stepping feature turned off.
750
751kmemcheck requires some assistance from the memory allocator in order to work.
752The memory allocator needs to
753
754 1. Tell kmemcheck about newly allocated pages and pages that are about to
755 be freed. This allows kmemcheck to set up and tear down the shadow memory
756 for the pages in question. The shadow memory stores the status of each
757 byte in the allocation proper, e.g. whether it is initialized or
758 uninitialized.
759
760 2. Tell kmemcheck which parts of memory should be marked uninitialized.
761 There are actually a few more states, such as "not yet allocated" and
762 "recently freed".
763
764If a slab cache is set up using the SLAB_NOTRACK flag, it will never return
765memory that can take page faults because of kmemcheck.
766
767If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still
768request memory with the __GFP_NOTRACK or __GFP_NOTRACK_FALSE_POSITIVE flags.
769This does not prevent the page faults from occurring, however, but marks the
770object in question as being initialized so that no warnings will ever be
771produced for this object.
772
773Currently, the SLAB and SLUB allocators are supported by kmemcheck.
diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt
index 0112da3b9ab8..34f6638aa5ac 100644
--- a/Documentation/kmemleak.txt
+++ b/Documentation/kmemleak.txt
@@ -16,13 +16,24 @@ Usage
16----- 16-----
17 17
18CONFIG_DEBUG_KMEMLEAK in "Kernel hacking" has to be enabled. A kernel 18CONFIG_DEBUG_KMEMLEAK in "Kernel hacking" has to be enabled. A kernel
19thread scans the memory every 10 minutes (by default) and prints any new 19thread scans the memory every 10 minutes (by default) and prints the
20unreferenced objects found. To trigger an intermediate scan and display 20number of new unreferenced objects found. To display the details of all
21all the possible memory leaks: 21the possible memory leaks:
22 22
23 # mount -t debugfs nodev /sys/kernel/debug/ 23 # mount -t debugfs nodev /sys/kernel/debug/
24 # cat /sys/kernel/debug/kmemleak 24 # cat /sys/kernel/debug/kmemleak
25 25
26To trigger an intermediate memory scan:
27
28 # echo scan > /sys/kernel/debug/kmemleak
29
30To clear the list of all current possible memory leaks:
31
32 # echo clear > /sys/kernel/debug/kmemleak
33
34New leaks will then come up upon reading /sys/kernel/debug/kmemleak
35again.
36
26Note that the orphan objects are listed in the order they were allocated 37Note that the orphan objects are listed in the order they were allocated
27and one object at the beginning of the list may cause other subsequent 38and one object at the beginning of the list may cause other subsequent
28objects to be reported as orphan. 39objects to be reported as orphan.
@@ -31,16 +42,24 @@ Memory scanning parameters can be modified at run-time by writing to the
31/sys/kernel/debug/kmemleak file. The following parameters are supported: 42/sys/kernel/debug/kmemleak file. The following parameters are supported:
32 43
33 off - disable kmemleak (irreversible) 44 off - disable kmemleak (irreversible)
34 stack=on - enable the task stacks scanning 45 stack=on - enable the task stacks scanning (default)
35 stack=off - disable the tasks stacks scanning 46 stack=off - disable the tasks stacks scanning
36 scan=on - start the automatic memory scanning thread 47 scan=on - start the automatic memory scanning thread (default)
37 scan=off - stop the automatic memory scanning thread 48 scan=off - stop the automatic memory scanning thread
38 scan=<secs> - set the automatic memory scanning period in seconds (0 49 scan=<secs> - set the automatic memory scanning period in seconds
39 to disable it) 50 (default 600, 0 to stop the automatic scanning)
51 scan - trigger a memory scan
52 clear - clear list of current memory leak suspects, done by
53 marking all current reported unreferenced objects grey
54 dump=<addr> - dump information about the object found at <addr>
40 55
41Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on 56Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on
42the kernel command line. 57the kernel command line.
43 58
59Memory may be allocated or freed before kmemleak is initialised and
60these actions are stored in an early log buffer. The size of this buffer
61is configured via the CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE option.
62
44Basic Algorithm 63Basic Algorithm
45--------------- 64---------------
46 65
@@ -77,6 +96,27 @@ avoid this, kmemleak can also store the number of values pointing to an
77address inside the block address range that need to be found so that the 96address inside the block address range that need to be found so that the
78block is not considered a leak. One example is __vmalloc(). 97block is not considered a leak. One example is __vmalloc().
79 98
99Testing specific sections with kmemleak
100---------------------------------------
101
102Upon initial bootup your /sys/kernel/debug/kmemleak output page may be
103quite extensive. This can also be the case if you have very buggy code
104when doing development. To work around these situations you can use the
105'clear' command to clear all reported unreferenced objects from the
106/sys/kernel/debug/kmemleak output. By issuing a 'scan' after a 'clear'
107you can find new unreferenced objects; this should help with testing
108specific sections of code.
109
110To test a critical section on demand with a clean kmemleak do:
111
112 # echo clear > /sys/kernel/debug/kmemleak
113 ... test your kernel or modules ...
114 # echo scan > /sys/kernel/debug/kmemleak
115
116Then as usual to get your report with:
117
118 # cat /sys/kernel/debug/kmemleak
119
80Kmemleak API 120Kmemleak API
81------------ 121------------
82 122
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 1e7a769a10f9..053037a1fe6d 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -507,9 +507,9 @@ http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115)
507Appendix A: The kprobes debugfs interface 507Appendix A: The kprobes debugfs interface
508 508
509With recent kernels (> 2.6.20) the list of registered kprobes is visible 509With recent kernels (> 2.6.20) the list of registered kprobes is visible
510under the /debug/kprobes/ directory (assuming debugfs is mounted at /debug). 510under the /sys/kernel/debug/kprobes/ directory (assuming debugfs is mounted at //sys/kernel/debug).
511 511
512/debug/kprobes/list: Lists all registered probes on the system 512/sys/kernel/debug/kprobes/list: Lists all registered probes on the system
513 513
514c015d71a k vfs_read+0x0 514c015d71a k vfs_read+0x0
515c011a316 j do_fork+0x0 515c011a316 j do_fork+0x0
@@ -525,7 +525,7 @@ virtual addresses that correspond to modules that've been unloaded),
525such probes are marked with [GONE]. If the probe is temporarily disabled, 525such probes are marked with [GONE]. If the probe is temporarily disabled,
526such probes are marked with [DISABLED]. 526such probes are marked with [DISABLED].
527 527
528/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly. 528/sys/kernel/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly.
529 529
530Provides a knob to globally and forcibly turn registered kprobes ON or OFF. 530Provides a knob to globally and forcibly turn registered kprobes ON or OFF.
531By default, all kprobes are enabled. By echoing "0" to this file, all 531By default, all kprobes are enabled. By echoing "0" to this file, all
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
new file mode 100644
index 000000000000..5a4bc8cf6d04
--- /dev/null
+++ b/Documentation/kvm/api.txt
@@ -0,0 +1,759 @@
1The Definitive KVM (Kernel-based Virtual Machine) API Documentation
2===================================================================
3
41. General description
5
6The kvm API is a set of ioctls that are issued to control various aspects
7of a virtual machine. The ioctls belong to three classes
8
9 - System ioctls: These query and set global attributes which affect the
10 whole kvm subsystem. In addition a system ioctl is used to create
11 virtual machines
12
13 - VM ioctls: These query and set attributes that affect an entire virtual
14 machine, for example memory layout. In addition a VM ioctl is used to
15 create virtual cpus (vcpus).
16
17 Only run VM ioctls from the same process (address space) that was used
18 to create the VM.
19
20 - vcpu ioctls: These query and set attributes that control the operation
21 of a single virtual cpu.
22
23 Only run vcpu ioctls from the same thread that was used to create the
24 vcpu.
25
262. File descritpors
27
28The kvm API is centered around file descriptors. An initial
29open("/dev/kvm") obtains a handle to the kvm subsystem; this handle
30can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this
31handle will create a VM file descripror which can be used to issue VM
32ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu
33and return a file descriptor pointing to it. Finally, ioctls on a vcpu
34fd can be used to control the vcpu, including the important task of
35actually running guest code.
36
37In general file descriptors can be migrated among processes by means
38of fork() and the SCM_RIGHTS facility of unix domain socket. These
39kinds of tricks are explicitly not supported by kvm. While they will
40not cause harm to the host, their actual behavior is not guaranteed by
41the API. The only supported use is one virtual machine per process,
42and one vcpu per thread.
43
443. Extensions
45
46As of Linux 2.6.22, the KVM ABI has been stabilized: no backward
47incompatible change are allowed. However, there is an extension
48facility that allows backward-compatible extensions to the API to be
49queried and used.
50
51The extension mechanism is not based on on the Linux version number.
52Instead, kvm defines extension identifiers and a facility to query
53whether a particular extension identifier is available. If it is, a
54set of ioctls is available for application use.
55
564. API description
57
58This section describes ioctls that can be used to control kvm guests.
59For each ioctl, the following information is provided along with a
60description:
61
62 Capability: which KVM extension provides this ioctl. Can be 'basic',
63 which means that is will be provided by any kernel that supports
64 API version 12 (see section 4.1), or a KVM_CAP_xyz constant, which
65 means availability needs to be checked with KVM_CHECK_EXTENSION
66 (see section 4.4).
67
68 Architectures: which instruction set architectures provide this ioctl.
69 x86 includes both i386 and x86_64.
70
71 Type: system, vm, or vcpu.
72
73 Parameters: what parameters are accepted by the ioctl.
74
75 Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL)
76 are not detailed, but errors with specific meanings are.
77
784.1 KVM_GET_API_VERSION
79
80Capability: basic
81Architectures: all
82Type: system ioctl
83Parameters: none
84Returns: the constant KVM_API_VERSION (=12)
85
86This identifies the API version as the stable kvm API. It is not
87expected that this number will change. However, Linux 2.6.20 and
882.6.21 report earlier versions; these are not documented and not
89supported. Applications should refuse to run if KVM_GET_API_VERSION
90returns a value other than 12. If this check passes, all ioctls
91described as 'basic' will be available.
92
934.2 KVM_CREATE_VM
94
95Capability: basic
96Architectures: all
97Type: system ioctl
98Parameters: none
99Returns: a VM fd that can be used to control the new virtual machine.
100
101The new VM has no virtual cpus and no memory. An mmap() of a VM fd
102will access the virtual machine's physical address space; offset zero
103corresponds to guest physical address zero. Use of mmap() on a VM fd
104is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is
105available.
106
1074.3 KVM_GET_MSR_INDEX_LIST
108
109Capability: basic
110Architectures: x86
111Type: system
112Parameters: struct kvm_msr_list (in/out)
113Returns: 0 on success; -1 on error
114Errors:
115 E2BIG: the msr index list is to be to fit in the array specified by
116 the user.
117
118struct kvm_msr_list {
119 __u32 nmsrs; /* number of msrs in entries */
120 __u32 indices[0];
121};
122
123This ioctl returns the guest msrs that are supported. The list varies
124by kvm version and host processor, but does not change otherwise. The
125user fills in the size of the indices array in nmsrs, and in return
126kvm adjusts nmsrs to reflect the actual number of msrs and fills in
127the indices array with their numbers.
128
1294.4 KVM_CHECK_EXTENSION
130
131Capability: basic
132Architectures: all
133Type: system ioctl
134Parameters: extension identifier (KVM_CAP_*)
135Returns: 0 if unsupported; 1 (or some other positive integer) if supported
136
137The API allows the application to query about extensions to the core
138kvm API. Userspace passes an extension identifier (an integer) and
139receives an integer that describes the extension availability.
140Generally 0 means no and 1 means yes, but some extensions may report
141additional information in the integer return value.
142
1434.5 KVM_GET_VCPU_MMAP_SIZE
144
145Capability: basic
146Architectures: all
147Type: system ioctl
148Parameters: none
149Returns: size of vcpu mmap area, in bytes
150
151The KVM_RUN ioctl (cf.) communicates with userspace via a shared
152memory region. This ioctl returns the size of that region. See the
153KVM_RUN documentation for details.
154
1554.6 KVM_SET_MEMORY_REGION
156
157Capability: basic
158Architectures: all
159Type: vm ioctl
160Parameters: struct kvm_memory_region (in)
161Returns: 0 on success, -1 on error
162
163struct kvm_memory_region {
164 __u32 slot;
165 __u32 flags;
166 __u64 guest_phys_addr;
167 __u64 memory_size; /* bytes */
168};
169
170/* for kvm_memory_region::flags */
171#define KVM_MEM_LOG_DIRTY_PAGES 1UL
172
173This ioctl allows the user to create or modify a guest physical memory
174slot. When changing an existing slot, it may be moved in the guest
175physical memory space, or its flags may be modified. It may not be
176resized. Slots may not overlap.
177
178The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
179instructs kvm to keep track of writes to memory within the slot. See
180the KVM_GET_DIRTY_LOG ioctl.
181
182It is recommended to use the KVM_SET_USER_MEMORY_REGION ioctl instead
183of this API, if available. This newer API allows placing guest memory
184at specified locations in the host address space, yielding better
185control and easy access.
186
1874.6 KVM_CREATE_VCPU
188
189Capability: basic
190Architectures: all
191Type: vm ioctl
192Parameters: vcpu id (apic id on x86)
193Returns: vcpu fd on success, -1 on error
194
195This API adds a vcpu to a virtual machine. The vcpu id is a small integer
196in the range [0, max_vcpus).
197
1984.7 KVM_GET_DIRTY_LOG (vm ioctl)
199
200Capability: basic
201Architectures: x86
202Type: vm ioctl
203Parameters: struct kvm_dirty_log (in/out)
204Returns: 0 on success, -1 on error
205
206/* for KVM_GET_DIRTY_LOG */
207struct kvm_dirty_log {
208 __u32 slot;
209 __u32 padding;
210 union {
211 void __user *dirty_bitmap; /* one bit per page */
212 __u64 padding;
213 };
214};
215
216Given a memory slot, return a bitmap containing any pages dirtied
217since the last call to this ioctl. Bit 0 is the first page in the
218memory slot. Ensure the entire structure is cleared to avoid padding
219issues.
220
2214.8 KVM_SET_MEMORY_ALIAS
222
223Capability: basic
224Architectures: x86
225Type: vm ioctl
226Parameters: struct kvm_memory_alias (in)
227Returns: 0 (success), -1 (error)
228
229struct kvm_memory_alias {
230 __u32 slot; /* this has a different namespace than memory slots */
231 __u32 flags;
232 __u64 guest_phys_addr;
233 __u64 memory_size;
234 __u64 target_phys_addr;
235};
236
237Defines a guest physical address space region as an alias to another
238region. Useful for aliased address, for example the VGA low memory
239window. Should not be used with userspace memory.
240
2414.9 KVM_RUN
242
243Capability: basic
244Architectures: all
245Type: vcpu ioctl
246Parameters: none
247Returns: 0 on success, -1 on error
248Errors:
249 EINTR: an unmasked signal is pending
250
251This ioctl is used to run a guest virtual cpu. While there are no
252explicit parameters, there is an implicit parameter block that can be
253obtained by mmap()ing the vcpu fd at offset 0, with the size given by
254KVM_GET_VCPU_MMAP_SIZE. The parameter block is formatted as a 'struct
255kvm_run' (see below).
256
2574.10 KVM_GET_REGS
258
259Capability: basic
260Architectures: all
261Type: vcpu ioctl
262Parameters: struct kvm_regs (out)
263Returns: 0 on success, -1 on error
264
265Reads the general purpose registers from the vcpu.
266
267/* x86 */
268struct kvm_regs {
269 /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
270 __u64 rax, rbx, rcx, rdx;
271 __u64 rsi, rdi, rsp, rbp;
272 __u64 r8, r9, r10, r11;
273 __u64 r12, r13, r14, r15;
274 __u64 rip, rflags;
275};
276
2774.11 KVM_SET_REGS
278
279Capability: basic
280Architectures: all
281Type: vcpu ioctl
282Parameters: struct kvm_regs (in)
283Returns: 0 on success, -1 on error
284
285Writes the general purpose registers into the vcpu.
286
287See KVM_GET_REGS for the data structure.
288
2894.12 KVM_GET_SREGS
290
291Capability: basic
292Architectures: x86
293Type: vcpu ioctl
294Parameters: struct kvm_sregs (out)
295Returns: 0 on success, -1 on error
296
297Reads special registers from the vcpu.
298
299/* x86 */
300struct kvm_sregs {
301 struct kvm_segment cs, ds, es, fs, gs, ss;
302 struct kvm_segment tr, ldt;
303 struct kvm_dtable gdt, idt;
304 __u64 cr0, cr2, cr3, cr4, cr8;
305 __u64 efer;
306 __u64 apic_base;
307 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
308};
309
310interrupt_bitmap is a bitmap of pending external interrupts. At most
311one bit may be set. This interrupt has been acknowledged by the APIC
312but not yet injected into the cpu core.
313
3144.13 KVM_SET_SREGS
315
316Capability: basic
317Architectures: x86
318Type: vcpu ioctl
319Parameters: struct kvm_sregs (in)
320Returns: 0 on success, -1 on error
321
322Writes special registers into the vcpu. See KVM_GET_SREGS for the
323data structures.
324
3254.14 KVM_TRANSLATE
326
327Capability: basic
328Architectures: x86
329Type: vcpu ioctl
330Parameters: struct kvm_translation (in/out)
331Returns: 0 on success, -1 on error
332
333Translates a virtual address according to the vcpu's current address
334translation mode.
335
336struct kvm_translation {
337 /* in */
338 __u64 linear_address;
339
340 /* out */
341 __u64 physical_address;
342 __u8 valid;
343 __u8 writeable;
344 __u8 usermode;
345 __u8 pad[5];
346};
347
3484.15 KVM_INTERRUPT
349
350Capability: basic
351Architectures: x86
352Type: vcpu ioctl
353Parameters: struct kvm_interrupt (in)
354Returns: 0 on success, -1 on error
355
356Queues a hardware interrupt vector to be injected. This is only
357useful if in-kernel local APIC is not used.
358
359/* for KVM_INTERRUPT */
360struct kvm_interrupt {
361 /* in */
362 __u32 irq;
363};
364
365Note 'irq' is an interrupt vector, not an interrupt pin or line.
366
3674.16 KVM_DEBUG_GUEST
368
369Capability: basic
370Architectures: none
371Type: vcpu ioctl
372Parameters: none)
373Returns: -1 on error
374
375Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead.
376
3774.17 KVM_GET_MSRS
378
379Capability: basic
380Architectures: x86
381Type: vcpu ioctl
382Parameters: struct kvm_msrs (in/out)
383Returns: 0 on success, -1 on error
384
385Reads model-specific registers from the vcpu. Supported msr indices can
386be obtained using KVM_GET_MSR_INDEX_LIST.
387
388struct kvm_msrs {
389 __u32 nmsrs; /* number of msrs in entries */
390 __u32 pad;
391
392 struct kvm_msr_entry entries[0];
393};
394
395struct kvm_msr_entry {
396 __u32 index;
397 __u32 reserved;
398 __u64 data;
399};
400
401Application code should set the 'nmsrs' member (which indicates the
402size of the entries array) and the 'index' member of each array entry.
403kvm will fill in the 'data' member.
404
4054.18 KVM_SET_MSRS
406
407Capability: basic
408Architectures: x86
409Type: vcpu ioctl
410Parameters: struct kvm_msrs (in)
411Returns: 0 on success, -1 on error
412
413Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the
414data structures.
415
416Application code should set the 'nmsrs' member (which indicates the
417size of the entries array), and the 'index' and 'data' members of each
418array entry.
419
4204.19 KVM_SET_CPUID
421
422Capability: basic
423Architectures: x86
424Type: vcpu ioctl
425Parameters: struct kvm_cpuid (in)
426Returns: 0 on success, -1 on error
427
428Defines the vcpu responses to the cpuid instruction. Applications
429should use the KVM_SET_CPUID2 ioctl if available.
430
431
432struct kvm_cpuid_entry {
433 __u32 function;
434 __u32 eax;
435 __u32 ebx;
436 __u32 ecx;
437 __u32 edx;
438 __u32 padding;
439};
440
441/* for KVM_SET_CPUID */
442struct kvm_cpuid {
443 __u32 nent;
444 __u32 padding;
445 struct kvm_cpuid_entry entries[0];
446};
447
4484.20 KVM_SET_SIGNAL_MASK
449
450Capability: basic
451Architectures: x86
452Type: vcpu ioctl
453Parameters: struct kvm_signal_mask (in)
454Returns: 0 on success, -1 on error
455
456Defines which signals are blocked during execution of KVM_RUN. This
457signal mask temporarily overrides the threads signal mask. Any
458unblocked signal received (except SIGKILL and SIGSTOP, which retain
459their traditional behaviour) will cause KVM_RUN to return with -EINTR.
460
461Note the signal will only be delivered if not blocked by the original
462signal mask.
463
464/* for KVM_SET_SIGNAL_MASK */
465struct kvm_signal_mask {
466 __u32 len;
467 __u8 sigset[0];
468};
469
4704.21 KVM_GET_FPU
471
472Capability: basic
473Architectures: x86
474Type: vcpu ioctl
475Parameters: struct kvm_fpu (out)
476Returns: 0 on success, -1 on error
477
478Reads the floating point state from the vcpu.
479
480/* for KVM_GET_FPU and KVM_SET_FPU */
481struct kvm_fpu {
482 __u8 fpr[8][16];
483 __u16 fcw;
484 __u16 fsw;
485 __u8 ftwx; /* in fxsave format */
486 __u8 pad1;
487 __u16 last_opcode;
488 __u64 last_ip;
489 __u64 last_dp;
490 __u8 xmm[16][16];
491 __u32 mxcsr;
492 __u32 pad2;
493};
494
4954.22 KVM_SET_FPU
496
497Capability: basic
498Architectures: x86
499Type: vcpu ioctl
500Parameters: struct kvm_fpu (in)
501Returns: 0 on success, -1 on error
502
503Writes the floating point state to the vcpu.
504
505/* for KVM_GET_FPU and KVM_SET_FPU */
506struct kvm_fpu {
507 __u8 fpr[8][16];
508 __u16 fcw;
509 __u16 fsw;
510 __u8 ftwx; /* in fxsave format */
511 __u8 pad1;
512 __u16 last_opcode;
513 __u64 last_ip;
514 __u64 last_dp;
515 __u8 xmm[16][16];
516 __u32 mxcsr;
517 __u32 pad2;
518};
519
5204.23 KVM_CREATE_IRQCHIP
521
522Capability: KVM_CAP_IRQCHIP
523Architectures: x86, ia64
524Type: vm ioctl
525Parameters: none
526Returns: 0 on success, -1 on error
527
528Creates an interrupt controller model in the kernel. On x86, creates a virtual
529ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a
530local APIC. IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23
531only go to the IOAPIC. On ia64, a IOSAPIC is created.
532
5334.24 KVM_IRQ_LINE
534
535Capability: KVM_CAP_IRQCHIP
536Architectures: x86, ia64
537Type: vm ioctl
538Parameters: struct kvm_irq_level
539Returns: 0 on success, -1 on error
540
541Sets the level of a GSI input to the interrupt controller model in the kernel.
542Requires that an interrupt controller model has been previously created with
543KVM_CREATE_IRQCHIP. Note that edge-triggered interrupts require the level
544to be set to 1 and then back to 0.
545
546struct kvm_irq_level {
547 union {
548 __u32 irq; /* GSI */
549 __s32 status; /* not used for KVM_IRQ_LEVEL */
550 };
551 __u32 level; /* 0 or 1 */
552};
553
5544.25 KVM_GET_IRQCHIP
555
556Capability: KVM_CAP_IRQCHIP
557Architectures: x86, ia64
558Type: vm ioctl
559Parameters: struct kvm_irqchip (in/out)
560Returns: 0 on success, -1 on error
561
562Reads the state of a kernel interrupt controller created with
563KVM_CREATE_IRQCHIP into a buffer provided by the caller.
564
565struct kvm_irqchip {
566 __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */
567 __u32 pad;
568 union {
569 char dummy[512]; /* reserving space */
570 struct kvm_pic_state pic;
571 struct kvm_ioapic_state ioapic;
572 } chip;
573};
574
5754.26 KVM_SET_IRQCHIP
576
577Capability: KVM_CAP_IRQCHIP
578Architectures: x86, ia64
579Type: vm ioctl
580Parameters: struct kvm_irqchip (in)
581Returns: 0 on success, -1 on error
582
583Sets the state of a kernel interrupt controller created with
584KVM_CREATE_IRQCHIP from a buffer provided by the caller.
585
586struct kvm_irqchip {
587 __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */
588 __u32 pad;
589 union {
590 char dummy[512]; /* reserving space */
591 struct kvm_pic_state pic;
592 struct kvm_ioapic_state ioapic;
593 } chip;
594};
595
5965. The kvm_run structure
597
598Application code obtains a pointer to the kvm_run structure by
599mmap()ing a vcpu fd. From that point, application code can control
600execution by changing fields in kvm_run prior to calling the KVM_RUN
601ioctl, and obtain information about the reason KVM_RUN returned by
602looking up structure members.
603
604struct kvm_run {
605 /* in */
606 __u8 request_interrupt_window;
607
608Request that KVM_RUN return when it becomes possible to inject external
609interrupts into the guest. Useful in conjunction with KVM_INTERRUPT.
610
611 __u8 padding1[7];
612
613 /* out */
614 __u32 exit_reason;
615
616When KVM_RUN has returned successfully (return value 0), this informs
617application code why KVM_RUN has returned. Allowable values for this
618field are detailed below.
619
620 __u8 ready_for_interrupt_injection;
621
622If request_interrupt_window has been specified, this field indicates
623an interrupt can be injected now with KVM_INTERRUPT.
624
625 __u8 if_flag;
626
627The value of the current interrupt flag. Only valid if in-kernel
628local APIC is not used.
629
630 __u8 padding2[2];
631
632 /* in (pre_kvm_run), out (post_kvm_run) */
633 __u64 cr8;
634
635The value of the cr8 register. Only valid if in-kernel local APIC is
636not used. Both input and output.
637
638 __u64 apic_base;
639
640The value of the APIC BASE msr. Only valid if in-kernel local
641APIC is not used. Both input and output.
642
643 union {
644 /* KVM_EXIT_UNKNOWN */
645 struct {
646 __u64 hardware_exit_reason;
647 } hw;
648
649If exit_reason is KVM_EXIT_UNKNOWN, the vcpu has exited due to unknown
650reasons. Further architecture-specific information is available in
651hardware_exit_reason.
652
653 /* KVM_EXIT_FAIL_ENTRY */
654 struct {
655 __u64 hardware_entry_failure_reason;
656 } fail_entry;
657
658If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due
659to unknown reasons. Further architecture-specific information is
660available in hardware_entry_failure_reason.
661
662 /* KVM_EXIT_EXCEPTION */
663 struct {
664 __u32 exception;
665 __u32 error_code;
666 } ex;
667
668Unused.
669
670 /* KVM_EXIT_IO */
671 struct {
672#define KVM_EXIT_IO_IN 0
673#define KVM_EXIT_IO_OUT 1
674 __u8 direction;
675 __u8 size; /* bytes */
676 __u16 port;
677 __u32 count;
678 __u64 data_offset; /* relative to kvm_run start */
679 } io;
680
681If exit_reason is KVM_EXIT_IO_IN or KVM_EXIT_IO_OUT, then the vcpu has
682executed a port I/O instruction which could not be satisfied by kvm.
683data_offset describes where the data is located (KVM_EXIT_IO_OUT) or
684where kvm expects application code to place the data for the next
685KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a patcked array.
686
687 struct {
688 struct kvm_debug_exit_arch arch;
689 } debug;
690
691Unused.
692
693 /* KVM_EXIT_MMIO */
694 struct {
695 __u64 phys_addr;
696 __u8 data[8];
697 __u32 len;
698 __u8 is_write;
699 } mmio;
700
701If exit_reason is KVM_EXIT_MMIO or KVM_EXIT_IO_OUT, then the vcpu has
702executed a memory-mapped I/O instruction which could not be satisfied
703by kvm. The 'data' member contains the written data if 'is_write' is
704true, and should be filled by application code otherwise.
705
706 /* KVM_EXIT_HYPERCALL */
707 struct {
708 __u64 nr;
709 __u64 args[6];
710 __u64 ret;
711 __u32 longmode;
712 __u32 pad;
713 } hypercall;
714
715Unused.
716
717 /* KVM_EXIT_TPR_ACCESS */
718 struct {
719 __u64 rip;
720 __u32 is_write;
721 __u32 pad;
722 } tpr_access;
723
724To be documented (KVM_TPR_ACCESS_REPORTING).
725
726 /* KVM_EXIT_S390_SIEIC */
727 struct {
728 __u8 icptcode;
729 __u64 mask; /* psw upper half */
730 __u64 addr; /* psw lower half */
731 __u16 ipa;
732 __u32 ipb;
733 } s390_sieic;
734
735s390 specific.
736
737 /* KVM_EXIT_S390_RESET */
738#define KVM_S390_RESET_POR 1
739#define KVM_S390_RESET_CLEAR 2
740#define KVM_S390_RESET_SUBSYSTEM 4
741#define KVM_S390_RESET_CPU_INIT 8
742#define KVM_S390_RESET_IPL 16
743 __u64 s390_reset_flags;
744
745s390 specific.
746
747 /* KVM_EXIT_DCR */
748 struct {
749 __u32 dcrn;
750 __u32 data;
751 __u8 is_write;
752 } dcr;
753
754powerpc specific.
755
756 /* Fix the size of the union. */
757 char padding[256];
758 };
759};
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt
index 78e354b42f67..e2ddcdeb61b6 100644
--- a/Documentation/laptops/thinkpad-acpi.txt
+++ b/Documentation/laptops/thinkpad-acpi.txt
@@ -36,8 +36,6 @@ detailed description):
36 - Bluetooth enable and disable 36 - Bluetooth enable and disable
37 - video output switching, expansion control 37 - video output switching, expansion control
38 - ThinkLight on and off 38 - ThinkLight on and off
39 - limited docking and undocking
40 - UltraBay eject
41 - CMOS/UCMS control 39 - CMOS/UCMS control
42 - LED control 40 - LED control
43 - ACPI sounds 41 - ACPI sounds
@@ -729,131 +727,6 @@ cannot be read or if it is unknown, thinkpad-acpi will report it as "off".
729It is impossible to know if the status returned through sysfs is valid. 727It is impossible to know if the status returned through sysfs is valid.
730 728
731 729
732Docking / undocking -- /proc/acpi/ibm/dock
733------------------------------------------
734
735Docking and undocking (e.g. with the X4 UltraBase) requires some
736actions to be taken by the operating system to safely make or break
737the electrical connections with the dock.
738
739The docking feature of this driver generates the following ACPI events:
740
741 ibm/dock GDCK 00000003 00000001 -- eject request
742 ibm/dock GDCK 00000003 00000002 -- undocked
743 ibm/dock GDCK 00000000 00000003 -- docked
744
745NOTE: These events will only be generated if the laptop was docked
746when originally booted. This is due to the current lack of support for
747hot plugging of devices in the Linux ACPI framework. If the laptop was
748booted while not in the dock, the following message is shown in the
749logs:
750
751 Mar 17 01:42:34 aero kernel: thinkpad_acpi: dock device not present
752
753In this case, no dock-related events are generated but the dock and
754undock commands described below still work. They can be executed
755manually or triggered by Fn key combinations (see the example acpid
756configuration files included in the driver tarball package available
757on the web site).
758
759When the eject request button on the dock is pressed, the first event
760above is generated. The handler for this event should issue the
761following command:
762
763 echo undock > /proc/acpi/ibm/dock
764
765After the LED on the dock goes off, it is safe to eject the laptop.
766Note: if you pressed this key by mistake, go ahead and eject the
767laptop, then dock it back in. Otherwise, the dock may not function as
768expected.
769
770When the laptop is docked, the third event above is generated. The
771handler for this event should issue the following command to fully
772enable the dock:
773
774 echo dock > /proc/acpi/ibm/dock
775
776The contents of the /proc/acpi/ibm/dock file shows the current status
777of the dock, as provided by the ACPI framework.
778
779The docking support in this driver does not take care of enabling or
780disabling any other devices you may have attached to the dock. For
781example, a CD drive plugged into the UltraBase needs to be disabled or
782enabled separately. See the provided example acpid configuration files
783for how this can be accomplished.
784
785There is no support yet for PCI devices that may be attached to a
786docking station, e.g. in the ThinkPad Dock II. The driver currently
787does not recognize, enable or disable such devices. This means that
788the only docking stations currently supported are the X-series
789UltraBase docks and "dumb" port replicators like the Mini Dock (the
790latter don't need any ACPI support, actually).
791
792
793UltraBay eject -- /proc/acpi/ibm/bay
794------------------------------------
795
796Inserting or ejecting an UltraBay device requires some actions to be
797taken by the operating system to safely make or break the electrical
798connections with the device.
799
800This feature generates the following ACPI events:
801
802 ibm/bay MSTR 00000003 00000000 -- eject request
803 ibm/bay MSTR 00000001 00000000 -- eject lever inserted
804
805NOTE: These events will only be generated if the UltraBay was present
806when the laptop was originally booted (on the X series, the UltraBay
807is in the dock, so it may not be present if the laptop was undocked).
808This is due to the current lack of support for hot plugging of devices
809in the Linux ACPI framework. If the laptop was booted without the
810UltraBay, the following message is shown in the logs:
811
812 Mar 17 01:42:34 aero kernel: thinkpad_acpi: bay device not present
813
814In this case, no bay-related events are generated but the eject
815command described below still works. It can be executed manually or
816triggered by a hot key combination.
817
818Sliding the eject lever generates the first event shown above. The
819handler for this event should take whatever actions are necessary to
820shut down the device in the UltraBay (e.g. call idectl), then issue
821the following command:
822
823 echo eject > /proc/acpi/ibm/bay
824
825After the LED on the UltraBay goes off, it is safe to pull out the
826device.
827
828When the eject lever is inserted, the second event above is
829generated. The handler for this event should take whatever actions are
830necessary to enable the UltraBay device (e.g. call idectl).
831
832The contents of the /proc/acpi/ibm/bay file shows the current status
833of the UltraBay, as provided by the ACPI framework.
834
835EXPERIMENTAL warm eject support on the 600e/x, A22p and A3x (To use
836this feature, you need to supply the experimental=1 parameter when
837loading the module):
838
839These models do not have a button near the UltraBay device to request
840a hot eject but rather require the laptop to be put to sleep
841(suspend-to-ram) before the bay device is ejected or inserted).
842The sequence of steps to eject the device is as follows:
843
844 echo eject > /proc/acpi/ibm/bay
845 put the ThinkPad to sleep
846 remove the drive
847 resume from sleep
848 cat /proc/acpi/ibm/bay should show that the drive was removed
849
850On the A3x, both the UltraBay 2000 and UltraBay Plus devices are
851supported. Use "eject2" instead of "eject" for the second bay.
852
853Note: the UltraBay eject support on the 600e/x, A22p and A3x is
854EXPERIMENTAL and may not work as expected. USE WITH CAUTION!
855
856
857CMOS/UCMS control 730CMOS/UCMS control
858----------------- 731-----------------
859 732
@@ -920,7 +793,7 @@ The available commands are:
920 echo '<LED number> off' >/proc/acpi/ibm/led 793 echo '<LED number> off' >/proc/acpi/ibm/led
921 echo '<LED number> blink' >/proc/acpi/ibm/led 794 echo '<LED number> blink' >/proc/acpi/ibm/led
922 795
923The <LED number> range is 0 to 7. The set of LEDs that can be 796The <LED number> range is 0 to 15. The set of LEDs that can be
924controlled varies from model to model. Here is the common ThinkPad 797controlled varies from model to model. Here is the common ThinkPad
925mapping: 798mapping:
926 799
@@ -932,6 +805,11 @@ mapping:
932 5 - UltraBase battery slot 805 5 - UltraBase battery slot
933 6 - (unknown) 806 6 - (unknown)
934 7 - standby 807 7 - standby
808 8 - dock status 1
809 9 - dock status 2
810 10, 11 - (unknown)
811 12 - thinkvantage
812 13, 14, 15 - (unknown)
935 813
936All of the above can be turned on and off and can be made to blink. 814All of the above can be turned on and off and can be made to blink.
937 815
@@ -940,10 +818,12 @@ sysfs notes:
940The ThinkPad LED sysfs interface is described in detail by the LED class 818The ThinkPad LED sysfs interface is described in detail by the LED class
941documentation, in Documentation/leds-class.txt. 819documentation, in Documentation/leds-class.txt.
942 820
943The leds are named (in LED ID order, from 0 to 7): 821The LEDs are named (in LED ID order, from 0 to 12):
944"tpacpi::power", "tpacpi:orange:batt", "tpacpi:green:batt", 822"tpacpi::power", "tpacpi:orange:batt", "tpacpi:green:batt",
945"tpacpi::dock_active", "tpacpi::bay_active", "tpacpi::dock_batt", 823"tpacpi::dock_active", "tpacpi::bay_active", "tpacpi::dock_batt",
946"tpacpi::unknown_led", "tpacpi::standby". 824"tpacpi::unknown_led", "tpacpi::standby", "tpacpi::dock_status1",
825"tpacpi::dock_status2", "tpacpi::unknown_led2", "tpacpi::unknown_led3",
826"tpacpi::thinkvantage".
947 827
948Due to limitations in the sysfs LED class, if the status of the LED 828Due to limitations in the sysfs LED class, if the status of the LED
949indicators cannot be read due to an error, thinkpad-acpi will report it as 829indicators cannot be read due to an error, thinkpad-acpi will report it as
@@ -958,6 +838,12 @@ ThinkPad indicator LED should blink in hardware accelerated mode, use the
958"timer" trigger, and leave the delay_on and delay_off parameters set to 838"timer" trigger, and leave the delay_on and delay_off parameters set to
959zero (to request hardware acceleration autodetection). 839zero (to request hardware acceleration autodetection).
960 840
841LEDs that are known not to exist in a given ThinkPad model are not
842made available through the sysfs interface. If you have a dock and you
843notice there are LEDs listed for your ThinkPad that do not exist (and
844are not in the dock), or if you notice that there are missing LEDs,
845a report to ibm-acpi-devel@lists.sourceforge.net is appreciated.
846
961 847
962ACPI sounds -- /proc/acpi/ibm/beep 848ACPI sounds -- /proc/acpi/ibm/beep
963---------------------------------- 849----------------------------------
@@ -1156,17 +1042,19 @@ may not be distinct. Later Lenovo models that implement the ACPI
1156display backlight brightness control methods have 16 levels, ranging 1042display backlight brightness control methods have 16 levels, ranging
1157from 0 to 15. 1043from 0 to 15.
1158 1044
1159There are two interfaces to the firmware for direct brightness control, 1045For IBM ThinkPads, there are two interfaces to the firmware for direct
1160EC and UCMS (or CMOS). To select which one should be used, use the 1046brightness control, EC and UCMS (or CMOS). To select which one should be
1161brightness_mode module parameter: brightness_mode=1 selects EC mode, 1047used, use the brightness_mode module parameter: brightness_mode=1 selects
1162brightness_mode=2 selects UCMS mode, brightness_mode=3 selects EC 1048EC mode, brightness_mode=2 selects UCMS mode, brightness_mode=3 selects EC
1163mode with NVRAM backing (so that brightness changes are remembered 1049mode with NVRAM backing (so that brightness changes are remembered across
1164across shutdown/reboot). 1050shutdown/reboot).
1165 1051
1166The driver tries to select which interface to use from a table of 1052The driver tries to select which interface to use from a table of
1167defaults for each ThinkPad model. If it makes a wrong choice, please 1053defaults for each ThinkPad model. If it makes a wrong choice, please
1168report this as a bug, so that we can fix it. 1054report this as a bug, so that we can fix it.
1169 1055
1056Lenovo ThinkPads only support brightness_mode=2 (UCMS).
1057
1170When display backlight brightness controls are available through the 1058When display backlight brightness controls are available through the
1171standard ACPI interface, it is best to use it instead of this direct 1059standard ACPI interface, it is best to use it instead of this direct
1172ThinkPad-specific interface. The driver will disable its native 1060ThinkPad-specific interface. The driver will disable its native
@@ -1254,7 +1142,7 @@ Fan control and monitoring: fan speed, fan enable/disable
1254 1142
1255procfs: /proc/acpi/ibm/fan 1143procfs: /proc/acpi/ibm/fan
1256sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1, 1144sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1,
1257 pwm1_enable 1145 pwm1_enable, fan2_input
1258sysfs hwmon driver attributes: fan_watchdog 1146sysfs hwmon driver attributes: fan_watchdog
1259 1147
1260NOTE NOTE NOTE: fan control operations are disabled by default for 1148NOTE NOTE NOTE: fan control operations are disabled by default for
@@ -1267,6 +1155,9 @@ from the hardware registers of the embedded controller. This is known
1267to work on later R, T, X and Z series ThinkPads but may show a bogus 1155to work on later R, T, X and Z series ThinkPads but may show a bogus
1268value on other models. 1156value on other models.
1269 1157
1158Some Lenovo ThinkPads support a secondary fan. This fan cannot be
1159controlled separately, it shares the main fan control.
1160
1270Fan levels: 1161Fan levels:
1271 1162
1272Most ThinkPad fans work in "levels" at the firmware interface. Level 0 1163Most ThinkPad fans work in "levels" at the firmware interface. Level 0
@@ -1397,6 +1288,11 @@ hwmon device attribute fan1_input:
1397 which can take up to two minutes. May return rubbish on older 1288 which can take up to two minutes. May return rubbish on older
1398 ThinkPads. 1289 ThinkPads.
1399 1290
1291hwmon device attribute fan2_input:
1292 Fan tachometer reading, in RPM, for the secondary fan.
1293 Available only on some ThinkPads. If the secondary fan is
1294 not installed, will always read 0.
1295
1400hwmon driver attribute fan_watchdog: 1296hwmon driver attribute fan_watchdog:
1401 Fan safety watchdog timer interval, in seconds. Minimum is 1297 Fan safety watchdog timer interval, in seconds. Minimum is
1402 1 second, maximum is 120 seconds. 0 disables the watchdog. 1298 1 second, maximum is 120 seconds. 0 disables the watchdog.
@@ -1555,3 +1451,7 @@ Sysfs interface changelog:
15550x020300: hotkey enable/disable support removed, attributes 14510x020300: hotkey enable/disable support removed, attributes
1556 hotkey_bios_enabled and hotkey_enable deprecated and 1452 hotkey_bios_enabled and hotkey_enable deprecated and
1557 marked for removal. 1453 marked for removal.
1454
14550x020400: Marker for 16 LEDs support. Also, LEDs that are known
1456 to not exist in a given model are not registered with
1457 the LED sysfs class anymore.
diff --git a/Documentation/leds-lp3944.txt b/Documentation/leds-lp3944.txt
new file mode 100644
index 000000000000..c6eda18b15ef
--- /dev/null
+++ b/Documentation/leds-lp3944.txt
@@ -0,0 +1,50 @@
1Kernel driver lp3944
2====================
3
4 * National Semiconductor LP3944 Fun-light Chip
5 Prefix: 'lp3944'
6 Addresses scanned: None (see the Notes section below)
7 Datasheet: Publicly available at the National Semiconductor website
8 http://www.national.com/pf/LP/LP3944.html
9
10Authors:
11 Antonio Ospite <ospite@studenti.unina.it>
12
13
14Description
15-----------
16The LP3944 is a helper chip that can drive up to 8 leds, with two programmable
17DIM modes; it could even be used as a gpio expander but this driver assumes it
18is used as a led controller.
19
20The DIM modes are used to set _blink_ patterns for leds, the pattern is
21specified supplying two parameters:
22 - period: from 0s to 1.6s
23 - duty cycle: percentage of the period the led is on, from 0 to 100
24
25Setting a led in DIM0 or DIM1 mode makes it blink according to the pattern.
26See the datasheet for details.
27
28LP3944 can be found on Motorola A910 smartphone, where it drives the rgb
29leds, the camera flash light and the lcds power.
30
31
32Notes
33-----
34The chip is used mainly in embedded contexts, so this driver expects it is
35registered using the i2c_board_info mechanism.
36
37To register the chip at address 0x60 on adapter 0, set the platform data
38according to include/linux/leds-lp3944.h, set the i2c board info:
39
40 static struct i2c_board_info __initdata a910_i2c_board_info[] = {
41 {
42 I2C_BOARD_INFO("lp3944", 0x60),
43 .platform_data = &a910_lp3944_leds,
44 },
45 };
46
47and register it in the platform init function
48
49 i2c_register_board_info(0, a910_i2c_board_info,
50 ARRAY_SIZE(a910_i2c_board_info));
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 9ebcd6ef361b..950cde6d6e58 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1,7 +1,9 @@
1/*P:100 This is the Launcher code, a simple program which lays out the 1/*P:100
2 * "physical" memory for the new Guest by mapping the kernel image and 2 * This is the Launcher code, a simple program which lays out the "physical"
3 * the virtual devices, then opens /dev/lguest to tell the kernel 3 * memory for the new Guest by mapping the kernel image and the virtual
4 * about the Guest and control it. :*/ 4 * devices, then opens /dev/lguest to tell the kernel about the Guest and
5 * control it.
6:*/
5#define _LARGEFILE64_SOURCE 7#define _LARGEFILE64_SOURCE
6#define _GNU_SOURCE 8#define _GNU_SOURCE
7#include <stdio.h> 9#include <stdio.h>
@@ -46,13 +48,15 @@
46#include "linux/virtio_rng.h" 48#include "linux/virtio_rng.h"
47#include "linux/virtio_ring.h" 49#include "linux/virtio_ring.h"
48#include "asm/bootparam.h" 50#include "asm/bootparam.h"
49/*L:110 We can ignore the 39 include files we need for this program, but I do 51/*L:110
50 * want to draw attention to the use of kernel-style types. 52 * We can ignore the 42 include files we need for this program, but I do want
53 * to draw attention to the use of kernel-style types.
51 * 54 *
52 * As Linus said, "C is a Spartan language, and so should your naming be." I 55 * As Linus said, "C is a Spartan language, and so should your naming be." I
53 * like these abbreviations, so we define them here. Note that u64 is always 56 * like these abbreviations, so we define them here. Note that u64 is always
54 * unsigned long long, which works on all Linux systems: this means that we can 57 * unsigned long long, which works on all Linux systems: this means that we can
55 * use %llu in printf for any u64. */ 58 * use %llu in printf for any u64.
59 */
56typedef unsigned long long u64; 60typedef unsigned long long u64;
57typedef uint32_t u32; 61typedef uint32_t u32;
58typedef uint16_t u16; 62typedef uint16_t u16;
@@ -69,8 +73,10 @@ typedef uint8_t u8;
69/* This will occupy 3 pages: it must be a power of 2. */ 73/* This will occupy 3 pages: it must be a power of 2. */
70#define VIRTQUEUE_NUM 256 74#define VIRTQUEUE_NUM 256
71 75
72/*L:120 verbose is both a global flag and a macro. The C preprocessor allows 76/*L:120
73 * this, and although I wouldn't recommend it, it works quite nicely here. */ 77 * verbose is both a global flag and a macro. The C preprocessor allows
78 * this, and although I wouldn't recommend it, it works quite nicely here.
79 */
74static bool verbose; 80static bool verbose;
75#define verbose(args...) \ 81#define verbose(args...) \
76 do { if (verbose) printf(args); } while(0) 82 do { if (verbose) printf(args); } while(0)
@@ -87,8 +93,7 @@ static int lguest_fd;
87static unsigned int __thread cpu_id; 93static unsigned int __thread cpu_id;
88 94
89/* This is our list of devices. */ 95/* This is our list of devices. */
90struct device_list 96struct device_list {
91{
92 /* Counter to assign interrupt numbers. */ 97 /* Counter to assign interrupt numbers. */
93 unsigned int next_irq; 98 unsigned int next_irq;
94 99
@@ -100,8 +105,7 @@ struct device_list
100 105
101 /* A single linked list of devices. */ 106 /* A single linked list of devices. */
102 struct device *dev; 107 struct device *dev;
103 /* And a pointer to the last device for easy append and also for 108 /* And a pointer to the last device for easy append. */
104 * configuration appending. */
105 struct device *lastdev; 109 struct device *lastdev;
106}; 110};
107 111
@@ -109,8 +113,7 @@ struct device_list
109static struct device_list devices; 113static struct device_list devices;
110 114
111/* The device structure describes a single device. */ 115/* The device structure describes a single device. */
112struct device 116struct device {
113{
114 /* The linked-list pointer. */ 117 /* The linked-list pointer. */
115 struct device *next; 118 struct device *next;
116 119
@@ -135,8 +138,7 @@ struct device
135}; 138};
136 139
137/* The virtqueue structure describes a queue attached to a device. */ 140/* The virtqueue structure describes a queue attached to a device. */
138struct virtqueue 141struct virtqueue {
139{
140 struct virtqueue *next; 142 struct virtqueue *next;
141 143
142 /* Which device owns me. */ 144 /* Which device owns me. */
@@ -168,20 +170,24 @@ static char **main_args;
168/* The original tty settings to restore on exit. */ 170/* The original tty settings to restore on exit. */
169static struct termios orig_term; 171static struct termios orig_term;
170 172
171/* We have to be careful with barriers: our devices are all run in separate 173/*
174 * We have to be careful with barriers: our devices are all run in separate
172 * threads and so we need to make sure that changes visible to the Guest happen 175 * threads and so we need to make sure that changes visible to the Guest happen
173 * in precise order. */ 176 * in precise order.
177 */
174#define wmb() __asm__ __volatile__("" : : : "memory") 178#define wmb() __asm__ __volatile__("" : : : "memory")
175#define mb() __asm__ __volatile__("" : : : "memory") 179#define mb() __asm__ __volatile__("" : : : "memory")
176 180
177/* Convert an iovec element to the given type. 181/*
182 * Convert an iovec element to the given type.
178 * 183 *
179 * This is a fairly ugly trick: we need to know the size of the type and 184 * This is a fairly ugly trick: we need to know the size of the type and
180 * alignment requirement to check the pointer is kosher. It's also nice to 185 * alignment requirement to check the pointer is kosher. It's also nice to
181 * have the name of the type in case we report failure. 186 * have the name of the type in case we report failure.
182 * 187 *
183 * Typing those three things all the time is cumbersome and error prone, so we 188 * Typing those three things all the time is cumbersome and error prone, so we
184 * have a macro which sets them all up and passes to the real function. */ 189 * have a macro which sets them all up and passes to the real function.
190 */
185#define convert(iov, type) \ 191#define convert(iov, type) \
186 ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) 192 ((type *)_convert((iov), sizeof(type), __alignof__(type), #type))
187 193
@@ -198,8 +204,10 @@ static void *_convert(struct iovec *iov, size_t size, size_t align,
198/* Wrapper for the last available index. Makes it easier to change. */ 204/* Wrapper for the last available index. Makes it easier to change. */
199#define lg_last_avail(vq) ((vq)->last_avail_idx) 205#define lg_last_avail(vq) ((vq)->last_avail_idx)
200 206
201/* The virtio configuration space is defined to be little-endian. x86 is 207/*
202 * little-endian too, but it's nice to be explicit so we have these helpers. */ 208 * The virtio configuration space is defined to be little-endian. x86 is
209 * little-endian too, but it's nice to be explicit so we have these helpers.
210 */
203#define cpu_to_le16(v16) (v16) 211#define cpu_to_le16(v16) (v16)
204#define cpu_to_le32(v32) (v32) 212#define cpu_to_le32(v32) (v32)
205#define cpu_to_le64(v64) (v64) 213#define cpu_to_le64(v64) (v64)
@@ -241,11 +249,12 @@ static u8 *get_feature_bits(struct device *dev)
241 + dev->num_vq * sizeof(struct lguest_vqconfig); 249 + dev->num_vq * sizeof(struct lguest_vqconfig);
242} 250}
243 251
244/*L:100 The Launcher code itself takes us out into userspace, that scary place 252/*L:100
245 * where pointers run wild and free! Unfortunately, like most userspace 253 * The Launcher code itself takes us out into userspace, that scary place where
246 * programs, it's quite boring (which is why everyone likes to hack on the 254 * pointers run wild and free! Unfortunately, like most userspace programs,
247 * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it 255 * it's quite boring (which is why everyone likes to hack on the kernel!).
248 * will get you through this section. Or, maybe not. 256 * Perhaps if you make up an Lguest Drinking Game at this point, it will get
257 * you through this section. Or, maybe not.
249 * 258 *
250 * The Launcher sets up a big chunk of memory to be the Guest's "physical" 259 * The Launcher sets up a big chunk of memory to be the Guest's "physical"
251 * memory and stores it in "guest_base". In other words, Guest physical == 260 * memory and stores it in "guest_base". In other words, Guest physical ==
@@ -253,7 +262,8 @@ static u8 *get_feature_bits(struct device *dev)
253 * 262 *
254 * This can be tough to get your head around, but usually it just means that we 263 * This can be tough to get your head around, but usually it just means that we
255 * use these trivial conversion functions when the Guest gives us it's 264 * use these trivial conversion functions when the Guest gives us it's
256 * "physical" addresses: */ 265 * "physical" addresses:
266 */
257static void *from_guest_phys(unsigned long addr) 267static void *from_guest_phys(unsigned long addr)
258{ 268{
259 return guest_base + addr; 269 return guest_base + addr;
@@ -268,7 +278,8 @@ static unsigned long to_guest_phys(const void *addr)
268 * Loading the Kernel. 278 * Loading the Kernel.
269 * 279 *
270 * We start with couple of simple helper routines. open_or_die() avoids 280 * We start with couple of simple helper routines. open_or_die() avoids
271 * error-checking code cluttering the callers: */ 281 * error-checking code cluttering the callers:
282 */
272static int open_or_die(const char *name, int flags) 283static int open_or_die(const char *name, int flags)
273{ 284{
274 int fd = open(name, flags); 285 int fd = open(name, flags);
@@ -283,12 +294,19 @@ static void *map_zeroed_pages(unsigned int num)
283 int fd = open_or_die("/dev/zero", O_RDONLY); 294 int fd = open_or_die("/dev/zero", O_RDONLY);
284 void *addr; 295 void *addr;
285 296
286 /* We use a private mapping (ie. if we write to the page, it will be 297 /*
287 * copied). */ 298 * We use a private mapping (ie. if we write to the page, it will be
299 * copied).
300 */
288 addr = mmap(NULL, getpagesize() * num, 301 addr = mmap(NULL, getpagesize() * num,
289 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); 302 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
290 if (addr == MAP_FAILED) 303 if (addr == MAP_FAILED)
291 err(1, "Mmaping %u pages of /dev/zero", num); 304 err(1, "Mmaping %u pages of /dev/zero", num);
305
306 /*
307 * One neat mmap feature is that you can close the fd, and it
308 * stays mapped.
309 */
292 close(fd); 310 close(fd);
293 311
294 return addr; 312 return addr;
@@ -305,20 +323,24 @@ static void *get_pages(unsigned int num)
305 return addr; 323 return addr;
306} 324}
307 325
308/* This routine is used to load the kernel or initrd. It tries mmap, but if 326/*
327 * This routine is used to load the kernel or initrd. It tries mmap, but if
309 * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), 328 * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
310 * it falls back to reading the memory in. */ 329 * it falls back to reading the memory in.
330 */
311static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) 331static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
312{ 332{
313 ssize_t r; 333 ssize_t r;
314 334
315 /* We map writable even though for some segments are marked read-only. 335 /*
336 * We map writable even though for some segments are marked read-only.
316 * The kernel really wants to be writable: it patches its own 337 * The kernel really wants to be writable: it patches its own
317 * instructions. 338 * instructions.
318 * 339 *
319 * MAP_PRIVATE means that the page won't be copied until a write is 340 * MAP_PRIVATE means that the page won't be copied until a write is
320 * done to it. This allows us to share untouched memory between 341 * done to it. This allows us to share untouched memory between
321 * Guests. */ 342 * Guests.
343 */
322 if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC, 344 if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC,
323 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) 345 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
324 return; 346 return;
@@ -329,7 +351,8 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
329 err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); 351 err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
330} 352}
331 353
332/* This routine takes an open vmlinux image, which is in ELF, and maps it into 354/*
355 * This routine takes an open vmlinux image, which is in ELF, and maps it into
333 * the Guest memory. ELF = Embedded Linking Format, which is the format used 356 * the Guest memory. ELF = Embedded Linking Format, which is the format used
334 * by all modern binaries on Linux including the kernel. 357 * by all modern binaries on Linux including the kernel.
335 * 358 *
@@ -337,23 +360,28 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
337 * address. We use the physical address; the Guest will map itself to the 360 * address. We use the physical address; the Guest will map itself to the
338 * virtual address. 361 * virtual address.
339 * 362 *
340 * We return the starting address. */ 363 * We return the starting address.
364 */
341static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) 365static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
342{ 366{
343 Elf32_Phdr phdr[ehdr->e_phnum]; 367 Elf32_Phdr phdr[ehdr->e_phnum];
344 unsigned int i; 368 unsigned int i;
345 369
346 /* Sanity checks on the main ELF header: an x86 executable with a 370 /*
347 * reasonable number of correctly-sized program headers. */ 371 * Sanity checks on the main ELF header: an x86 executable with a
372 * reasonable number of correctly-sized program headers.
373 */
348 if (ehdr->e_type != ET_EXEC 374 if (ehdr->e_type != ET_EXEC
349 || ehdr->e_machine != EM_386 375 || ehdr->e_machine != EM_386
350 || ehdr->e_phentsize != sizeof(Elf32_Phdr) 376 || ehdr->e_phentsize != sizeof(Elf32_Phdr)
351 || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) 377 || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
352 errx(1, "Malformed elf header"); 378 errx(1, "Malformed elf header");
353 379
354 /* An ELF executable contains an ELF header and a number of "program" 380 /*
381 * An ELF executable contains an ELF header and a number of "program"
355 * headers which indicate which parts ("segments") of the program to 382 * headers which indicate which parts ("segments") of the program to
356 * load where. */ 383 * load where.
384 */
357 385
358 /* We read in all the program headers at once: */ 386 /* We read in all the program headers at once: */
359 if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) 387 if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
@@ -361,8 +389,10 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
361 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) 389 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
362 err(1, "Reading program headers"); 390 err(1, "Reading program headers");
363 391
364 /* Try all the headers: there are usually only three. A read-only one, 392 /*
365 * a read-write one, and a "note" section which we don't load. */ 393 * Try all the headers: there are usually only three. A read-only one,
394 * a read-write one, and a "note" section which we don't load.
395 */
366 for (i = 0; i < ehdr->e_phnum; i++) { 396 for (i = 0; i < ehdr->e_phnum; i++) {
367 /* If this isn't a loadable segment, we ignore it */ 397 /* If this isn't a loadable segment, we ignore it */
368 if (phdr[i].p_type != PT_LOAD) 398 if (phdr[i].p_type != PT_LOAD)
@@ -380,13 +410,15 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
380 return ehdr->e_entry; 410 return ehdr->e_entry;
381} 411}
382 412
383/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're 413/*L:150
384 * supposed to jump into it and it will unpack itself. We used to have to 414 * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed
385 * perform some hairy magic because the unpacking code scared me. 415 * to jump into it and it will unpack itself. We used to have to perform some
416 * hairy magic because the unpacking code scared me.
386 * 417 *
387 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote 418 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
388 * a small patch to jump over the tricky bits in the Guest, so now we just read 419 * a small patch to jump over the tricky bits in the Guest, so now we just read
389 * the funky header so we know where in the file to load, and away we go! */ 420 * the funky header so we know where in the file to load, and away we go!
421 */
390static unsigned long load_bzimage(int fd) 422static unsigned long load_bzimage(int fd)
391{ 423{
392 struct boot_params boot; 424 struct boot_params boot;
@@ -394,8 +426,10 @@ static unsigned long load_bzimage(int fd)
394 /* Modern bzImages get loaded at 1M. */ 426 /* Modern bzImages get loaded at 1M. */
395 void *p = from_guest_phys(0x100000); 427 void *p = from_guest_phys(0x100000);
396 428
397 /* Go back to the start of the file and read the header. It should be 429 /*
398 * a Linux boot header (see Documentation/x86/i386/boot.txt) */ 430 * Go back to the start of the file and read the header. It should be
431 * a Linux boot header (see Documentation/x86/i386/boot.txt)
432 */
399 lseek(fd, 0, SEEK_SET); 433 lseek(fd, 0, SEEK_SET);
400 read(fd, &boot, sizeof(boot)); 434 read(fd, &boot, sizeof(boot));
401 435
@@ -414,9 +448,11 @@ static unsigned long load_bzimage(int fd)
414 return boot.hdr.code32_start; 448 return boot.hdr.code32_start;
415} 449}
416 450
417/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels 451/*L:140
452 * Loading the kernel is easy when it's a "vmlinux", but most kernels
418 * come wrapped up in the self-decompressing "bzImage" format. With a little 453 * come wrapped up in the self-decompressing "bzImage" format. With a little
419 * work, we can load those, too. */ 454 * work, we can load those, too.
455 */
420static unsigned long load_kernel(int fd) 456static unsigned long load_kernel(int fd)
421{ 457{
422 Elf32_Ehdr hdr; 458 Elf32_Ehdr hdr;
@@ -433,24 +469,28 @@ static unsigned long load_kernel(int fd)
433 return load_bzimage(fd); 469 return load_bzimage(fd);
434} 470}
435 471
436/* This is a trivial little helper to align pages. Andi Kleen hated it because 472/*
473 * This is a trivial little helper to align pages. Andi Kleen hated it because
437 * it calls getpagesize() twice: "it's dumb code." 474 * it calls getpagesize() twice: "it's dumb code."
438 * 475 *
439 * Kernel guys get really het up about optimization, even when it's not 476 * Kernel guys get really het up about optimization, even when it's not
440 * necessary. I leave this code as a reaction against that. */ 477 * necessary. I leave this code as a reaction against that.
478 */
441static inline unsigned long page_align(unsigned long addr) 479static inline unsigned long page_align(unsigned long addr)
442{ 480{
443 /* Add upwards and truncate downwards. */ 481 /* Add upwards and truncate downwards. */
444 return ((addr + getpagesize()-1) & ~(getpagesize()-1)); 482 return ((addr + getpagesize()-1) & ~(getpagesize()-1));
445} 483}
446 484
447/*L:180 An "initial ram disk" is a disk image loaded into memory along with 485/*L:180
448 * the kernel which the kernel can use to boot from without needing any 486 * An "initial ram disk" is a disk image loaded into memory along with the
449 * drivers. Most distributions now use this as standard: the initrd contains 487 * kernel which the kernel can use to boot from without needing any drivers.
450 * the code to load the appropriate driver modules for the current machine. 488 * Most distributions now use this as standard: the initrd contains the code to
489 * load the appropriate driver modules for the current machine.
451 * 490 *
452 * Importantly, James Morris works for RedHat, and Fedora uses initrds for its 491 * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
453 * kernels. He sent me this (and tells me when I break it). */ 492 * kernels. He sent me this (and tells me when I break it).
493 */
454static unsigned long load_initrd(const char *name, unsigned long mem) 494static unsigned long load_initrd(const char *name, unsigned long mem)
455{ 495{
456 int ifd; 496 int ifd;
@@ -462,12 +502,16 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
462 if (fstat(ifd, &st) < 0) 502 if (fstat(ifd, &st) < 0)
463 err(1, "fstat() on initrd '%s'", name); 503 err(1, "fstat() on initrd '%s'", name);
464 504
465 /* We map the initrd at the top of memory, but mmap wants it to be 505 /*
466 * page-aligned, so we round the size up for that. */ 506 * We map the initrd at the top of memory, but mmap wants it to be
507 * page-aligned, so we round the size up for that.
508 */
467 len = page_align(st.st_size); 509 len = page_align(st.st_size);
468 map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); 510 map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
469 /* Once a file is mapped, you can close the file descriptor. It's a 511 /*
470 * little odd, but quite useful. */ 512 * Once a file is mapped, you can close the file descriptor. It's a
513 * little odd, but quite useful.
514 */
471 close(ifd); 515 close(ifd);
472 verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); 516 verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
473 517
@@ -476,8 +520,10 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
476} 520}
477/*:*/ 521/*:*/
478 522
479/* Simple routine to roll all the commandline arguments together with spaces 523/*
480 * between them. */ 524 * Simple routine to roll all the commandline arguments together with spaces
525 * between them.
526 */
481static void concat(char *dst, char *args[]) 527static void concat(char *dst, char *args[])
482{ 528{
483 unsigned int i, len = 0; 529 unsigned int i, len = 0;
@@ -494,10 +540,12 @@ static void concat(char *dst, char *args[])
494 dst[len] = '\0'; 540 dst[len] = '\0';
495} 541}
496 542
497/*L:185 This is where we actually tell the kernel to initialize the Guest. We 543/*L:185
544 * This is where we actually tell the kernel to initialize the Guest. We
498 * saw the arguments it expects when we looked at initialize() in lguest_user.c: 545 * saw the arguments it expects when we looked at initialize() in lguest_user.c:
499 * the base of Guest "physical" memory, the top physical page to allow and the 546 * the base of Guest "physical" memory, the top physical page to allow and the
500 * entry point for the Guest. */ 547 * entry point for the Guest.
548 */
501static void tell_kernel(unsigned long start) 549static void tell_kernel(unsigned long start)
502{ 550{
503 unsigned long args[] = { LHREQ_INITIALIZE, 551 unsigned long args[] = { LHREQ_INITIALIZE,
@@ -511,7 +559,7 @@ static void tell_kernel(unsigned long start)
511} 559}
512/*:*/ 560/*:*/
513 561
514/* 562/*L:200
515 * Device Handling. 563 * Device Handling.
516 * 564 *
517 * When the Guest gives us a buffer, it sends an array of addresses and sizes. 565 * When the Guest gives us a buffer, it sends an array of addresses and sizes.
@@ -522,20 +570,26 @@ static void tell_kernel(unsigned long start)
522static void *_check_pointer(unsigned long addr, unsigned int size, 570static void *_check_pointer(unsigned long addr, unsigned int size,
523 unsigned int line) 571 unsigned int line)
524{ 572{
525 /* We have to separately check addr and addr+size, because size could 573 /*
526 * be huge and addr + size might wrap around. */ 574 * We have to separately check addr and addr+size, because size could
575 * be huge and addr + size might wrap around.
576 */
527 if (addr >= guest_limit || addr + size >= guest_limit) 577 if (addr >= guest_limit || addr + size >= guest_limit)
528 errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); 578 errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
529 /* We return a pointer for the caller's convenience, now we know it's 579 /*
530 * safe to use. */ 580 * We return a pointer for the caller's convenience, now we know it's
581 * safe to use.
582 */
531 return from_guest_phys(addr); 583 return from_guest_phys(addr);
532} 584}
533/* A macro which transparently hands the line number to the real function. */ 585/* A macro which transparently hands the line number to the real function. */
534#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 586#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
535 587
536/* Each buffer in the virtqueues is actually a chain of descriptors. This 588/*
589 * Each buffer in the virtqueues is actually a chain of descriptors. This
537 * function returns the next descriptor in the chain, or vq->vring.num if we're 590 * function returns the next descriptor in the chain, or vq->vring.num if we're
538 * at the end. */ 591 * at the end.
592 */
539static unsigned next_desc(struct vring_desc *desc, 593static unsigned next_desc(struct vring_desc *desc,
540 unsigned int i, unsigned int max) 594 unsigned int i, unsigned int max)
541{ 595{
@@ -556,7 +610,10 @@ static unsigned next_desc(struct vring_desc *desc,
556 return next; 610 return next;
557} 611}
558 612
559/* This actually sends the interrupt for this virtqueue */ 613/*
614 * This actually sends the interrupt for this virtqueue, if we've used a
615 * buffer.
616 */
560static void trigger_irq(struct virtqueue *vq) 617static void trigger_irq(struct virtqueue *vq)
561{ 618{
562 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; 619 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
@@ -576,12 +633,14 @@ static void trigger_irq(struct virtqueue *vq)
576 err(1, "Triggering irq %i", vq->config.irq); 633 err(1, "Triggering irq %i", vq->config.irq);
577} 634}
578 635
579/* This looks in the virtqueue and for the first available buffer, and converts 636/*
637 * This looks in the virtqueue for the first available buffer, and converts
580 * it to an iovec for convenient access. Since descriptors consist of some 638 * it to an iovec for convenient access. Since descriptors consist of some
581 * number of output then some number of input descriptors, it's actually two 639 * number of output then some number of input descriptors, it's actually two
582 * iovecs, but we pack them into one and note how many of each there were. 640 * iovecs, but we pack them into one and note how many of each there were.
583 * 641 *
584 * This function returns the descriptor number found. */ 642 * This function waits if necessary, and returns the descriptor number found.
643 */
585static unsigned wait_for_vq_desc(struct virtqueue *vq, 644static unsigned wait_for_vq_desc(struct virtqueue *vq,
586 struct iovec iov[], 645 struct iovec iov[],
587 unsigned int *out_num, unsigned int *in_num) 646 unsigned int *out_num, unsigned int *in_num)
@@ -590,17 +649,23 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
590 struct vring_desc *desc; 649 struct vring_desc *desc;
591 u16 last_avail = lg_last_avail(vq); 650 u16 last_avail = lg_last_avail(vq);
592 651
652 /* There's nothing available? */
593 while (last_avail == vq->vring.avail->idx) { 653 while (last_avail == vq->vring.avail->idx) {
594 u64 event; 654 u64 event;
595 655
596 /* OK, tell Guest about progress up to now. */ 656 /*
657 * Since we're about to sleep, now is a good time to tell the
658 * Guest about what we've used up to now.
659 */
597 trigger_irq(vq); 660 trigger_irq(vq);
598 661
599 /* OK, now we need to know about added descriptors. */ 662 /* OK, now we need to know about added descriptors. */
600 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; 663 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
601 664
602 /* They could have slipped one in as we were doing that: make 665 /*
603 * sure it's written, then check again. */ 666 * They could have slipped one in as we were doing that: make
667 * sure it's written, then check again.
668 */
604 mb(); 669 mb();
605 if (last_avail != vq->vring.avail->idx) { 670 if (last_avail != vq->vring.avail->idx) {
606 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; 671 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
@@ -620,8 +685,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
620 errx(1, "Guest moved used index from %u to %u", 685 errx(1, "Guest moved used index from %u to %u",
621 last_avail, vq->vring.avail->idx); 686 last_avail, vq->vring.avail->idx);
622 687
623 /* Grab the next descriptor number they're advertising, and increment 688 /*
624 * the index we've seen. */ 689 * Grab the next descriptor number they're advertising, and increment
690 * the index we've seen.
691 */
625 head = vq->vring.avail->ring[last_avail % vq->vring.num]; 692 head = vq->vring.avail->ring[last_avail % vq->vring.num];
626 lg_last_avail(vq)++; 693 lg_last_avail(vq)++;
627 694
@@ -636,8 +703,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
636 desc = vq->vring.desc; 703 desc = vq->vring.desc;
637 i = head; 704 i = head;
638 705
639 /* If this is an indirect entry, then this buffer contains a descriptor 706 /*
640 * table which we handle as if it's any normal descriptor chain. */ 707 * If this is an indirect entry, then this buffer contains a descriptor
708 * table which we handle as if it's any normal descriptor chain.
709 */
641 if (desc[i].flags & VRING_DESC_F_INDIRECT) { 710 if (desc[i].flags & VRING_DESC_F_INDIRECT) {
642 if (desc[i].len % sizeof(struct vring_desc)) 711 if (desc[i].len % sizeof(struct vring_desc))
643 errx(1, "Invalid size for indirect buffer table"); 712 errx(1, "Invalid size for indirect buffer table");
@@ -656,8 +725,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
656 if (desc[i].flags & VRING_DESC_F_WRITE) 725 if (desc[i].flags & VRING_DESC_F_WRITE)
657 (*in_num)++; 726 (*in_num)++;
658 else { 727 else {
659 /* If it's an output descriptor, they're all supposed 728 /*
660 * to come before any input descriptors. */ 729 * If it's an output descriptor, they're all supposed
730 * to come before any input descriptors.
731 */
661 if (*in_num) 732 if (*in_num)
662 errx(1, "Descriptor has out after in"); 733 errx(1, "Descriptor has out after in");
663 (*out_num)++; 734 (*out_num)++;
@@ -671,14 +742,19 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
671 return head; 742 return head;
672} 743}
673 744
674/* After we've used one of their buffers, we tell them about it. We'll then 745/*
675 * want to send them an interrupt, using trigger_irq(). */ 746 * After we've used one of their buffers, we tell the Guest about it. Sometime
747 * later we'll want to send them an interrupt using trigger_irq(); note that
748 * wait_for_vq_desc() does that for us if it has to wait.
749 */
676static void add_used(struct virtqueue *vq, unsigned int head, int len) 750static void add_used(struct virtqueue *vq, unsigned int head, int len)
677{ 751{
678 struct vring_used_elem *used; 752 struct vring_used_elem *used;
679 753
680 /* The virtqueue contains a ring of used buffers. Get a pointer to the 754 /*
681 * next entry in that used ring. */ 755 * The virtqueue contains a ring of used buffers. Get a pointer to the
756 * next entry in that used ring.
757 */
682 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; 758 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
683 used->id = head; 759 used->id = head;
684 used->len = len; 760 used->len = len;
@@ -698,9 +774,9 @@ static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
698/* 774/*
699 * The Console 775 * The Console
700 * 776 *
701 * We associate some data with the console for our exit hack. */ 777 * We associate some data with the console for our exit hack.
702struct console_abort 778 */
703{ 779struct console_abort {
704 /* How many times have they hit ^C? */ 780 /* How many times have they hit ^C? */
705 int count; 781 int count;
706 /* When did they start? */ 782 /* When did they start? */
@@ -715,30 +791,35 @@ static void console_input(struct virtqueue *vq)
715 struct console_abort *abort = vq->dev->priv; 791 struct console_abort *abort = vq->dev->priv;
716 struct iovec iov[vq->vring.num]; 792 struct iovec iov[vq->vring.num];
717 793
718 /* Make sure there's a descriptor waiting. */ 794 /* Make sure there's a descriptor available. */
719 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 795 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
720 if (out_num) 796 if (out_num)
721 errx(1, "Output buffers in console in queue?"); 797 errx(1, "Output buffers in console in queue?");
722 798
723 /* Read it in. */ 799 /* Read into it. This is where we usually wait. */
724 len = readv(STDIN_FILENO, iov, in_num); 800 len = readv(STDIN_FILENO, iov, in_num);
725 if (len <= 0) { 801 if (len <= 0) {
726 /* Ran out of input? */ 802 /* Ran out of input? */
727 warnx("Failed to get console input, ignoring console."); 803 warnx("Failed to get console input, ignoring console.");
728 /* For simplicity, dying threads kill the whole Launcher. So 804 /*
729 * just nap here. */ 805 * For simplicity, dying threads kill the whole Launcher. So
806 * just nap here.
807 */
730 for (;;) 808 for (;;)
731 pause(); 809 pause();
732 } 810 }
733 811
812 /* Tell the Guest we used a buffer. */
734 add_used_and_trigger(vq, head, len); 813 add_used_and_trigger(vq, head, len);
735 814
736 /* Three ^C within one second? Exit. 815 /*
816 * Three ^C within one second? Exit.
737 * 817 *
738 * This is such a hack, but works surprisingly well. Each ^C has to 818 * This is such a hack, but works surprisingly well. Each ^C has to
739 * be in a buffer by itself, so they can't be too fast. But we check 819 * be in a buffer by itself, so they can't be too fast. But we check
740 * that we get three within about a second, so they can't be too 820 * that we get three within about a second, so they can't be too
741 * slow. */ 821 * slow.
822 */
742 if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { 823 if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
743 abort->count = 0; 824 abort->count = 0;
744 return; 825 return;
@@ -763,15 +844,23 @@ static void console_output(struct virtqueue *vq)
763 unsigned int head, out, in; 844 unsigned int head, out, in;
764 struct iovec iov[vq->vring.num]; 845 struct iovec iov[vq->vring.num];
765 846
847 /* We usually wait in here, for the Guest to give us something. */
766 head = wait_for_vq_desc(vq, iov, &out, &in); 848 head = wait_for_vq_desc(vq, iov, &out, &in);
767 if (in) 849 if (in)
768 errx(1, "Input buffers in console output queue?"); 850 errx(1, "Input buffers in console output queue?");
851
852 /* writev can return a partial write, so we loop here. */
769 while (!iov_empty(iov, out)) { 853 while (!iov_empty(iov, out)) {
770 int len = writev(STDOUT_FILENO, iov, out); 854 int len = writev(STDOUT_FILENO, iov, out);
771 if (len <= 0) 855 if (len <= 0)
772 err(1, "Write to stdout gave %i", len); 856 err(1, "Write to stdout gave %i", len);
773 iov_consume(iov, out, len); 857 iov_consume(iov, out, len);
774 } 858 }
859
860 /*
861 * We're finished with that buffer: if we're going to sleep,
862 * wait_for_vq_desc() will prod the Guest with an interrupt.
863 */
775 add_used(vq, head, 0); 864 add_used(vq, head, 0);
776} 865}
777 866
@@ -791,15 +880,30 @@ static void net_output(struct virtqueue *vq)
791 unsigned int head, out, in; 880 unsigned int head, out, in;
792 struct iovec iov[vq->vring.num]; 881 struct iovec iov[vq->vring.num];
793 882
883 /* We usually wait in here for the Guest to give us a packet. */
794 head = wait_for_vq_desc(vq, iov, &out, &in); 884 head = wait_for_vq_desc(vq, iov, &out, &in);
795 if (in) 885 if (in)
796 errx(1, "Input buffers in net output queue?"); 886 errx(1, "Input buffers in net output queue?");
887 /*
888 * Send the whole thing through to /dev/net/tun. It expects the exact
889 * same format: what a coincidence!
890 */
797 if (writev(net_info->tunfd, iov, out) < 0) 891 if (writev(net_info->tunfd, iov, out) < 0)
798 errx(1, "Write to tun failed?"); 892 errx(1, "Write to tun failed?");
893
894 /*
895 * Done with that one; wait_for_vq_desc() will send the interrupt if
896 * all packets are processed.
897 */
799 add_used(vq, head, 0); 898 add_used(vq, head, 0);
800} 899}
801 900
802/* Will reading from this file descriptor block? */ 901/*
902 * Handling network input is a bit trickier, because I've tried to optimize it.
903 *
904 * First we have a helper routine which tells is if from this file descriptor
905 * (ie. the /dev/net/tun device) will block:
906 */
803static bool will_block(int fd) 907static bool will_block(int fd)
804{ 908{
805 fd_set fdset; 909 fd_set fdset;
@@ -809,8 +913,11 @@ static bool will_block(int fd)
809 return select(fd+1, &fdset, NULL, NULL, &zero) != 1; 913 return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
810} 914}
811 915
812/* This is where we handle packets coming in from the tun device to our 916/*
813 * Guest. */ 917 * This handles packets coming in from the tun device to our Guest. Like all
918 * service routines, it gets called again as soon as it returns, so you don't
919 * see a while(1) loop here.
920 */
814static void net_input(struct virtqueue *vq) 921static void net_input(struct virtqueue *vq)
815{ 922{
816 int len; 923 int len;
@@ -818,21 +925,38 @@ static void net_input(struct virtqueue *vq)
818 struct iovec iov[vq->vring.num]; 925 struct iovec iov[vq->vring.num];
819 struct net_info *net_info = vq->dev->priv; 926 struct net_info *net_info = vq->dev->priv;
820 927
928 /*
929 * Get a descriptor to write an incoming packet into. This will also
930 * send an interrupt if they're out of descriptors.
931 */
821 head = wait_for_vq_desc(vq, iov, &out, &in); 932 head = wait_for_vq_desc(vq, iov, &out, &in);
822 if (out) 933 if (out)
823 errx(1, "Output buffers in net input queue?"); 934 errx(1, "Output buffers in net input queue?");
824 935
825 /* Deliver interrupt now, since we're about to sleep. */ 936 /*
937 * If it looks like we'll block reading from the tun device, send them
938 * an interrupt.
939 */
826 if (vq->pending_used && will_block(net_info->tunfd)) 940 if (vq->pending_used && will_block(net_info->tunfd))
827 trigger_irq(vq); 941 trigger_irq(vq);
828 942
943 /*
944 * Read in the packet. This is where we normally wait (when there's no
945 * incoming network traffic).
946 */
829 len = readv(net_info->tunfd, iov, in); 947 len = readv(net_info->tunfd, iov, in);
830 if (len <= 0) 948 if (len <= 0)
831 err(1, "Failed to read from tun."); 949 err(1, "Failed to read from tun.");
950
951 /*
952 * Mark that packet buffer as used, but don't interrupt here. We want
953 * to wait until we've done as much work as we can.
954 */
832 add_used(vq, head, len); 955 add_used(vq, head, len);
833} 956}
957/*:*/
834 958
835/* This is the helper to create threads. */ 959/* This is the helper to create threads: run the service routine in a loop. */
836static int do_thread(void *_vq) 960static int do_thread(void *_vq)
837{ 961{
838 struct virtqueue *vq = _vq; 962 struct virtqueue *vq = _vq;
@@ -842,8 +966,10 @@ static int do_thread(void *_vq)
842 return 0; 966 return 0;
843} 967}
844 968
845/* When a child dies, we kill our entire process group with SIGTERM. This 969/*
846 * also has the side effect that the shell restores the console for us! */ 970 * When a child dies, we kill our entire process group with SIGTERM. This
971 * also has the side effect that the shell restores the console for us!
972 */
847static void kill_launcher(int signal) 973static void kill_launcher(int signal)
848{ 974{
849 kill(0, SIGTERM); 975 kill(0, SIGTERM);
@@ -878,11 +1004,15 @@ static void reset_device(struct device *dev)
878 signal(SIGCHLD, (void *)kill_launcher); 1004 signal(SIGCHLD, (void *)kill_launcher);
879} 1005}
880 1006
1007/*L:216
1008 * This actually creates the thread which services the virtqueue for a device.
1009 */
881static void create_thread(struct virtqueue *vq) 1010static void create_thread(struct virtqueue *vq)
882{ 1011{
883 /* Create stack for thread and run it. Since stack grows 1012 /*
884 * upwards, we point the stack pointer to the end of this 1013 * Create stack for thread. Since the stack grows upwards, we point
885 * region. */ 1014 * the stack pointer to the end of this region.
1015 */
886 char *stack = malloc(32768); 1016 char *stack = malloc(32768);
887 unsigned long args[] = { LHREQ_EVENTFD, 1017 unsigned long args[] = { LHREQ_EVENTFD,
888 vq->config.pfn*getpagesize(), 0 }; 1018 vq->config.pfn*getpagesize(), 0 };
@@ -893,17 +1023,22 @@ static void create_thread(struct virtqueue *vq)
893 err(1, "Creating eventfd"); 1023 err(1, "Creating eventfd");
894 args[2] = vq->eventfd; 1024 args[2] = vq->eventfd;
895 1025
896 /* Attach an eventfd to this virtqueue: it will go off 1026 /*
897 * when the Guest does an LHCALL_NOTIFY for this vq. */ 1027 * Attach an eventfd to this virtqueue: it will go off when the Guest
1028 * does an LHCALL_NOTIFY for this vq.
1029 */
898 if (write(lguest_fd, &args, sizeof(args)) != 0) 1030 if (write(lguest_fd, &args, sizeof(args)) != 0)
899 err(1, "Attaching eventfd"); 1031 err(1, "Attaching eventfd");
900 1032
901 /* CLONE_VM: because it has to access the Guest memory, and 1033 /*
902 * SIGCHLD so we get a signal if it dies. */ 1034 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
1035 * we get a signal if it dies.
1036 */
903 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); 1037 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
904 if (vq->thread == (pid_t)-1) 1038 if (vq->thread == (pid_t)-1)
905 err(1, "Creating clone"); 1039 err(1, "Creating clone");
906 /* We close our local copy, now the child has it. */ 1040
1041 /* We close our local copy now the child has it. */
907 close(vq->eventfd); 1042 close(vq->eventfd);
908} 1043}
909 1044
@@ -955,7 +1090,10 @@ static void update_device_status(struct device *dev)
955 } 1090 }
956} 1091}
957 1092
958/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ 1093/*L:215
1094 * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In
1095 * particular, it's used to notify us of device status changes during boot.
1096 */
959static void handle_output(unsigned long addr) 1097static void handle_output(unsigned long addr)
960{ 1098{
961 struct device *i; 1099 struct device *i;
@@ -964,25 +1102,42 @@ static void handle_output(unsigned long addr)
964 for (i = devices.dev; i; i = i->next) { 1102 for (i = devices.dev; i; i = i->next) {
965 struct virtqueue *vq; 1103 struct virtqueue *vq;
966 1104
967 /* Notifications to device descriptors update device status. */ 1105 /*
1106 * Notifications to device descriptors mean they updated the
1107 * device status.
1108 */
968 if (from_guest_phys(addr) == i->desc) { 1109 if (from_guest_phys(addr) == i->desc) {
969 update_device_status(i); 1110 update_device_status(i);
970 return; 1111 return;
971 } 1112 }
972 1113
973 /* Devices *can* be used before status is set to DRIVER_OK. */ 1114 /*
1115 * Devices *can* be used before status is set to DRIVER_OK.
1116 * The original plan was that they would never do this: they
1117 * would always finish setting up their status bits before
1118 * actually touching the virtqueues. In practice, we allowed
1119 * them to, and they do (eg. the disk probes for partition
1120 * tables as part of initialization).
1121 *
1122 * If we see this, we start the device: once it's running, we
1123 * expect the device to catch all the notifications.
1124 */
974 for (vq = i->vq; vq; vq = vq->next) { 1125 for (vq = i->vq; vq; vq = vq->next) {
975 if (addr != vq->config.pfn*getpagesize()) 1126 if (addr != vq->config.pfn*getpagesize())
976 continue; 1127 continue;
977 if (i->running) 1128 if (i->running)
978 errx(1, "Notification on running %s", i->name); 1129 errx(1, "Notification on running %s", i->name);
1130 /* This just calls create_thread() for each virtqueue */
979 start_device(i); 1131 start_device(i);
980 return; 1132 return;
981 } 1133 }
982 } 1134 }
983 1135
984 /* Early console write is done using notify on a nul-terminated string 1136 /*
985 * in Guest memory. */ 1137 * Early console write is done using notify on a nul-terminated string
1138 * in Guest memory. It's also great for hacking debugging messages
1139 * into a Guest.
1140 */
986 if (addr >= guest_limit) 1141 if (addr >= guest_limit)
987 errx(1, "Bad NOTIFY %#lx", addr); 1142 errx(1, "Bad NOTIFY %#lx", addr);
988 1143
@@ -998,10 +1153,12 @@ static void handle_output(unsigned long addr)
998 * routines to allocate and manage them. 1153 * routines to allocate and manage them.
999 */ 1154 */
1000 1155
1001/* The layout of the device page is a "struct lguest_device_desc" followed by a 1156/*
1157 * The layout of the device page is a "struct lguest_device_desc" followed by a
1002 * number of virtqueue descriptors, then two sets of feature bits, then an 1158 * number of virtqueue descriptors, then two sets of feature bits, then an
1003 * array of configuration bytes. This routine returns the configuration 1159 * array of configuration bytes. This routine returns the configuration
1004 * pointer. */ 1160 * pointer.
1161 */
1005static u8 *device_config(const struct device *dev) 1162static u8 *device_config(const struct device *dev)
1006{ 1163{
1007 return (void *)(dev->desc + 1) 1164 return (void *)(dev->desc + 1)
@@ -1009,9 +1166,11 @@ static u8 *device_config(const struct device *dev)
1009 + dev->feature_len * 2; 1166 + dev->feature_len * 2;
1010} 1167}
1011 1168
1012/* This routine allocates a new "struct lguest_device_desc" from descriptor 1169/*
1170 * This routine allocates a new "struct lguest_device_desc" from descriptor
1013 * table page just above the Guest's normal memory. It returns a pointer to 1171 * table page just above the Guest's normal memory. It returns a pointer to
1014 * that descriptor. */ 1172 * that descriptor.
1173 */
1015static struct lguest_device_desc *new_dev_desc(u16 type) 1174static struct lguest_device_desc *new_dev_desc(u16 type)
1016{ 1175{
1017 struct lguest_device_desc d = { .type = type }; 1176 struct lguest_device_desc d = { .type = type };
@@ -1032,8 +1191,10 @@ static struct lguest_device_desc *new_dev_desc(u16 type)
1032 return memcpy(p, &d, sizeof(d)); 1191 return memcpy(p, &d, sizeof(d));
1033} 1192}
1034 1193
1035/* Each device descriptor is followed by the description of its virtqueues. We 1194/*
1036 * specify how many descriptors the virtqueue is to have. */ 1195 * Each device descriptor is followed by the description of its virtqueues. We
1196 * specify how many descriptors the virtqueue is to have.
1197 */
1037static void add_virtqueue(struct device *dev, unsigned int num_descs, 1198static void add_virtqueue(struct device *dev, unsigned int num_descs,
1038 void (*service)(struct virtqueue *)) 1199 void (*service)(struct virtqueue *))
1039{ 1200{
@@ -1050,6 +1211,11 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1050 vq->next = NULL; 1211 vq->next = NULL;
1051 vq->last_avail_idx = 0; 1212 vq->last_avail_idx = 0;
1052 vq->dev = dev; 1213 vq->dev = dev;
1214
1215 /*
1216 * This is the routine the service thread will run, and its Process ID
1217 * once it's running.
1218 */
1053 vq->service = service; 1219 vq->service = service;
1054 vq->thread = (pid_t)-1; 1220 vq->thread = (pid_t)-1;
1055 1221
@@ -1061,10 +1227,12 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1061 /* Initialize the vring. */ 1227 /* Initialize the vring. */
1062 vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN); 1228 vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN);
1063 1229
1064 /* Append virtqueue to this device's descriptor. We use 1230 /*
1231 * Append virtqueue to this device's descriptor. We use
1065 * device_config() to get the end of the device's current virtqueues; 1232 * device_config() to get the end of the device's current virtqueues;
1066 * we check that we haven't added any config or feature information 1233 * we check that we haven't added any config or feature information
1067 * yet, otherwise we'd be overwriting them. */ 1234 * yet, otherwise we'd be overwriting them.
1235 */
1068 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); 1236 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
1069 memcpy(device_config(dev), &vq->config, sizeof(vq->config)); 1237 memcpy(device_config(dev), &vq->config, sizeof(vq->config));
1070 dev->num_vq++; 1238 dev->num_vq++;
@@ -1072,14 +1240,18 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1072 1240
1073 verbose("Virtqueue page %#lx\n", to_guest_phys(p)); 1241 verbose("Virtqueue page %#lx\n", to_guest_phys(p));
1074 1242
1075 /* Add to tail of list, so dev->vq is first vq, dev->vq->next is 1243 /*
1076 * second. */ 1244 * Add to tail of list, so dev->vq is first vq, dev->vq->next is
1245 * second.
1246 */
1077 for (i = &dev->vq; *i; i = &(*i)->next); 1247 for (i = &dev->vq; *i; i = &(*i)->next);
1078 *i = vq; 1248 *i = vq;
1079} 1249}
1080 1250
1081/* The first half of the feature bitmask is for us to advertise features. The 1251/*
1082 * second half is for the Guest to accept features. */ 1252 * The first half of the feature bitmask is for us to advertise features. The
1253 * second half is for the Guest to accept features.
1254 */
1083static void add_feature(struct device *dev, unsigned bit) 1255static void add_feature(struct device *dev, unsigned bit)
1084{ 1256{
1085 u8 *features = get_feature_bits(dev); 1257 u8 *features = get_feature_bits(dev);
@@ -1093,9 +1265,11 @@ static void add_feature(struct device *dev, unsigned bit)
1093 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); 1265 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
1094} 1266}
1095 1267
1096/* This routine sets the configuration fields for an existing device's 1268/*
1269 * This routine sets the configuration fields for an existing device's
1097 * descriptor. It only works for the last device, but that's OK because that's 1270 * descriptor. It only works for the last device, but that's OK because that's
1098 * how we use it. */ 1271 * how we use it.
1272 */
1099static void set_config(struct device *dev, unsigned len, const void *conf) 1273static void set_config(struct device *dev, unsigned len, const void *conf)
1100{ 1274{
1101 /* Check we haven't overflowed our single page. */ 1275 /* Check we haven't overflowed our single page. */
@@ -1105,12 +1279,18 @@ static void set_config(struct device *dev, unsigned len, const void *conf)
1105 /* Copy in the config information, and store the length. */ 1279 /* Copy in the config information, and store the length. */
1106 memcpy(device_config(dev), conf, len); 1280 memcpy(device_config(dev), conf, len);
1107 dev->desc->config_len = len; 1281 dev->desc->config_len = len;
1282
1283 /* Size must fit in config_len field (8 bits)! */
1284 assert(dev->desc->config_len == len);
1108} 1285}
1109 1286
1110/* This routine does all the creation and setup of a new device, including 1287/*
1111 * calling new_dev_desc() to allocate the descriptor and device memory. 1288 * This routine does all the creation and setup of a new device, including
1289 * calling new_dev_desc() to allocate the descriptor and device memory. We
1290 * don't actually start the service threads until later.
1112 * 1291 *
1113 * See what I mean about userspace being boring? */ 1292 * See what I mean about userspace being boring?
1293 */
1114static struct device *new_device(const char *name, u16 type) 1294static struct device *new_device(const char *name, u16 type)
1115{ 1295{
1116 struct device *dev = malloc(sizeof(*dev)); 1296 struct device *dev = malloc(sizeof(*dev));
@@ -1123,10 +1303,12 @@ static struct device *new_device(const char *name, u16 type)
1123 dev->num_vq = 0; 1303 dev->num_vq = 0;
1124 dev->running = false; 1304 dev->running = false;
1125 1305
1126 /* Append to device list. Prepending to a single-linked list is 1306 /*
1307 * Append to device list. Prepending to a single-linked list is
1127 * easier, but the user expects the devices to be arranged on the bus 1308 * easier, but the user expects the devices to be arranged on the bus
1128 * in command-line order. The first network device on the command line 1309 * in command-line order. The first network device on the command line
1129 * is eth0, the first block device /dev/vda, etc. */ 1310 * is eth0, the first block device /dev/vda, etc.
1311 */
1130 if (devices.lastdev) 1312 if (devices.lastdev)
1131 devices.lastdev->next = dev; 1313 devices.lastdev->next = dev;
1132 else 1314 else
@@ -1136,8 +1318,10 @@ static struct device *new_device(const char *name, u16 type)
1136 return dev; 1318 return dev;
1137} 1319}
1138 1320
1139/* Our first setup routine is the console. It's a fairly simple device, but 1321/*
1140 * UNIX tty handling makes it uglier than it could be. */ 1322 * Our first setup routine is the console. It's a fairly simple device, but
1323 * UNIX tty handling makes it uglier than it could be.
1324 */
1141static void setup_console(void) 1325static void setup_console(void)
1142{ 1326{
1143 struct device *dev; 1327 struct device *dev;
@@ -1145,8 +1329,10 @@ static void setup_console(void)
1145 /* If we can save the initial standard input settings... */ 1329 /* If we can save the initial standard input settings... */
1146 if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { 1330 if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
1147 struct termios term = orig_term; 1331 struct termios term = orig_term;
1148 /* Then we turn off echo, line buffering and ^C etc. We want a 1332 /*
1149 * raw input stream to the Guest. */ 1333 * Then we turn off echo, line buffering and ^C etc: We want a
1334 * raw input stream to the Guest.
1335 */
1150 term.c_lflag &= ~(ISIG|ICANON|ECHO); 1336 term.c_lflag &= ~(ISIG|ICANON|ECHO);
1151 tcsetattr(STDIN_FILENO, TCSANOW, &term); 1337 tcsetattr(STDIN_FILENO, TCSANOW, &term);
1152 } 1338 }
@@ -1157,10 +1343,12 @@ static void setup_console(void)
1157 dev->priv = malloc(sizeof(struct console_abort)); 1343 dev->priv = malloc(sizeof(struct console_abort));
1158 ((struct console_abort *)dev->priv)->count = 0; 1344 ((struct console_abort *)dev->priv)->count = 0;
1159 1345
1160 /* The console needs two virtqueues: the input then the output. When 1346 /*
1347 * The console needs two virtqueues: the input then the output. When
1161 * they put something the input queue, we make sure we're listening to 1348 * they put something the input queue, we make sure we're listening to
1162 * stdin. When they put something in the output queue, we write it to 1349 * stdin. When they put something in the output queue, we write it to
1163 * stdout. */ 1350 * stdout.
1351 */
1164 add_virtqueue(dev, VIRTQUEUE_NUM, console_input); 1352 add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
1165 add_virtqueue(dev, VIRTQUEUE_NUM, console_output); 1353 add_virtqueue(dev, VIRTQUEUE_NUM, console_output);
1166 1354
@@ -1168,7 +1356,8 @@ static void setup_console(void)
1168} 1356}
1169/*:*/ 1357/*:*/
1170 1358
1171/*M:010 Inter-guest networking is an interesting area. Simplest is to have a 1359/*M:010
1360 * Inter-guest networking is an interesting area. Simplest is to have a
1172 * --sharenet=<name> option which opens or creates a named pipe. This can be 1361 * --sharenet=<name> option which opens or creates a named pipe. This can be
1173 * used to send packets to another guest in a 1:1 manner. 1362 * used to send packets to another guest in a 1:1 manner.
1174 * 1363 *
@@ -1182,7 +1371,8 @@ static void setup_console(void)
1182 * multiple inter-guest channels behind one interface, although it would 1371 * multiple inter-guest channels behind one interface, although it would
1183 * require some manner of hotplugging new virtio channels. 1372 * require some manner of hotplugging new virtio channels.
1184 * 1373 *
1185 * Finally, we could implement a virtio network switch in the kernel. :*/ 1374 * Finally, we could implement a virtio network switch in the kernel.
1375:*/
1186 1376
1187static u32 str2ip(const char *ipaddr) 1377static u32 str2ip(const char *ipaddr)
1188{ 1378{
@@ -1207,11 +1397,13 @@ static void str2mac(const char *macaddr, unsigned char mac[6])
1207 mac[5] = m[5]; 1397 mac[5] = m[5];
1208} 1398}
1209 1399
1210/* This code is "adapted" from libbridge: it attaches the Host end of the 1400/*
1401 * This code is "adapted" from libbridge: it attaches the Host end of the
1211 * network device to the bridge device specified by the command line. 1402 * network device to the bridge device specified by the command line.
1212 * 1403 *
1213 * This is yet another James Morris contribution (I'm an IP-level guy, so I 1404 * This is yet another James Morris contribution (I'm an IP-level guy, so I
1214 * dislike bridging), and I just try not to break it. */ 1405 * dislike bridging), and I just try not to break it.
1406 */
1215static void add_to_bridge(int fd, const char *if_name, const char *br_name) 1407static void add_to_bridge(int fd, const char *if_name, const char *br_name)
1216{ 1408{
1217 int ifidx; 1409 int ifidx;
@@ -1231,9 +1423,11 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name)
1231 err(1, "can't add %s to bridge %s", if_name, br_name); 1423 err(1, "can't add %s to bridge %s", if_name, br_name);
1232} 1424}
1233 1425
1234/* This sets up the Host end of the network device with an IP address, brings 1426/*
1427 * This sets up the Host end of the network device with an IP address, brings
1235 * it up so packets will flow, the copies the MAC address into the hwaddr 1428 * it up so packets will flow, the copies the MAC address into the hwaddr
1236 * pointer. */ 1429 * pointer.
1430 */
1237static void configure_device(int fd, const char *tapif, u32 ipaddr) 1431static void configure_device(int fd, const char *tapif, u32 ipaddr)
1238{ 1432{
1239 struct ifreq ifr; 1433 struct ifreq ifr;
@@ -1260,10 +1454,12 @@ static int get_tun_device(char tapif[IFNAMSIZ])
1260 /* Start with this zeroed. Messy but sure. */ 1454 /* Start with this zeroed. Messy but sure. */
1261 memset(&ifr, 0, sizeof(ifr)); 1455 memset(&ifr, 0, sizeof(ifr));
1262 1456
1263 /* We open the /dev/net/tun device and tell it we want a tap device. A 1457 /*
1458 * We open the /dev/net/tun device and tell it we want a tap device. A
1264 * tap device is like a tun device, only somehow different. To tell 1459 * tap device is like a tun device, only somehow different. To tell
1265 * the truth, I completely blundered my way through this code, but it 1460 * the truth, I completely blundered my way through this code, but it
1266 * works now! */ 1461 * works now!
1462 */
1267 netfd = open_or_die("/dev/net/tun", O_RDWR); 1463 netfd = open_or_die("/dev/net/tun", O_RDWR);
1268 ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; 1464 ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
1269 strcpy(ifr.ifr_name, "tap%d"); 1465 strcpy(ifr.ifr_name, "tap%d");
@@ -1274,18 +1470,22 @@ static int get_tun_device(char tapif[IFNAMSIZ])
1274 TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) 1470 TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
1275 err(1, "Could not set features for tun device"); 1471 err(1, "Could not set features for tun device");
1276 1472
1277 /* We don't need checksums calculated for packets coming in this 1473 /*
1278 * device: trust us! */ 1474 * We don't need checksums calculated for packets coming in this
1475 * device: trust us!
1476 */
1279 ioctl(netfd, TUNSETNOCSUM, 1); 1477 ioctl(netfd, TUNSETNOCSUM, 1);
1280 1478
1281 memcpy(tapif, ifr.ifr_name, IFNAMSIZ); 1479 memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
1282 return netfd; 1480 return netfd;
1283} 1481}
1284 1482
1285/*L:195 Our network is a Host<->Guest network. This can either use bridging or 1483/*L:195
1484 * Our network is a Host<->Guest network. This can either use bridging or
1286 * routing, but the principle is the same: it uses the "tun" device to inject 1485 * routing, but the principle is the same: it uses the "tun" device to inject
1287 * packets into the Host as if they came in from a normal network card. We 1486 * packets into the Host as if they came in from a normal network card. We
1288 * just shunt packets between the Guest and the tun device. */ 1487 * just shunt packets between the Guest and the tun device.
1488 */
1289static void setup_tun_net(char *arg) 1489static void setup_tun_net(char *arg)
1290{ 1490{
1291 struct device *dev; 1491 struct device *dev;
@@ -1302,13 +1502,14 @@ static void setup_tun_net(char *arg)
1302 dev = new_device("net", VIRTIO_ID_NET); 1502 dev = new_device("net", VIRTIO_ID_NET);
1303 dev->priv = net_info; 1503 dev->priv = net_info;
1304 1504
1305 /* Network devices need a receive and a send queue, just like 1505 /* Network devices need a recv and a send queue, just like console. */
1306 * console. */
1307 add_virtqueue(dev, VIRTQUEUE_NUM, net_input); 1506 add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
1308 add_virtqueue(dev, VIRTQUEUE_NUM, net_output); 1507 add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
1309 1508
1310 /* We need a socket to perform the magic network ioctls to bring up the 1509 /*
1311 * tap interface, connect to the bridge etc. Any socket will do! */ 1510 * We need a socket to perform the magic network ioctls to bring up the
1511 * tap interface, connect to the bridge etc. Any socket will do!
1512 */
1312 ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 1513 ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
1313 if (ipfd < 0) 1514 if (ipfd < 0)
1314 err(1, "opening IP socket"); 1515 err(1, "opening IP socket");
@@ -1362,39 +1563,31 @@ static void setup_tun_net(char *arg)
1362 verbose("device %u: tun %s: %s\n", 1563 verbose("device %u: tun %s: %s\n",
1363 devices.device_num, tapif, arg); 1564 devices.device_num, tapif, arg);
1364} 1565}
1365 1566/*:*/
1366/* Our block (disk) device should be really simple: the Guest asks for a block
1367 * number and we read or write that position in the file. Unfortunately, that
1368 * was amazingly slow: the Guest waits until the read is finished before
1369 * running anything else, even if it could have been doing useful work.
1370 *
1371 * We could use async I/O, except it's reputed to suck so hard that characters
1372 * actually go missing from your code when you try to use it.
1373 *
1374 * So we farm the I/O out to thread, and communicate with it via a pipe. */
1375 1567
1376/* This hangs off device->priv. */ 1568/* This hangs off device->priv. */
1377struct vblk_info 1569struct vblk_info {
1378{
1379 /* The size of the file. */ 1570 /* The size of the file. */
1380 off64_t len; 1571 off64_t len;
1381 1572
1382 /* The file descriptor for the file. */ 1573 /* The file descriptor for the file. */
1383 int fd; 1574 int fd;
1384 1575
1385 /* IO thread listens on this file descriptor [0]. */
1386 int workpipe[2];
1387
1388 /* IO thread writes to this file descriptor to mark it done, then
1389 * Launcher triggers interrupt to Guest. */
1390 int done_fd;
1391}; 1576};
1392 1577
1393/*L:210 1578/*L:210
1394 * The Disk 1579 * The Disk
1395 * 1580 *
1396 * Remember that the block device is handled by a separate I/O thread. We head 1581 * The disk only has one virtqueue, so it only has one thread. It is really
1397 * straight into the core of that thread here: 1582 * simple: the Guest asks for a block number and we read or write that position
1583 * in the file.
1584 *
1585 * Before we serviced each virtqueue in a separate thread, that was unacceptably
1586 * slow: the Guest waits until the read is finished before running anything
1587 * else, even if it could have been doing useful work.
1588 *
1589 * We could have used async I/O, except it's reputed to suck so hard that
1590 * characters actually go missing from your code when you try to use it.
1398 */ 1591 */
1399static void blk_request(struct virtqueue *vq) 1592static void blk_request(struct virtqueue *vq)
1400{ 1593{
@@ -1406,47 +1599,64 @@ static void blk_request(struct virtqueue *vq)
1406 struct iovec iov[vq->vring.num]; 1599 struct iovec iov[vq->vring.num];
1407 off64_t off; 1600 off64_t off;
1408 1601
1409 /* Get the next request. */ 1602 /*
1603 * Get the next request, where we normally wait. It triggers the
1604 * interrupt to acknowledge previously serviced requests (if any).
1605 */
1410 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 1606 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1411 1607
1412 /* Every block request should contain at least one output buffer 1608 /*
1609 * Every block request should contain at least one output buffer
1413 * (detailing the location on disk and the type of request) and one 1610 * (detailing the location on disk and the type of request) and one
1414 * input buffer (to hold the result). */ 1611 * input buffer (to hold the result).
1612 */
1415 if (out_num == 0 || in_num == 0) 1613 if (out_num == 0 || in_num == 0)
1416 errx(1, "Bad virtblk cmd %u out=%u in=%u", 1614 errx(1, "Bad virtblk cmd %u out=%u in=%u",
1417 head, out_num, in_num); 1615 head, out_num, in_num);
1418 1616
1419 out = convert(&iov[0], struct virtio_blk_outhdr); 1617 out = convert(&iov[0], struct virtio_blk_outhdr);
1420 in = convert(&iov[out_num+in_num-1], u8); 1618 in = convert(&iov[out_num+in_num-1], u8);
1619 /*
1620 * For historical reasons, block operations are expressed in 512 byte
1621 * "sectors".
1622 */
1421 off = out->sector * 512; 1623 off = out->sector * 512;
1422 1624
1423 /* The block device implements "barriers", where the Guest indicates 1625 /*
1626 * The block device implements "barriers", where the Guest indicates
1424 * that it wants all previous writes to occur before this write. We 1627 * that it wants all previous writes to occur before this write. We
1425 * don't have a way of asking our kernel to do a barrier, so we just 1628 * don't have a way of asking our kernel to do a barrier, so we just
1426 * synchronize all the data in the file. Pretty poor, no? */ 1629 * synchronize all the data in the file. Pretty poor, no?
1630 */
1427 if (out->type & VIRTIO_BLK_T_BARRIER) 1631 if (out->type & VIRTIO_BLK_T_BARRIER)
1428 fdatasync(vblk->fd); 1632 fdatasync(vblk->fd);
1429 1633
1430 /* In general the virtio block driver is allowed to try SCSI commands. 1634 /*
1431 * It'd be nice if we supported eject, for example, but we don't. */ 1635 * In general the virtio block driver is allowed to try SCSI commands.
1636 * It'd be nice if we supported eject, for example, but we don't.
1637 */
1432 if (out->type & VIRTIO_BLK_T_SCSI_CMD) { 1638 if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
1433 fprintf(stderr, "Scsi commands unsupported\n"); 1639 fprintf(stderr, "Scsi commands unsupported\n");
1434 *in = VIRTIO_BLK_S_UNSUPP; 1640 *in = VIRTIO_BLK_S_UNSUPP;
1435 wlen = sizeof(*in); 1641 wlen = sizeof(*in);
1436 } else if (out->type & VIRTIO_BLK_T_OUT) { 1642 } else if (out->type & VIRTIO_BLK_T_OUT) {
1437 /* Write */ 1643 /*
1438 1644 * Write
1439 /* Move to the right location in the block file. This can fail 1645 *
1440 * if they try to write past end. */ 1646 * Move to the right location in the block file. This can fail
1647 * if they try to write past end.
1648 */
1441 if (lseek64(vblk->fd, off, SEEK_SET) != off) 1649 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1442 err(1, "Bad seek to sector %llu", out->sector); 1650 err(1, "Bad seek to sector %llu", out->sector);
1443 1651
1444 ret = writev(vblk->fd, iov+1, out_num-1); 1652 ret = writev(vblk->fd, iov+1, out_num-1);
1445 verbose("WRITE to sector %llu: %i\n", out->sector, ret); 1653 verbose("WRITE to sector %llu: %i\n", out->sector, ret);
1446 1654
1447 /* Grr... Now we know how long the descriptor they sent was, we 1655 /*
1656 * Grr... Now we know how long the descriptor they sent was, we
1448 * make sure they didn't try to write over the end of the block 1657 * make sure they didn't try to write over the end of the block
1449 * file (possibly extending it). */ 1658 * file (possibly extending it).
1659 */
1450 if (ret > 0 && off + ret > vblk->len) { 1660 if (ret > 0 && off + ret > vblk->len) {
1451 /* Trim it back to the correct length */ 1661 /* Trim it back to the correct length */
1452 ftruncate64(vblk->fd, vblk->len); 1662 ftruncate64(vblk->fd, vblk->len);
@@ -1456,10 +1666,12 @@ static void blk_request(struct virtqueue *vq)
1456 wlen = sizeof(*in); 1666 wlen = sizeof(*in);
1457 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); 1667 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
1458 } else { 1668 } else {
1459 /* Read */ 1669 /*
1460 1670 * Read
1461 /* Move to the right location in the block file. This can fail 1671 *
1462 * if they try to read past end. */ 1672 * Move to the right location in the block file. This can fail
1673 * if they try to read past end.
1674 */
1463 if (lseek64(vblk->fd, off, SEEK_SET) != off) 1675 if (lseek64(vblk->fd, off, SEEK_SET) != off)
1464 err(1, "Bad seek to sector %llu", out->sector); 1676 err(1, "Bad seek to sector %llu", out->sector);
1465 1677
@@ -1474,13 +1686,16 @@ static void blk_request(struct virtqueue *vq)
1474 } 1686 }
1475 } 1687 }
1476 1688
1477 /* OK, so we noted that it was pretty poor to use an fdatasync as a 1689 /*
1690 * OK, so we noted that it was pretty poor to use an fdatasync as a
1478 * barrier. But Christoph Hellwig points out that we need a sync 1691 * barrier. But Christoph Hellwig points out that we need a sync
1479 * *afterwards* as well: "Barriers specify no reordering to the front 1692 * *afterwards* as well: "Barriers specify no reordering to the front
1480 * or the back." And Jens Axboe confirmed it, so here we are: */ 1693 * or the back." And Jens Axboe confirmed it, so here we are:
1694 */
1481 if (out->type & VIRTIO_BLK_T_BARRIER) 1695 if (out->type & VIRTIO_BLK_T_BARRIER)
1482 fdatasync(vblk->fd); 1696 fdatasync(vblk->fd);
1483 1697
1698 /* Finished that request. */
1484 add_used(vq, head, wlen); 1699 add_used(vq, head, wlen);
1485} 1700}
1486 1701
@@ -1491,7 +1706,7 @@ static void setup_block_file(const char *filename)
1491 struct vblk_info *vblk; 1706 struct vblk_info *vblk;
1492 struct virtio_blk_config conf; 1707 struct virtio_blk_config conf;
1493 1708
1494 /* The device responds to return from I/O thread. */ 1709 /* Creat the device. */
1495 dev = new_device("block", VIRTIO_ID_BLOCK); 1710 dev = new_device("block", VIRTIO_ID_BLOCK);
1496 1711
1497 /* The device has one virtqueue, where the Guest places requests. */ 1712 /* The device has one virtqueue, where the Guest places requests. */
@@ -1510,27 +1725,32 @@ static void setup_block_file(const char *filename)
1510 /* Tell Guest how many sectors this device has. */ 1725 /* Tell Guest how many sectors this device has. */
1511 conf.capacity = cpu_to_le64(vblk->len / 512); 1726 conf.capacity = cpu_to_le64(vblk->len / 512);
1512 1727
1513 /* Tell Guest not to put in too many descriptors at once: two are used 1728 /*
1514 * for the in and out elements. */ 1729 * Tell Guest not to put in too many descriptors at once: two are used
1730 * for the in and out elements.
1731 */
1515 add_feature(dev, VIRTIO_BLK_F_SEG_MAX); 1732 add_feature(dev, VIRTIO_BLK_F_SEG_MAX);
1516 conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); 1733 conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
1517 1734
1518 set_config(dev, sizeof(conf), &conf); 1735 /* Don't try to put whole struct: we have 8 bit limit. */
1736 set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf);
1519 1737
1520 verbose("device %u: virtblock %llu sectors\n", 1738 verbose("device %u: virtblock %llu sectors\n",
1521 ++devices.device_num, le64_to_cpu(conf.capacity)); 1739 ++devices.device_num, le64_to_cpu(conf.capacity));
1522} 1740}
1523 1741
1524struct rng_info { 1742/*L:211
1525 int rfd; 1743 * Our random number generator device reads from /dev/random into the Guest's
1526};
1527
1528/* Our random number generator device reads from /dev/random into the Guest's
1529 * input buffers. The usual case is that the Guest doesn't want random numbers 1744 * input buffers. The usual case is that the Guest doesn't want random numbers
1530 * and so has no buffers although /dev/random is still readable, whereas 1745 * and so has no buffers although /dev/random is still readable, whereas
1531 * console is the reverse. 1746 * console is the reverse.
1532 * 1747 *
1533 * The same logic applies, however. */ 1748 * The same logic applies, however.
1749 */
1750struct rng_info {
1751 int rfd;
1752};
1753
1534static void rng_input(struct virtqueue *vq) 1754static void rng_input(struct virtqueue *vq)
1535{ 1755{
1536 int len; 1756 int len;
@@ -1543,9 +1763,10 @@ static void rng_input(struct virtqueue *vq)
1543 if (out_num) 1763 if (out_num)
1544 errx(1, "Output buffers in rng?"); 1764 errx(1, "Output buffers in rng?");
1545 1765
1546 /* This is why we convert to iovecs: the readv() call uses them, and so 1766 /*
1547 * it reads straight into the Guest's buffer. We loop to make sure we 1767 * Just like the console write, we loop to cover the whole iovec.
1548 * fill it. */ 1768 * In this case, short reads actually happen quite a bit.
1769 */
1549 while (!iov_empty(iov, in_num)) { 1770 while (!iov_empty(iov, in_num)) {
1550 len = readv(rng_info->rfd, iov, in_num); 1771 len = readv(rng_info->rfd, iov, in_num);
1551 if (len <= 0) 1772 if (len <= 0)
@@ -1558,15 +1779,18 @@ static void rng_input(struct virtqueue *vq)
1558 add_used(vq, head, totlen); 1779 add_used(vq, head, totlen);
1559} 1780}
1560 1781
1561/* And this creates a "hardware" random number device for the Guest. */ 1782/*L:199
1783 * This creates a "hardware" random number device for the Guest.
1784 */
1562static void setup_rng(void) 1785static void setup_rng(void)
1563{ 1786{
1564 struct device *dev; 1787 struct device *dev;
1565 struct rng_info *rng_info = malloc(sizeof(*rng_info)); 1788 struct rng_info *rng_info = malloc(sizeof(*rng_info));
1566 1789
1790 /* Our device's privat info simply contains the /dev/random fd. */
1567 rng_info->rfd = open_or_die("/dev/random", O_RDONLY); 1791 rng_info->rfd = open_or_die("/dev/random", O_RDONLY);
1568 1792
1569 /* The device responds to return from I/O thread. */ 1793 /* Create the new device. */
1570 dev = new_device("rng", VIRTIO_ID_RNG); 1794 dev = new_device("rng", VIRTIO_ID_RNG);
1571 dev->priv = rng_info; 1795 dev->priv = rng_info;
1572 1796
@@ -1582,8 +1806,10 @@ static void __attribute__((noreturn)) restart_guest(void)
1582{ 1806{
1583 unsigned int i; 1807 unsigned int i;
1584 1808
1585 /* Since we don't track all open fds, we simply close everything beyond 1809 /*
1586 * stderr. */ 1810 * Since we don't track all open fds, we simply close everything beyond
1811 * stderr.
1812 */
1587 for (i = 3; i < FD_SETSIZE; i++) 1813 for (i = 3; i < FD_SETSIZE; i++)
1588 close(i); 1814 close(i);
1589 1815
@@ -1594,8 +1820,10 @@ static void __attribute__((noreturn)) restart_guest(void)
1594 err(1, "Could not exec %s", main_args[0]); 1820 err(1, "Could not exec %s", main_args[0]);
1595} 1821}
1596 1822
1597/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves 1823/*L:220
1598 * its input and output, and finally, lays it to rest. */ 1824 * Finally we reach the core of the Launcher which runs the Guest, serves
1825 * its input and output, and finally, lays it to rest.
1826 */
1599static void __attribute__((noreturn)) run_guest(void) 1827static void __attribute__((noreturn)) run_guest(void)
1600{ 1828{
1601 for (;;) { 1829 for (;;) {
@@ -1630,7 +1858,7 @@ static void __attribute__((noreturn)) run_guest(void)
1630 * 1858 *
1631 * Are you ready? Take a deep breath and join me in the core of the Host, in 1859 * Are you ready? Take a deep breath and join me in the core of the Host, in
1632 * "make Host". 1860 * "make Host".
1633 :*/ 1861:*/
1634 1862
1635static struct option opts[] = { 1863static struct option opts[] = {
1636 { "verbose", 0, NULL, 'v' }, 1864 { "verbose", 0, NULL, 'v' },
@@ -1651,8 +1879,7 @@ static void usage(void)
1651/*L:105 The main routine is where the real work begins: */ 1879/*L:105 The main routine is where the real work begins: */
1652int main(int argc, char *argv[]) 1880int main(int argc, char *argv[])
1653{ 1881{
1654 /* Memory, top-level pagetable, code startpoint and size of the 1882 /* Memory, code startpoint and size of the (optional) initrd. */
1655 * (optional) initrd. */
1656 unsigned long mem = 0, start, initrd_size = 0; 1883 unsigned long mem = 0, start, initrd_size = 0;
1657 /* Two temporaries. */ 1884 /* Two temporaries. */
1658 int i, c; 1885 int i, c;
@@ -1664,24 +1891,32 @@ int main(int argc, char *argv[])
1664 /* Save the args: we "reboot" by execing ourselves again. */ 1891 /* Save the args: we "reboot" by execing ourselves again. */
1665 main_args = argv; 1892 main_args = argv;
1666 1893
1667 /* First we initialize the device list. We keep a pointer to the last 1894 /*
1895 * First we initialize the device list. We keep a pointer to the last
1668 * device, and the next interrupt number to use for devices (1: 1896 * device, and the next interrupt number to use for devices (1:
1669 * remember that 0 is used by the timer). */ 1897 * remember that 0 is used by the timer).
1898 */
1670 devices.lastdev = NULL; 1899 devices.lastdev = NULL;
1671 devices.next_irq = 1; 1900 devices.next_irq = 1;
1672 1901
1902 /* We're CPU 0. In fact, that's the only CPU possible right now. */
1673 cpu_id = 0; 1903 cpu_id = 0;
1674 /* We need to know how much memory so we can set up the device 1904
1905 /*
1906 * We need to know how much memory so we can set up the device
1675 * descriptor and memory pages for the devices as we parse the command 1907 * descriptor and memory pages for the devices as we parse the command
1676 * line. So we quickly look through the arguments to find the amount 1908 * line. So we quickly look through the arguments to find the amount
1677 * of memory now. */ 1909 * of memory now.
1910 */
1678 for (i = 1; i < argc; i++) { 1911 for (i = 1; i < argc; i++) {
1679 if (argv[i][0] != '-') { 1912 if (argv[i][0] != '-') {
1680 mem = atoi(argv[i]) * 1024 * 1024; 1913 mem = atoi(argv[i]) * 1024 * 1024;
1681 /* We start by mapping anonymous pages over all of 1914 /*
1915 * We start by mapping anonymous pages over all of
1682 * guest-physical memory range. This fills it with 0, 1916 * guest-physical memory range. This fills it with 0,
1683 * and ensures that the Guest won't be killed when it 1917 * and ensures that the Guest won't be killed when it
1684 * tries to access it. */ 1918 * tries to access it.
1919 */
1685 guest_base = map_zeroed_pages(mem / getpagesize() 1920 guest_base = map_zeroed_pages(mem / getpagesize()
1686 + DEVICE_PAGES); 1921 + DEVICE_PAGES);
1687 guest_limit = mem; 1922 guest_limit = mem;
@@ -1714,8 +1949,10 @@ int main(int argc, char *argv[])
1714 usage(); 1949 usage();
1715 } 1950 }
1716 } 1951 }
1717 /* After the other arguments we expect memory and kernel image name, 1952 /*
1718 * followed by command line arguments for the kernel. */ 1953 * After the other arguments we expect memory and kernel image name,
1954 * followed by command line arguments for the kernel.
1955 */
1719 if (optind + 2 > argc) 1956 if (optind + 2 > argc)
1720 usage(); 1957 usage();
1721 1958
@@ -1733,20 +1970,26 @@ int main(int argc, char *argv[])
1733 /* Map the initrd image if requested (at top of physical memory) */ 1970 /* Map the initrd image if requested (at top of physical memory) */
1734 if (initrd_name) { 1971 if (initrd_name) {
1735 initrd_size = load_initrd(initrd_name, mem); 1972 initrd_size = load_initrd(initrd_name, mem);
1736 /* These are the location in the Linux boot header where the 1973 /*
1737 * start and size of the initrd are expected to be found. */ 1974 * These are the location in the Linux boot header where the
1975 * start and size of the initrd are expected to be found.
1976 */
1738 boot->hdr.ramdisk_image = mem - initrd_size; 1977 boot->hdr.ramdisk_image = mem - initrd_size;
1739 boot->hdr.ramdisk_size = initrd_size; 1978 boot->hdr.ramdisk_size = initrd_size;
1740 /* The bootloader type 0xFF means "unknown"; that's OK. */ 1979 /* The bootloader type 0xFF means "unknown"; that's OK. */
1741 boot->hdr.type_of_loader = 0xFF; 1980 boot->hdr.type_of_loader = 0xFF;
1742 } 1981 }
1743 1982
1744 /* The Linux boot header contains an "E820" memory map: ours is a 1983 /*
1745 * simple, single region. */ 1984 * The Linux boot header contains an "E820" memory map: ours is a
1985 * simple, single region.
1986 */
1746 boot->e820_entries = 1; 1987 boot->e820_entries = 1;
1747 boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); 1988 boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
1748 /* The boot header contains a command line pointer: we put the command 1989 /*
1749 * line after the boot header. */ 1990 * The boot header contains a command line pointer: we put the command
1991 * line after the boot header.
1992 */
1750 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); 1993 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
1751 /* We use a simple helper to copy the arguments separated by spaces. */ 1994 /* We use a simple helper to copy the arguments separated by spaces. */
1752 concat((char *)(boot + 1), argv+optind+2); 1995 concat((char *)(boot + 1), argv+optind+2);
@@ -1760,11 +2003,13 @@ int main(int argc, char *argv[])
1760 /* Tell the entry path not to try to reload segment registers. */ 2003 /* Tell the entry path not to try to reload segment registers. */
1761 boot->hdr.loadflags |= KEEP_SEGMENTS; 2004 boot->hdr.loadflags |= KEEP_SEGMENTS;
1762 2005
1763 /* We tell the kernel to initialize the Guest: this returns the open 2006 /*
1764 * /dev/lguest file descriptor. */ 2007 * We tell the kernel to initialize the Guest: this returns the open
2008 * /dev/lguest file descriptor.
2009 */
1765 tell_kernel(start); 2010 tell_kernel(start);
1766 2011
1767 /* Ensure that we terminate if a child dies. */ 2012 /* Ensure that we terminate if a device-servicing child dies. */
1768 signal(SIGCHLD, kill_launcher); 2013 signal(SIGCHLD, kill_launcher);
1769 2014
1770 /* If we exit via err(), this kills all the threads, restores tty. */ 2015 /* If we exit via err(), this kills all the threads, restores tty. */
diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt
index e20d913d5914..abf768c681e2 100644
--- a/Documentation/lockdep-design.txt
+++ b/Documentation/lockdep-design.txt
@@ -30,9 +30,9 @@ State
30The validator tracks lock-class usage history into 4n + 1 separate state bits: 30The validator tracks lock-class usage history into 4n + 1 separate state bits:
31 31
32- 'ever held in STATE context' 32- 'ever held in STATE context'
33- 'ever head as readlock in STATE context' 33- 'ever held as readlock in STATE context'
34- 'ever head with STATE enabled' 34- 'ever held with STATE enabled'
35- 'ever head as readlock with STATE enabled' 35- 'ever held as readlock with STATE enabled'
36 36
37Where STATE can be either one of (kernel/lockdep_states.h) 37Where STATE can be either one of (kernel/lockdep_states.h)
38 - hardirq 38 - hardirq
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
index 1634c6dcecae..50189bf07d53 100644
--- a/Documentation/networking/00-INDEX
+++ b/Documentation/networking/00-INDEX
@@ -60,6 +60,8 @@ framerelay.txt
60 - info on using Frame Relay/Data Link Connection Identifier (DLCI). 60 - info on using Frame Relay/Data Link Connection Identifier (DLCI).
61generic_netlink.txt 61generic_netlink.txt
62 - info on Generic Netlink 62 - info on Generic Netlink
63ieee802154.txt
64 - Linux IEEE 802.15.4 implementation, API and drivers
63ip-sysctl.txt 65ip-sysctl.txt
64 - /proc/sys/net/ipv4/* variables 66 - /proc/sys/net/ipv4/* variables
65ip_dynaddr.txt 67ip_dynaddr.txt
diff --git a/Documentation/networking/6pack.txt b/Documentation/networking/6pack.txt
index d0777a1200e1..8f339428fdf4 100644
--- a/Documentation/networking/6pack.txt
+++ b/Documentation/networking/6pack.txt
@@ -1,7 +1,7 @@
1This is the 6pack-mini-HOWTO, written by 1This is the 6pack-mini-HOWTO, written by
2 2
3Andreas Könsgen DG3KQ 3Andreas Könsgen DG3KQ
4Internet: ajk@iehk.rwth-aachen.de 4Internet: ajk@comnets.uni-bremen.de
5AMPR-net: dg3kq@db0pra.ampr.org 5AMPR-net: dg3kq@db0pra.ampr.org
6AX.25: dg3kq@db0ach.#nrw.deu.eu 6AX.25: dg3kq@db0ach.#nrw.deu.eu
7 7
diff --git a/Documentation/networking/ieee802154.txt b/Documentation/networking/ieee802154.txt
index a0280ad2edc9..23c995e64032 100644
--- a/Documentation/networking/ieee802154.txt
+++ b/Documentation/networking/ieee802154.txt
@@ -22,7 +22,7 @@ int sd = socket(PF_IEEE802154, SOCK_DGRAM, 0);
22..... 22.....
23 23
24The address family, socket addresses etc. are defined in the 24The address family, socket addresses etc. are defined in the
25include/net/ieee802154/af_ieee802154.h header or in the special header 25include/net/af_ieee802154.h header or in the special header
26in our userspace package (see either linux-zigbee sourceforge download page 26in our userspace package (see either linux-zigbee sourceforge download page
27or git tree at git://linux-zigbee.git.sourceforge.net/gitroot/linux-zigbee). 27or git tree at git://linux-zigbee.git.sourceforge.net/gitroot/linux-zigbee).
28 28
@@ -33,7 +33,7 @@ MLME - MAC Level Management
33============================ 33============================
34 34
35Most of IEEE 802.15.4 MLME interfaces are directly mapped on netlink commands. 35Most of IEEE 802.15.4 MLME interfaces are directly mapped on netlink commands.
36See the include/net/ieee802154/nl802154.h header. Our userspace tools package 36See the include/net/nl802154.h header. Our userspace tools package
37(see above) provides CLI configuration utility for radio interfaces and simple 37(see above) provides CLI configuration utility for radio interfaces and simple
38coordinator for IEEE 802.15.4 networks as an example users of MLME protocol. 38coordinator for IEEE 802.15.4 networks as an example users of MLME protocol.
39 39
@@ -54,10 +54,14 @@ Those types of devices require different approach to be hooked into Linux kernel
54HardMAC 54HardMAC
55======= 55=======
56 56
57See the header include/net/ieee802154/netdevice.h. You have to implement Linux 57See the header include/net/ieee802154_netdev.h. You have to implement Linux
58net_device, with .type = ARPHRD_IEEE802154. Data is exchanged with socket family 58net_device, with .type = ARPHRD_IEEE802154. Data is exchanged with socket family
59code via plain sk_buffs. The control block of sk_buffs will contain additional 59code via plain sk_buffs. On skb reception skb->cb must contain additional
60info as described in the struct ieee802154_mac_cb. 60info as described in the struct ieee802154_mac_cb. During packet transmission
61the skb->cb is used to provide additional data to device's header_ops->create
62function. Be aware, that this data can be overriden later (when socket code
63submits skb to qdisc), so if you need something from that cb later, you should
64store info in the skb->data on your own.
61 65
62To hook the MLME interface you have to populate the ml_priv field of your 66To hook the MLME interface you have to populate the ml_priv field of your
63net_device with a pointer to struct ieee802154_mlme_ops instance. All fields are 67net_device with a pointer to struct ieee802154_mlme_ops instance. All fields are
@@ -69,8 +73,8 @@ We provide an example of simple HardMAC driver at drivers/ieee802154/fakehard.c
69SoftMAC 73SoftMAC
70======= 74=======
71 75
72We are going to provide intermediate layer impelementing IEEE 802.15.4 MAC 76We are going to provide intermediate layer implementing IEEE 802.15.4 MAC
73in software. This is currently WIP. 77in software. This is currently WIP.
74 78
75See header include/net/ieee802154/mac802154.h and several drivers in 79See header include/net/mac802154.h and several drivers in drivers/ieee802154/.
76drivers/ieee802154/ 80
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 8be76235fe67..fbe427a6580c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -311,9 +311,12 @@ tcp_no_metrics_save - BOOLEAN
311 connections. 311 connections.
312 312
313tcp_orphan_retries - INTEGER 313tcp_orphan_retries - INTEGER
314 How may times to retry before killing TCP connection, closed 314 This value influences the timeout of a locally closed TCP connection,
315 by our side. Default value 7 corresponds to ~50sec-16min 315 when RTO retransmissions remain unacknowledged.
316 depending on RTO. If you machine is loaded WEB server, 316 See tcp_retries2 for more details.
317
318 The default value is 7.
319 If your machine is a loaded WEB server,
317 you should think about lowering this value, such sockets 320 you should think about lowering this value, such sockets
318 may consume significant resources. Cf. tcp_max_orphans. 321 may consume significant resources. Cf. tcp_max_orphans.
319 322
@@ -327,16 +330,28 @@ tcp_retrans_collapse - BOOLEAN
327 certain TCP stacks. 330 certain TCP stacks.
328 331
329tcp_retries1 - INTEGER 332tcp_retries1 - INTEGER
330 How many times to retry before deciding that something is wrong 333 This value influences the time, after which TCP decides, that
331 and it is necessary to report this suspicion to network layer. 334 something is wrong due to unacknowledged RTO retransmissions,
332 Minimal RFC value is 3, it is default, which corresponds 335 and reports this suspicion to the network layer.
333 to ~3sec-8min depending on RTO. 336 See tcp_retries2 for more details.
337
338 RFC 1122 recommends at least 3 retransmissions, which is the
339 default.
334 340
335tcp_retries2 - INTEGER 341tcp_retries2 - INTEGER
336 How may times to retry before killing alive TCP connection. 342 This value influences the timeout of an alive TCP connection,
337 RFC1122 says that the limit should be longer than 100 sec. 343 when RTO retransmissions remain unacknowledged.
338 It is too small number. Default value 15 corresponds to ~13-30min 344 Given a value of N, a hypothetical TCP connection following
339 depending on RTO. 345 exponential backoff with an initial RTO of TCP_RTO_MIN would
346 retransmit N times before killing the connection at the (N+1)th RTO.
347
348 The default value of 15 yields a hypothetical timeout of 924.6
349 seconds and is a lower bound for the effective timeout.
350 TCP will effectively time out at the first RTO which exceeds the
351 hypothetical timeout.
352
353 RFC 1122 recommends at least 100 seconds for the timeout,
354 which corresponds to a value of at least 8.
340 355
341tcp_rfc1337 - BOOLEAN 356tcp_rfc1337 - BOOLEAN
342 If set, the TCP stack behaves conforming to RFC1337. If unset, 357 If set, the TCP stack behaves conforming to RFC1337. If unset,
@@ -1282,6 +1297,16 @@ sctp_rmem - vector of 3 INTEGERs: min, default, max
1282sctp_wmem - vector of 3 INTEGERs: min, default, max 1297sctp_wmem - vector of 3 INTEGERs: min, default, max
1283 See tcp_wmem for a description. 1298 See tcp_wmem for a description.
1284 1299
1300addr_scope_policy - INTEGER
1301 Control IPv4 address scoping - draft-stewart-tsvwg-sctp-ipv4-00
1302
1303 0 - Disable IPv4 address scoping
1304 1 - Enable IPv4 address scoping
1305 2 - Follow draft but allow IPv4 private addresses
1306 3 - Follow draft but allow IPv4 link local addresses
1307
1308 Default: 1
1309
1285 1310
1286/proc/sys/net/core/* 1311/proc/sys/net/core/*
1287dev_weight - INTEGER 1312dev_weight - INTEGER
diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
new file mode 100644
index 000000000000..f49a33b704d2
--- /dev/null
+++ b/Documentation/power/runtime_pm.txt
@@ -0,0 +1,378 @@
1Run-time Power Management Framework for I/O Devices
2
3(C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
4
51. Introduction
6
7Support for run-time power management (run-time PM) of I/O devices is provided
8at the power management core (PM core) level by means of:
9
10* The power management workqueue pm_wq in which bus types and device drivers can
11 put their PM-related work items. It is strongly recommended that pm_wq be
12 used for queuing all work items related to run-time PM, because this allows
13 them to be synchronized with system-wide power transitions (suspend to RAM,
14 hibernation and resume from system sleep states). pm_wq is declared in
15 include/linux/pm_runtime.h and defined in kernel/power/main.c.
16
17* A number of run-time PM fields in the 'power' member of 'struct device' (which
18 is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can
19 be used for synchronizing run-time PM operations with one another.
20
21* Three device run-time PM callbacks in 'struct dev_pm_ops' (defined in
22 include/linux/pm.h).
23
24* A set of helper functions defined in drivers/base/power/runtime.c that can be
25 used for carrying out run-time PM operations in such a way that the
26 synchronization between them is taken care of by the PM core. Bus types and
27 device drivers are encouraged to use these functions.
28
29The run-time PM callbacks present in 'struct dev_pm_ops', the device run-time PM
30fields of 'struct dev_pm_info' and the core helper functions provided for
31run-time PM are described below.
32
332. Device Run-time PM Callbacks
34
35There are three device run-time PM callbacks defined in 'struct dev_pm_ops':
36
37struct dev_pm_ops {
38 ...
39 int (*runtime_suspend)(struct device *dev);
40 int (*runtime_resume)(struct device *dev);
41 void (*runtime_idle)(struct device *dev);
42 ...
43};
44
45The ->runtime_suspend() callback is executed by the PM core for the bus type of
46the device being suspended. The bus type's callback is then _entirely_
47_responsible_ for handling the device as appropriate, which may, but need not
48include executing the device driver's own ->runtime_suspend() callback (from the
49PM core's point of view it is not necessary to implement a ->runtime_suspend()
50callback in a device driver as long as the bus type's ->runtime_suspend() knows
51what to do to handle the device).
52
53 * Once the bus type's ->runtime_suspend() callback has completed successfully
54 for given device, the PM core regards the device as suspended, which need
55 not mean that the device has been put into a low power state. It is
56 supposed to mean, however, that the device will not process data and will
57 not communicate with the CPU(s) and RAM until its bus type's
58 ->runtime_resume() callback is executed for it. The run-time PM status of
59 a device after successful execution of its bus type's ->runtime_suspend()
60 callback is 'suspended'.
61
62 * If the bus type's ->runtime_suspend() callback returns -EBUSY or -EAGAIN,
63 the device's run-time PM status is supposed to be 'active', which means that
64 the device _must_ be fully operational afterwards.
65
66 * If the bus type's ->runtime_suspend() callback returns an error code
67 different from -EBUSY or -EAGAIN, the PM core regards this as a fatal
68 error and will refuse to run the helper functions described in Section 4
69 for the device, until the status of it is directly set either to 'active'
70 or to 'suspended' (the PM core provides special helper functions for this
71 purpose).
72
73In particular, if the driver requires remote wakeup capability for proper
74functioning and device_may_wakeup() returns 'false' for the device, then
75->runtime_suspend() should return -EBUSY. On the other hand, if
76device_may_wakeup() returns 'true' for the device and the device is put
77into a low power state during the execution of its bus type's
78->runtime_suspend(), it is expected that remote wake-up (i.e. hardware mechanism
79allowing the device to request a change of its power state, such as PCI PME)
80will be enabled for the device. Generally, remote wake-up should be enabled
81for all input devices put into a low power state at run time.
82
83The ->runtime_resume() callback is executed by the PM core for the bus type of
84the device being woken up. The bus type's callback is then _entirely_
85_responsible_ for handling the device as appropriate, which may, but need not
86include executing the device driver's own ->runtime_resume() callback (from the
87PM core's point of view it is not necessary to implement a ->runtime_resume()
88callback in a device driver as long as the bus type's ->runtime_resume() knows
89what to do to handle the device).
90
91 * Once the bus type's ->runtime_resume() callback has completed successfully,
92 the PM core regards the device as fully operational, which means that the
93 device _must_ be able to complete I/O operations as needed. The run-time
94 PM status of the device is then 'active'.
95
96 * If the bus type's ->runtime_resume() callback returns an error code, the PM
97 core regards this as a fatal error and will refuse to run the helper
98 functions described in Section 4 for the device, until its status is
99 directly set either to 'active' or to 'suspended' (the PM core provides
100 special helper functions for this purpose).
101
102The ->runtime_idle() callback is executed by the PM core for the bus type of
103given device whenever the device appears to be idle, which is indicated to the
104PM core by two counters, the device's usage counter and the counter of 'active'
105children of the device.
106
107 * If any of these counters is decreased using a helper function provided by
108 the PM core and it turns out to be equal to zero, the other counter is
109 checked. If that counter also is equal to zero, the PM core executes the
110 device bus type's ->runtime_idle() callback (with the device as an
111 argument).
112
113The action performed by a bus type's ->runtime_idle() callback is totally
114dependent on the bus type in question, but the expected and recommended action
115is to check if the device can be suspended (i.e. if all of the conditions
116necessary for suspending the device are satisfied) and to queue up a suspend
117request for the device in that case.
118
119The helper functions provided by the PM core, described in Section 4, guarantee
120that the following constraints are met with respect to the bus type's run-time
121PM callbacks:
122
123(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute
124 ->runtime_suspend() in parallel with ->runtime_resume() or with another
125 instance of ->runtime_suspend() for the same device) with the exception that
126 ->runtime_suspend() or ->runtime_resume() can be executed in parallel with
127 ->runtime_idle() (although ->runtime_idle() will not be started while any
128 of the other callbacks is being executed for the same device).
129
130(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active'
131 devices (i.e. the PM core will only execute ->runtime_idle() or
132 ->runtime_suspend() for the devices the run-time PM status of which is
133 'active').
134
135(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device
136 the usage counter of which is equal to zero _and_ either the counter of
137 'active' children of which is equal to zero, or the 'power.ignore_children'
138 flag of which is set.
139
140(4) ->runtime_resume() can only be executed for 'suspended' devices (i.e. the
141 PM core will only execute ->runtime_resume() for the devices the run-time
142 PM status of which is 'suspended').
143
144Additionally, the helper functions provided by the PM core obey the following
145rules:
146
147 * If ->runtime_suspend() is about to be executed or there's a pending request
148 to execute it, ->runtime_idle() will not be executed for the same device.
149
150 * A request to execute or to schedule the execution of ->runtime_suspend()
151 will cancel any pending requests to execute ->runtime_idle() for the same
152 device.
153
154 * If ->runtime_resume() is about to be executed or there's a pending request
155 to execute it, the other callbacks will not be executed for the same device.
156
157 * A request to execute ->runtime_resume() will cancel any pending or
158 scheduled requests to execute the other callbacks for the same device.
159
1603. Run-time PM Device Fields
161
162The following device run-time PM fields are present in 'struct dev_pm_info', as
163defined in include/linux/pm.h:
164
165 struct timer_list suspend_timer;
166 - timer used for scheduling (delayed) suspend request
167
168 unsigned long timer_expires;
169 - timer expiration time, in jiffies (if this is different from zero, the
170 timer is running and will expire at that time, otherwise the timer is not
171 running)
172
173 struct work_struct work;
174 - work structure used for queuing up requests (i.e. work items in pm_wq)
175
176 wait_queue_head_t wait_queue;
177 - wait queue used if any of the helper functions needs to wait for another
178 one to complete
179
180 spinlock_t lock;
181 - lock used for synchronisation
182
183 atomic_t usage_count;
184 - the usage counter of the device
185
186 atomic_t child_count;
187 - the count of 'active' children of the device
188
189 unsigned int ignore_children;
190 - if set, the value of child_count is ignored (but still updated)
191
192 unsigned int disable_depth;
193 - used for disabling the helper funcions (they work normally if this is
194 equal to zero); the initial value of it is 1 (i.e. run-time PM is
195 initially disabled for all devices)
196
197 unsigned int runtime_error;
198 - if set, there was a fatal error (one of the callbacks returned error code
199 as described in Section 2), so the helper funtions will not work until
200 this flag is cleared; this is the error code returned by the failing
201 callback
202
203 unsigned int idle_notification;
204 - if set, ->runtime_idle() is being executed
205
206 unsigned int request_pending;
207 - if set, there's a pending request (i.e. a work item queued up into pm_wq)
208
209 enum rpm_request request;
210 - type of request that's pending (valid if request_pending is set)
211
212 unsigned int deferred_resume;
213 - set if ->runtime_resume() is about to be run while ->runtime_suspend() is
214 being executed for that device and it is not practical to wait for the
215 suspend to complete; means "start a resume as soon as you've suspended"
216
217 enum rpm_status runtime_status;
218 - the run-time PM status of the device; this field's initial value is
219 RPM_SUSPENDED, which means that each device is initially regarded by the
220 PM core as 'suspended', regardless of its real hardware status
221
222All of the above fields are members of the 'power' member of 'struct device'.
223
2244. Run-time PM Device Helper Functions
225
226The following run-time PM helper functions are defined in
227drivers/base/power/runtime.c and include/linux/pm_runtime.h:
228
229 void pm_runtime_init(struct device *dev);
230 - initialize the device run-time PM fields in 'struct dev_pm_info'
231
232 void pm_runtime_remove(struct device *dev);
233 - make sure that the run-time PM of the device will be disabled after
234 removing the device from device hierarchy
235
236 int pm_runtime_idle(struct device *dev);
237 - execute ->runtime_idle() for the device's bus type; returns 0 on success
238 or error code on failure, where -EINPROGRESS means that ->runtime_idle()
239 is already being executed
240
241 int pm_runtime_suspend(struct device *dev);
242 - execute ->runtime_suspend() for the device's bus type; returns 0 on
243 success, 1 if the device's run-time PM status was already 'suspended', or
244 error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt
245 to suspend the device again in future
246
247 int pm_runtime_resume(struct device *dev);
248 - execute ->runtime_resume() for the device's bus type; returns 0 on
249 success, 1 if the device's run-time PM status was already 'active' or
250 error code on failure, where -EAGAIN means it may be safe to attempt to
251 resume the device again in future, but 'power.runtime_error' should be
252 checked additionally
253
254 int pm_request_idle(struct device *dev);
255 - submit a request to execute ->runtime_idle() for the device's bus type
256 (the request is represented by a work item in pm_wq); returns 0 on success
257 or error code if the request has not been queued up
258
259 int pm_schedule_suspend(struct device *dev, unsigned int delay);
260 - schedule the execution of ->runtime_suspend() for the device's bus type
261 in future, where 'delay' is the time to wait before queuing up a suspend
262 work item in pm_wq, in milliseconds (if 'delay' is zero, the work item is
263 queued up immediately); returns 0 on success, 1 if the device's PM
264 run-time status was already 'suspended', or error code if the request
265 hasn't been scheduled (or queued up if 'delay' is 0); if the execution of
266 ->runtime_suspend() is already scheduled and not yet expired, the new
267 value of 'delay' will be used as the time to wait
268
269 int pm_request_resume(struct device *dev);
270 - submit a request to execute ->runtime_resume() for the device's bus type
271 (the request is represented by a work item in pm_wq); returns 0 on
272 success, 1 if the device's run-time PM status was already 'active', or
273 error code if the request hasn't been queued up
274
275 void pm_runtime_get_noresume(struct device *dev);
276 - increment the device's usage counter
277
278 int pm_runtime_get(struct device *dev);
279 - increment the device's usage counter, run pm_request_resume(dev) and
280 return its result
281
282 int pm_runtime_get_sync(struct device *dev);
283 - increment the device's usage counter, run pm_runtime_resume(dev) and
284 return its result
285
286 void pm_runtime_put_noidle(struct device *dev);
287 - decrement the device's usage counter
288
289 int pm_runtime_put(struct device *dev);
290 - decrement the device's usage counter, run pm_request_idle(dev) and return
291 its result
292
293 int pm_runtime_put_sync(struct device *dev);
294 - decrement the device's usage counter, run pm_runtime_idle(dev) and return
295 its result
296
297 void pm_runtime_enable(struct device *dev);
298 - enable the run-time PM helper functions to run the device bus type's
299 run-time PM callbacks described in Section 2
300
301 int pm_runtime_disable(struct device *dev);
302 - prevent the run-time PM helper functions from running the device bus
303 type's run-time PM callbacks, make sure that all of the pending run-time
304 PM operations on the device are either completed or canceled; returns
305 1 if there was a resume request pending and it was necessary to execute
306 ->runtime_resume() for the device's bus type to satisfy that request,
307 otherwise 0 is returned
308
309 void pm_suspend_ignore_children(struct device *dev, bool enable);
310 - set/unset the power.ignore_children flag of the device
311
312 int pm_runtime_set_active(struct device *dev);
313 - clear the device's 'power.runtime_error' flag, set the device's run-time
314 PM status to 'active' and update its parent's counter of 'active'
315 children as appropriate (it is only valid to use this function if
316 'power.runtime_error' is set or 'power.disable_depth' is greater than
317 zero); it will fail and return error code if the device has a parent
318 which is not active and the 'power.ignore_children' flag of which is unset
319
320 void pm_runtime_set_suspended(struct device *dev);
321 - clear the device's 'power.runtime_error' flag, set the device's run-time
322 PM status to 'suspended' and update its parent's counter of 'active'
323 children as appropriate (it is only valid to use this function if
324 'power.runtime_error' is set or 'power.disable_depth' is greater than
325 zero)
326
327It is safe to execute the following helper functions from interrupt context:
328
329pm_request_idle()
330pm_schedule_suspend()
331pm_request_resume()
332pm_runtime_get_noresume()
333pm_runtime_get()
334pm_runtime_put_noidle()
335pm_runtime_put()
336pm_suspend_ignore_children()
337pm_runtime_set_active()
338pm_runtime_set_suspended()
339pm_runtime_enable()
340
3415. Run-time PM Initialization, Device Probing and Removal
342
343Initially, the run-time PM is disabled for all devices, which means that the
344majority of the run-time PM helper funtions described in Section 4 will return
345-EAGAIN until pm_runtime_enable() is called for the device.
346
347In addition to that, the initial run-time PM status of all devices is
348'suspended', but it need not reflect the actual physical state of the device.
349Thus, if the device is initially active (i.e. it is able to process I/O), its
350run-time PM status must be changed to 'active', with the help of
351pm_runtime_set_active(), before pm_runtime_enable() is called for the device.
352
353However, if the device has a parent and the parent's run-time PM is enabled,
354calling pm_runtime_set_active() for the device will affect the parent, unless
355the parent's 'power.ignore_children' flag is set. Namely, in that case the
356parent won't be able to suspend at run time, using the PM core's helper
357functions, as long as the child's status is 'active', even if the child's
358run-time PM is still disabled (i.e. pm_runtime_enable() hasn't been called for
359the child yet or pm_runtime_disable() has been called for it). For this reason,
360once pm_runtime_set_active() has been called for the device, pm_runtime_enable()
361should be called for it too as soon as reasonably possible or its run-time PM
362status should be changed back to 'suspended' with the help of
363pm_runtime_set_suspended().
364
365If the default initial run-time PM status of the device (i.e. 'suspended')
366reflects the actual state of the device, its bus type's or its driver's
367->probe() callback will likely need to wake it up using one of the PM core's
368helper functions described in Section 4. In that case, pm_runtime_resume()
369should be used. Of course, for this purpose the device's run-time PM has to be
370enabled earlier by calling pm_runtime_enable().
371
372If the device bus type's or driver's ->probe() or ->remove() callback runs
373pm_runtime_suspend() or pm_runtime_idle() or their asynchronous counterparts,
374they will fail returning -EAGAIN, because the device's usage counter is
375incremented by the core before executing ->probe() and ->remove(). Still, it
376may be desirable to suspend the device as soon as ->probe() or ->remove() has
377finished, so the PM core uses pm_runtime_idle_sync() to invoke the device bus
378type's ->runtime_idle() callback at that time.
diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt
index 8d999d862d0e..79f533f38c61 100644
--- a/Documentation/powerpc/booting-without-of.txt
+++ b/Documentation/powerpc/booting-without-of.txt
@@ -1238,1122 +1238,7 @@ descriptions for the SOC devices for which new nodes have been
1238defined; this list will expand as more and more SOC-containing 1238defined; this list will expand as more and more SOC-containing
1239platforms are moved over to use the flattened-device-tree model. 1239platforms are moved over to use the flattened-device-tree model.
1240 1240
1241 a) PHY nodes 1241VII - Specifying interrupt information for devices
1242
1243 Required properties:
1244
1245 - device_type : Should be "ethernet-phy"
1246 - interrupts : <a b> where a is the interrupt number and b is a
1247 field that represents an encoding of the sense and level
1248 information for the interrupt. This should be encoded based on
1249 the information in section 2) depending on the type of interrupt
1250 controller you have.
1251 - interrupt-parent : the phandle for the interrupt controller that
1252 services interrupts for this device.
1253 - reg : The ID number for the phy, usually a small integer
1254 - linux,phandle : phandle for this node; likely referenced by an
1255 ethernet controller node.
1256
1257
1258 Example:
1259
1260 ethernet-phy@0 {
1261 linux,phandle = <2452000>
1262 interrupt-parent = <40000>;
1263 interrupts = <35 1>;
1264 reg = <0>;
1265 device_type = "ethernet-phy";
1266 };
1267
1268
1269 b) Interrupt controllers
1270
1271 Some SOC devices contain interrupt controllers that are different
1272 from the standard Open PIC specification. The SOC device nodes for
1273 these types of controllers should be specified just like a standard
1274 OpenPIC controller. Sense and level information should be encoded
1275 as specified in section 2) of this chapter for each device that
1276 specifies an interrupt.
1277
1278 Example :
1279
1280 pic@40000 {
1281 linux,phandle = <40000>;
1282 interrupt-controller;
1283 #address-cells = <0>;
1284 reg = <40000 40000>;
1285 compatible = "chrp,open-pic";
1286 device_type = "open-pic";
1287 };
1288
1289 c) 4xx/Axon EMAC ethernet nodes
1290
1291 The EMAC ethernet controller in IBM and AMCC 4xx chips, and also
1292 the Axon bridge. To operate this needs to interact with a ths
1293 special McMAL DMA controller, and sometimes an RGMII or ZMII
1294 interface. In addition to the nodes and properties described
1295 below, the node for the OPB bus on which the EMAC sits must have a
1296 correct clock-frequency property.
1297
1298 i) The EMAC node itself
1299
1300 Required properties:
1301 - device_type : "network"
1302
1303 - compatible : compatible list, contains 2 entries, first is
1304 "ibm,emac-CHIP" where CHIP is the host ASIC (440gx,
1305 405gp, Axon) and second is either "ibm,emac" or
1306 "ibm,emac4". For Axon, thus, we have: "ibm,emac-axon",
1307 "ibm,emac4"
1308 - interrupts : <interrupt mapping for EMAC IRQ and WOL IRQ>
1309 - interrupt-parent : optional, if needed for interrupt mapping
1310 - reg : <registers mapping>
1311 - local-mac-address : 6 bytes, MAC address
1312 - mal-device : phandle of the associated McMAL node
1313 - mal-tx-channel : 1 cell, index of the tx channel on McMAL associated
1314 with this EMAC
1315 - mal-rx-channel : 1 cell, index of the rx channel on McMAL associated
1316 with this EMAC
1317 - cell-index : 1 cell, hardware index of the EMAC cell on a given
1318 ASIC (typically 0x0 and 0x1 for EMAC0 and EMAC1 on
1319 each Axon chip)
1320 - max-frame-size : 1 cell, maximum frame size supported in bytes
1321 - rx-fifo-size : 1 cell, Rx fifo size in bytes for 10 and 100 Mb/sec
1322 operations.
1323 For Axon, 2048
1324 - tx-fifo-size : 1 cell, Tx fifo size in bytes for 10 and 100 Mb/sec
1325 operations.
1326 For Axon, 2048.
1327 - fifo-entry-size : 1 cell, size of a fifo entry (used to calculate
1328 thresholds).
1329 For Axon, 0x00000010
1330 - mal-burst-size : 1 cell, MAL burst size (used to calculate thresholds)
1331 in bytes.
1332 For Axon, 0x00000100 (I think ...)
1333 - phy-mode : string, mode of operations of the PHY interface.
1334 Supported values are: "mii", "rmii", "smii", "rgmii",
1335 "tbi", "gmii", rtbi", "sgmii".
1336 For Axon on CAB, it is "rgmii"
1337 - mdio-device : 1 cell, required iff using shared MDIO registers
1338 (440EP). phandle of the EMAC to use to drive the
1339 MDIO lines for the PHY used by this EMAC.
1340 - zmii-device : 1 cell, required iff connected to a ZMII. phandle of
1341 the ZMII device node
1342 - zmii-channel : 1 cell, required iff connected to a ZMII. Which ZMII
1343 channel or 0xffffffff if ZMII is only used for MDIO.
1344 - rgmii-device : 1 cell, required iff connected to an RGMII. phandle
1345 of the RGMII device node.
1346 For Axon: phandle of plb5/plb4/opb/rgmii
1347 - rgmii-channel : 1 cell, required iff connected to an RGMII. Which
1348 RGMII channel is used by this EMAC.
1349 Fox Axon: present, whatever value is appropriate for each
1350 EMAC, that is the content of the current (bogus) "phy-port"
1351 property.
1352
1353 Optional properties:
1354 - phy-address : 1 cell, optional, MDIO address of the PHY. If absent,
1355 a search is performed.
1356 - phy-map : 1 cell, optional, bitmap of addresses to probe the PHY
1357 for, used if phy-address is absent. bit 0x00000001 is
1358 MDIO address 0.
1359 For Axon it can be absent, though my current driver
1360 doesn't handle phy-address yet so for now, keep
1361 0x00ffffff in it.
1362 - rx-fifo-size-gige : 1 cell, Rx fifo size in bytes for 1000 Mb/sec
1363 operations (if absent the value is the same as
1364 rx-fifo-size). For Axon, either absent or 2048.
1365 - tx-fifo-size-gige : 1 cell, Tx fifo size in bytes for 1000 Mb/sec
1366 operations (if absent the value is the same as
1367 tx-fifo-size). For Axon, either absent or 2048.
1368 - tah-device : 1 cell, optional. If connected to a TAH engine for
1369 offload, phandle of the TAH device node.
1370 - tah-channel : 1 cell, optional. If appropriate, channel used on the
1371 TAH engine.
1372
1373 Example:
1374
1375 EMAC0: ethernet@40000800 {
1376 device_type = "network";
1377 compatible = "ibm,emac-440gp", "ibm,emac";
1378 interrupt-parent = <&UIC1>;
1379 interrupts = <1c 4 1d 4>;
1380 reg = <40000800 70>;
1381 local-mac-address = [00 04 AC E3 1B 1E];
1382 mal-device = <&MAL0>;
1383 mal-tx-channel = <0 1>;
1384 mal-rx-channel = <0>;
1385 cell-index = <0>;
1386 max-frame-size = <5dc>;
1387 rx-fifo-size = <1000>;
1388 tx-fifo-size = <800>;
1389 phy-mode = "rmii";
1390 phy-map = <00000001>;
1391 zmii-device = <&ZMII0>;
1392 zmii-channel = <0>;
1393 };
1394
1395 ii) McMAL node
1396
1397 Required properties:
1398 - device_type : "dma-controller"
1399 - compatible : compatible list, containing 2 entries, first is
1400 "ibm,mcmal-CHIP" where CHIP is the host ASIC (like
1401 emac) and the second is either "ibm,mcmal" or
1402 "ibm,mcmal2".
1403 For Axon, "ibm,mcmal-axon","ibm,mcmal2"
1404 - interrupts : <interrupt mapping for the MAL interrupts sources:
1405 5 sources: tx_eob, rx_eob, serr, txde, rxde>.
1406 For Axon: This is _different_ from the current
1407 firmware. We use the "delayed" interrupts for txeob
1408 and rxeob. Thus we end up with mapping those 5 MPIC
1409 interrupts, all level positive sensitive: 10, 11, 32,
1410 33, 34 (in decimal)
1411 - dcr-reg : < DCR registers range >
1412 - dcr-parent : if needed for dcr-reg
1413 - num-tx-chans : 1 cell, number of Tx channels
1414 - num-rx-chans : 1 cell, number of Rx channels
1415
1416 iii) ZMII node
1417
1418 Required properties:
1419 - compatible : compatible list, containing 2 entries, first is
1420 "ibm,zmii-CHIP" where CHIP is the host ASIC (like
1421 EMAC) and the second is "ibm,zmii".
1422 For Axon, there is no ZMII node.
1423 - reg : <registers mapping>
1424
1425 iv) RGMII node
1426
1427 Required properties:
1428 - compatible : compatible list, containing 2 entries, first is
1429 "ibm,rgmii-CHIP" where CHIP is the host ASIC (like
1430 EMAC) and the second is "ibm,rgmii".
1431 For Axon, "ibm,rgmii-axon","ibm,rgmii"
1432 - reg : <registers mapping>
1433 - revision : as provided by the RGMII new version register if
1434 available.
1435 For Axon: 0x0000012a
1436
1437 d) Xilinx IP cores
1438
1439 The Xilinx EDK toolchain ships with a set of IP cores (devices) for use
1440 in Xilinx Spartan and Virtex FPGAs. The devices cover the whole range
1441 of standard device types (network, serial, etc.) and miscellaneous
1442 devices (gpio, LCD, spi, etc). Also, since these devices are
1443 implemented within the fpga fabric every instance of the device can be
1444 synthesised with different options that change the behaviour.
1445
1446 Each IP-core has a set of parameters which the FPGA designer can use to
1447 control how the core is synthesized. Historically, the EDK tool would
1448 extract the device parameters relevant to device drivers and copy them
1449 into an 'xparameters.h' in the form of #define symbols. This tells the
1450 device drivers how the IP cores are configured, but it requres the kernel
1451 to be recompiled every time the FPGA bitstream is resynthesized.
1452
1453 The new approach is to export the parameters into the device tree and
1454 generate a new device tree each time the FPGA bitstream changes. The
1455 parameters which used to be exported as #defines will now become
1456 properties of the device node. In general, device nodes for IP-cores
1457 will take the following form:
1458
1459 (name): (generic-name)@(base-address) {
1460 compatible = "xlnx,(ip-core-name)-(HW_VER)"
1461 [, (list of compatible devices), ...];
1462 reg = <(baseaddr) (size)>;
1463 interrupt-parent = <&interrupt-controller-phandle>;
1464 interrupts = < ... >;
1465 xlnx,(parameter1) = "(string-value)";
1466 xlnx,(parameter2) = <(int-value)>;
1467 };
1468
1469 (generic-name): an open firmware-style name that describes the
1470 generic class of device. Preferably, this is one word, such
1471 as 'serial' or 'ethernet'.
1472 (ip-core-name): the name of the ip block (given after the BEGIN
1473 directive in system.mhs). Should be in lowercase
1474 and all underscores '_' converted to dashes '-'.
1475 (name): is derived from the "PARAMETER INSTANCE" value.
1476 (parameter#): C_* parameters from system.mhs. The C_ prefix is
1477 dropped from the parameter name, the name is converted
1478 to lowercase and all underscore '_' characters are
1479 converted to dashes '-'.
1480 (baseaddr): the baseaddr parameter value (often named C_BASEADDR).
1481 (HW_VER): from the HW_VER parameter.
1482 (size): the address range size (often C_HIGHADDR - C_BASEADDR + 1).
1483
1484 Typically, the compatible list will include the exact IP core version
1485 followed by an older IP core version which implements the same
1486 interface or any other device with the same interface.
1487
1488 'reg', 'interrupt-parent' and 'interrupts' are all optional properties.
1489
1490 For example, the following block from system.mhs:
1491
1492 BEGIN opb_uartlite
1493 PARAMETER INSTANCE = opb_uartlite_0
1494 PARAMETER HW_VER = 1.00.b
1495 PARAMETER C_BAUDRATE = 115200
1496 PARAMETER C_DATA_BITS = 8
1497 PARAMETER C_ODD_PARITY = 0
1498 PARAMETER C_USE_PARITY = 0
1499 PARAMETER C_CLK_FREQ = 50000000
1500 PARAMETER C_BASEADDR = 0xEC100000
1501 PARAMETER C_HIGHADDR = 0xEC10FFFF
1502 BUS_INTERFACE SOPB = opb_7
1503 PORT OPB_Clk = CLK_50MHz
1504 PORT Interrupt = opb_uartlite_0_Interrupt
1505 PORT RX = opb_uartlite_0_RX
1506 PORT TX = opb_uartlite_0_TX
1507 PORT OPB_Rst = sys_bus_reset_0
1508 END
1509
1510 becomes the following device tree node:
1511
1512 opb_uartlite_0: serial@ec100000 {
1513 device_type = "serial";
1514 compatible = "xlnx,opb-uartlite-1.00.b";
1515 reg = <ec100000 10000>;
1516 interrupt-parent = <&opb_intc_0>;
1517 interrupts = <1 0>; // got this from the opb_intc parameters
1518 current-speed = <d#115200>; // standard serial device prop
1519 clock-frequency = <d#50000000>; // standard serial device prop
1520 xlnx,data-bits = <8>;
1521 xlnx,odd-parity = <0>;
1522 xlnx,use-parity = <0>;
1523 };
1524
1525 Some IP cores actually implement 2 or more logical devices. In
1526 this case, the device should still describe the whole IP core with
1527 a single node and add a child node for each logical device. The
1528 ranges property can be used to translate from parent IP-core to the
1529 registers of each device. In addition, the parent node should be
1530 compatible with the bus type 'xlnx,compound', and should contain
1531 #address-cells and #size-cells, as with any other bus. (Note: this
1532 makes the assumption that both logical devices have the same bus
1533 binding. If this is not true, then separate nodes should be used
1534 for each logical device). The 'cell-index' property can be used to
1535 enumerate logical devices within an IP core. For example, the
1536 following is the system.mhs entry for the dual ps2 controller found
1537 on the ml403 reference design.
1538
1539 BEGIN opb_ps2_dual_ref
1540 PARAMETER INSTANCE = opb_ps2_dual_ref_0
1541 PARAMETER HW_VER = 1.00.a
1542 PARAMETER C_BASEADDR = 0xA9000000
1543 PARAMETER C_HIGHADDR = 0xA9001FFF
1544 BUS_INTERFACE SOPB = opb_v20_0
1545 PORT Sys_Intr1 = ps2_1_intr
1546 PORT Sys_Intr2 = ps2_2_intr
1547 PORT Clkin1 = ps2_clk_rx_1
1548 PORT Clkin2 = ps2_clk_rx_2
1549 PORT Clkpd1 = ps2_clk_tx_1
1550 PORT Clkpd2 = ps2_clk_tx_2
1551 PORT Rx1 = ps2_d_rx_1
1552 PORT Rx2 = ps2_d_rx_2
1553 PORT Txpd1 = ps2_d_tx_1
1554 PORT Txpd2 = ps2_d_tx_2
1555 END
1556
1557 It would result in the following device tree nodes:
1558
1559 opb_ps2_dual_ref_0: opb-ps2-dual-ref@a9000000 {
1560 #address-cells = <1>;
1561 #size-cells = <1>;
1562 compatible = "xlnx,compound";
1563 ranges = <0 a9000000 2000>;
1564 // If this device had extra parameters, then they would
1565 // go here.
1566 ps2@0 {
1567 compatible = "xlnx,opb-ps2-dual-ref-1.00.a";
1568 reg = <0 40>;
1569 interrupt-parent = <&opb_intc_0>;
1570 interrupts = <3 0>;
1571 cell-index = <0>;
1572 };
1573 ps2@1000 {
1574 compatible = "xlnx,opb-ps2-dual-ref-1.00.a";
1575 reg = <1000 40>;
1576 interrupt-parent = <&opb_intc_0>;
1577 interrupts = <3 0>;
1578 cell-index = <0>;
1579 };
1580 };
1581
1582 Also, the system.mhs file defines bus attachments from the processor
1583 to the devices. The device tree structure should reflect the bus
1584 attachments. Again an example; this system.mhs fragment:
1585
1586 BEGIN ppc405_virtex4
1587 PARAMETER INSTANCE = ppc405_0
1588 PARAMETER HW_VER = 1.01.a
1589 BUS_INTERFACE DPLB = plb_v34_0
1590 BUS_INTERFACE IPLB = plb_v34_0
1591 END
1592
1593 BEGIN opb_intc
1594 PARAMETER INSTANCE = opb_intc_0
1595 PARAMETER HW_VER = 1.00.c
1596 PARAMETER C_BASEADDR = 0xD1000FC0
1597 PARAMETER C_HIGHADDR = 0xD1000FDF
1598 BUS_INTERFACE SOPB = opb_v20_0
1599 END
1600
1601 BEGIN opb_uart16550
1602 PARAMETER INSTANCE = opb_uart16550_0
1603 PARAMETER HW_VER = 1.00.d
1604 PARAMETER C_BASEADDR = 0xa0000000
1605 PARAMETER C_HIGHADDR = 0xa0001FFF
1606 BUS_INTERFACE SOPB = opb_v20_0
1607 END
1608
1609 BEGIN plb_v34
1610 PARAMETER INSTANCE = plb_v34_0
1611 PARAMETER HW_VER = 1.02.a
1612 END
1613
1614 BEGIN plb_bram_if_cntlr
1615 PARAMETER INSTANCE = plb_bram_if_cntlr_0
1616 PARAMETER HW_VER = 1.00.b
1617 PARAMETER C_BASEADDR = 0xFFFF0000
1618 PARAMETER C_HIGHADDR = 0xFFFFFFFF
1619 BUS_INTERFACE SPLB = plb_v34_0
1620 END
1621
1622 BEGIN plb2opb_bridge
1623 PARAMETER INSTANCE = plb2opb_bridge_0
1624 PARAMETER HW_VER = 1.01.a
1625 PARAMETER C_RNG0_BASEADDR = 0x20000000
1626 PARAMETER C_RNG0_HIGHADDR = 0x3FFFFFFF
1627 PARAMETER C_RNG1_BASEADDR = 0x60000000
1628 PARAMETER C_RNG1_HIGHADDR = 0x7FFFFFFF
1629 PARAMETER C_RNG2_BASEADDR = 0x80000000
1630 PARAMETER C_RNG2_HIGHADDR = 0xBFFFFFFF
1631 PARAMETER C_RNG3_BASEADDR = 0xC0000000
1632 PARAMETER C_RNG3_HIGHADDR = 0xDFFFFFFF
1633 BUS_INTERFACE SPLB = plb_v34_0
1634 BUS_INTERFACE MOPB = opb_v20_0
1635 END
1636
1637 Gives this device tree (some properties removed for clarity):
1638
1639 plb@0 {
1640 #address-cells = <1>;
1641 #size-cells = <1>;
1642 compatible = "xlnx,plb-v34-1.02.a";
1643 device_type = "ibm,plb";
1644 ranges; // 1:1 translation
1645
1646 plb_bram_if_cntrl_0: bram@ffff0000 {
1647 reg = <ffff0000 10000>;
1648 }
1649
1650 opb@20000000 {
1651 #address-cells = <1>;
1652 #size-cells = <1>;
1653 ranges = <20000000 20000000 20000000
1654 60000000 60000000 20000000
1655 80000000 80000000 40000000
1656 c0000000 c0000000 20000000>;
1657
1658 opb_uart16550_0: serial@a0000000 {
1659 reg = <a00000000 2000>;
1660 };
1661
1662 opb_intc_0: interrupt-controller@d1000fc0 {
1663 reg = <d1000fc0 20>;
1664 };
1665 };
1666 };
1667
1668 That covers the general approach to binding xilinx IP cores into the
1669 device tree. The following are bindings for specific devices:
1670
1671 i) Xilinx ML300 Framebuffer
1672
1673 Simple framebuffer device from the ML300 reference design (also on the
1674 ML403 reference design as well as others).
1675
1676 Optional properties:
1677 - resolution = <xres yres> : pixel resolution of framebuffer. Some
1678 implementations use a different resolution.
1679 Default is <d#640 d#480>
1680 - virt-resolution = <xvirt yvirt> : Size of framebuffer in memory.
1681 Default is <d#1024 d#480>.
1682 - rotate-display (empty) : rotate display 180 degrees.
1683
1684 ii) Xilinx SystemACE
1685
1686 The Xilinx SystemACE device is used to program FPGAs from an FPGA
1687 bitstream stored on a CF card. It can also be used as a generic CF
1688 interface device.
1689
1690 Optional properties:
1691 - 8-bit (empty) : Set this property for SystemACE in 8 bit mode
1692
1693 iii) Xilinx EMAC and Xilinx TEMAC
1694
1695 Xilinx Ethernet devices. In addition to general xilinx properties
1696 listed above, nodes for these devices should include a phy-handle
1697 property, and may include other common network device properties
1698 like local-mac-address.
1699
1700 iv) Xilinx Uartlite
1701
1702 Xilinx uartlite devices are simple fixed speed serial ports.
1703
1704 Required properties:
1705 - current-speed : Baud rate of uartlite
1706
1707 v) Xilinx hwicap
1708
1709 Xilinx hwicap devices provide access to the configuration logic
1710 of the FPGA through the Internal Configuration Access Port
1711 (ICAP). The ICAP enables partial reconfiguration of the FPGA,
1712 readback of the configuration information, and some control over
1713 'warm boots' of the FPGA fabric.
1714
1715 Required properties:
1716 - xlnx,family : The family of the FPGA, necessary since the
1717 capabilities of the underlying ICAP hardware
1718 differ between different families. May be
1719 'virtex2p', 'virtex4', or 'virtex5'.
1720
1721 vi) Xilinx Uart 16550
1722
1723 Xilinx UART 16550 devices are very similar to the NS16550 but with
1724 different register spacing and an offset from the base address.
1725
1726 Required properties:
1727 - clock-frequency : Frequency of the clock input
1728 - reg-offset : A value of 3 is required
1729 - reg-shift : A value of 2 is required
1730
1731 e) USB EHCI controllers
1732
1733 Required properties:
1734 - compatible : should be "usb-ehci".
1735 - reg : should contain at least address and length of the standard EHCI
1736 register set for the device. Optional platform-dependent registers
1737 (debug-port or other) can be also specified here, but only after
1738 definition of standard EHCI registers.
1739 - interrupts : one EHCI interrupt should be described here.
1740 If device registers are implemented in big endian mode, the device
1741 node should have "big-endian-regs" property.
1742 If controller implementation operates with big endian descriptors,
1743 "big-endian-desc" property should be specified.
1744 If both big endian registers and descriptors are used by the controller
1745 implementation, "big-endian" property can be specified instead of having
1746 both "big-endian-regs" and "big-endian-desc".
1747
1748 Example (Sequoia 440EPx):
1749 ehci@e0000300 {
1750 compatible = "ibm,usb-ehci-440epx", "usb-ehci";
1751 interrupt-parent = <&UIC0>;
1752 interrupts = <1a 4>;
1753 reg = <0 e0000300 90 0 e0000390 70>;
1754 big-endian;
1755 };
1756
1757 f) MDIO on GPIOs
1758
1759 Currently defined compatibles:
1760 - virtual,gpio-mdio
1761
1762 MDC and MDIO lines connected to GPIO controllers are listed in the
1763 gpios property as described in section VIII.1 in the following order:
1764
1765 MDC, MDIO.
1766
1767 Example:
1768
1769 mdio {
1770 compatible = "virtual,mdio-gpio";
1771 #address-cells = <1>;
1772 #size-cells = <0>;
1773 gpios = <&qe_pio_a 11
1774 &qe_pio_c 6>;
1775 };
1776
1777 g) SPI (Serial Peripheral Interface) busses
1778
1779 SPI busses can be described with a node for the SPI master device
1780 and a set of child nodes for each SPI slave on the bus. For this
1781 discussion, it is assumed that the system's SPI controller is in
1782 SPI master mode. This binding does not describe SPI controllers
1783 in slave mode.
1784
1785 The SPI master node requires the following properties:
1786 - #address-cells - number of cells required to define a chip select
1787 address on the SPI bus.
1788 - #size-cells - should be zero.
1789 - compatible - name of SPI bus controller following generic names
1790 recommended practice.
1791 No other properties are required in the SPI bus node. It is assumed
1792 that a driver for an SPI bus device will understand that it is an SPI bus.
1793 However, the binding does not attempt to define the specific method for
1794 assigning chip select numbers. Since SPI chip select configuration is
1795 flexible and non-standardized, it is left out of this binding with the
1796 assumption that board specific platform code will be used to manage
1797 chip selects. Individual drivers can define additional properties to
1798 support describing the chip select layout.
1799
1800 SPI slave nodes must be children of the SPI master node and can
1801 contain the following properties.
1802 - reg - (required) chip select address of device.
1803 - compatible - (required) name of SPI device following generic names
1804 recommended practice
1805 - spi-max-frequency - (required) Maximum SPI clocking speed of device in Hz
1806 - spi-cpol - (optional) Empty property indicating device requires
1807 inverse clock polarity (CPOL) mode
1808 - spi-cpha - (optional) Empty property indicating device requires
1809 shifted clock phase (CPHA) mode
1810 - spi-cs-high - (optional) Empty property indicating device requires
1811 chip select active high
1812
1813 SPI example for an MPC5200 SPI bus:
1814 spi@f00 {
1815 #address-cells = <1>;
1816 #size-cells = <0>;
1817 compatible = "fsl,mpc5200b-spi","fsl,mpc5200-spi";
1818 reg = <0xf00 0x20>;
1819 interrupts = <2 13 0 2 14 0>;
1820 interrupt-parent = <&mpc5200_pic>;
1821
1822 ethernet-switch@0 {
1823 compatible = "micrel,ks8995m";
1824 spi-max-frequency = <1000000>;
1825 reg = <0>;
1826 };
1827
1828 codec@1 {
1829 compatible = "ti,tlv320aic26";
1830 spi-max-frequency = <100000>;
1831 reg = <1>;
1832 };
1833 };
1834
1835VII - Marvell Discovery mv64[345]6x System Controller chips
1836===========================================================
1837
1838The Marvell mv64[345]60 series of system controller chips contain
1839many of the peripherals needed to implement a complete computer
1840system. In this section, we define device tree nodes to describe
1841the system controller chip itself and each of the peripherals
1842which it contains. Compatible string values for each node are
1843prefixed with the string "marvell,", for Marvell Technology Group Ltd.
1844
18451) The /system-controller node
1846
1847 This node is used to represent the system-controller and must be
1848 present when the system uses a system controller chip. The top-level
1849 system-controller node contains information that is global to all
1850 devices within the system controller chip. The node name begins
1851 with "system-controller" followed by the unit address, which is
1852 the base address of the memory-mapped register set for the system
1853 controller chip.
1854
1855 Required properties:
1856
1857 - ranges : Describes the translation of system controller addresses
1858 for memory mapped registers.
1859 - clock-frequency: Contains the main clock frequency for the system
1860 controller chip.
1861 - reg : This property defines the address and size of the
1862 memory-mapped registers contained within the system controller
1863 chip. The address specified in the "reg" property should match
1864 the unit address of the system-controller node.
1865 - #address-cells : Address representation for system controller
1866 devices. This field represents the number of cells needed to
1867 represent the address of the memory-mapped registers of devices
1868 within the system controller chip.
1869 - #size-cells : Size representation for for the memory-mapped
1870 registers within the system controller chip.
1871 - #interrupt-cells : Defines the width of cells used to represent
1872 interrupts.
1873
1874 Optional properties:
1875
1876 - model : The specific model of the system controller chip. Such
1877 as, "mv64360", "mv64460", or "mv64560".
1878 - compatible : A string identifying the compatibility identifiers
1879 of the system controller chip.
1880
1881 The system-controller node contains child nodes for each system
1882 controller device that the platform uses. Nodes should not be created
1883 for devices which exist on the system controller chip but are not used
1884
1885 Example Marvell Discovery mv64360 system-controller node:
1886
1887 system-controller@f1000000 { /* Marvell Discovery mv64360 */
1888 #address-cells = <1>;
1889 #size-cells = <1>;
1890 model = "mv64360"; /* Default */
1891 compatible = "marvell,mv64360";
1892 clock-frequency = <133333333>;
1893 reg = <0xf1000000 0x10000>;
1894 virtual-reg = <0xf1000000>;
1895 ranges = <0x88000000 0x88000000 0x1000000 /* PCI 0 I/O Space */
1896 0x80000000 0x80000000 0x8000000 /* PCI 0 MEM Space */
1897 0xa0000000 0xa0000000 0x4000000 /* User FLASH */
1898 0x00000000 0xf1000000 0x0010000 /* Bridge's regs */
1899 0xf2000000 0xf2000000 0x0040000>;/* Integrated SRAM */
1900
1901 [ child node definitions... ]
1902 }
1903
19042) Child nodes of /system-controller
1905
1906 a) Marvell Discovery MDIO bus
1907
1908 The MDIO is a bus to which the PHY devices are connected. For each
1909 device that exists on this bus, a child node should be created. See
1910 the definition of the PHY node below for an example of how to define
1911 a PHY.
1912
1913 Required properties:
1914 - #address-cells : Should be <1>
1915 - #size-cells : Should be <0>
1916 - device_type : Should be "mdio"
1917 - compatible : Should be "marvell,mv64360-mdio"
1918
1919 Example:
1920
1921 mdio {
1922 #address-cells = <1>;
1923 #size-cells = <0>;
1924 device_type = "mdio";
1925 compatible = "marvell,mv64360-mdio";
1926
1927 ethernet-phy@0 {
1928 ......
1929 };
1930 };
1931
1932
1933 b) Marvell Discovery ethernet controller
1934
1935 The Discover ethernet controller is described with two levels
1936 of nodes. The first level describes an ethernet silicon block
1937 and the second level describes up to 3 ethernet nodes within
1938 that block. The reason for the multiple levels is that the
1939 registers for the node are interleaved within a single set
1940 of registers. The "ethernet-block" level describes the
1941 shared register set, and the "ethernet" nodes describe ethernet
1942 port-specific properties.
1943
1944 Ethernet block node
1945
1946 Required properties:
1947 - #address-cells : <1>
1948 - #size-cells : <0>
1949 - compatible : "marvell,mv64360-eth-block"
1950 - reg : Offset and length of the register set for this block
1951
1952 Example Discovery Ethernet block node:
1953 ethernet-block@2000 {
1954 #address-cells = <1>;
1955 #size-cells = <0>;
1956 compatible = "marvell,mv64360-eth-block";
1957 reg = <0x2000 0x2000>;
1958 ethernet@0 {
1959 .......
1960 };
1961 };
1962
1963 Ethernet port node
1964
1965 Required properties:
1966 - device_type : Should be "network".
1967 - compatible : Should be "marvell,mv64360-eth".
1968 - reg : Should be <0>, <1>, or <2>, according to which registers
1969 within the silicon block the device uses.
1970 - interrupts : <a> where a is the interrupt number for the port.
1971 - interrupt-parent : the phandle for the interrupt controller
1972 that services interrupts for this device.
1973 - phy : the phandle for the PHY connected to this ethernet
1974 controller.
1975 - local-mac-address : 6 bytes, MAC address
1976
1977 Example Discovery Ethernet port node:
1978 ethernet@0 {
1979 device_type = "network";
1980 compatible = "marvell,mv64360-eth";
1981 reg = <0>;
1982 interrupts = <32>;
1983 interrupt-parent = <&PIC>;
1984 phy = <&PHY0>;
1985 local-mac-address = [ 00 00 00 00 00 00 ];
1986 };
1987
1988
1989
1990 c) Marvell Discovery PHY nodes
1991
1992 Required properties:
1993 - device_type : Should be "ethernet-phy"
1994 - interrupts : <a> where a is the interrupt number for this phy.
1995 - interrupt-parent : the phandle for the interrupt controller that
1996 services interrupts for this device.
1997 - reg : The ID number for the phy, usually a small integer
1998
1999 Example Discovery PHY node:
2000 ethernet-phy@1 {
2001 device_type = "ethernet-phy";
2002 compatible = "broadcom,bcm5421";
2003 interrupts = <76>; /* GPP 12 */
2004 interrupt-parent = <&PIC>;
2005 reg = <1>;
2006 };
2007
2008
2009 d) Marvell Discovery SDMA nodes
2010
2011 Represent DMA hardware associated with the MPSC (multiprotocol
2012 serial controllers).
2013
2014 Required properties:
2015 - compatible : "marvell,mv64360-sdma"
2016 - reg : Offset and length of the register set for this device
2017 - interrupts : <a> where a is the interrupt number for the DMA
2018 device.
2019 - interrupt-parent : the phandle for the interrupt controller
2020 that services interrupts for this device.
2021
2022 Example Discovery SDMA node:
2023 sdma@4000 {
2024 compatible = "marvell,mv64360-sdma";
2025 reg = <0x4000 0xc18>;
2026 virtual-reg = <0xf1004000>;
2027 interrupts = <36>;
2028 interrupt-parent = <&PIC>;
2029 };
2030
2031
2032 e) Marvell Discovery BRG nodes
2033
2034 Represent baud rate generator hardware associated with the MPSC
2035 (multiprotocol serial controllers).
2036
2037 Required properties:
2038 - compatible : "marvell,mv64360-brg"
2039 - reg : Offset and length of the register set for this device
2040 - clock-src : A value from 0 to 15 which selects the clock
2041 source for the baud rate generator. This value corresponds
2042 to the CLKS value in the BRGx configuration register. See
2043 the mv64x60 User's Manual.
2044 - clock-frequence : The frequency (in Hz) of the baud rate
2045 generator's input clock.
2046 - current-speed : The current speed setting (presumably by
2047 firmware) of the baud rate generator.
2048
2049 Example Discovery BRG node:
2050 brg@b200 {
2051 compatible = "marvell,mv64360-brg";
2052 reg = <0xb200 0x8>;
2053 clock-src = <8>;
2054 clock-frequency = <133333333>;
2055 current-speed = <9600>;
2056 };
2057
2058
2059 f) Marvell Discovery CUNIT nodes
2060
2061 Represent the Serial Communications Unit device hardware.
2062
2063 Required properties:
2064 - reg : Offset and length of the register set for this device
2065
2066 Example Discovery CUNIT node:
2067 cunit@f200 {
2068 reg = <0xf200 0x200>;
2069 };
2070
2071
2072 g) Marvell Discovery MPSCROUTING nodes
2073
2074 Represent the Discovery's MPSC routing hardware
2075
2076 Required properties:
2077 - reg : Offset and length of the register set for this device
2078
2079 Example Discovery CUNIT node:
2080 mpscrouting@b500 {
2081 reg = <0xb400 0xc>;
2082 };
2083
2084
2085 h) Marvell Discovery MPSCINTR nodes
2086
2087 Represent the Discovery's MPSC DMA interrupt hardware registers
2088 (SDMA cause and mask registers).
2089
2090 Required properties:
2091 - reg : Offset and length of the register set for this device
2092
2093 Example Discovery MPSCINTR node:
2094 mpsintr@b800 {
2095 reg = <0xb800 0x100>;
2096 };
2097
2098
2099 i) Marvell Discovery MPSC nodes
2100
2101 Represent the Discovery's MPSC (Multiprotocol Serial Controller)
2102 serial port.
2103
2104 Required properties:
2105 - device_type : "serial"
2106 - compatible : "marvell,mv64360-mpsc"
2107 - reg : Offset and length of the register set for this device
2108 - sdma : the phandle for the SDMA node used by this port
2109 - brg : the phandle for the BRG node used by this port
2110 - cunit : the phandle for the CUNIT node used by this port
2111 - mpscrouting : the phandle for the MPSCROUTING node used by this port
2112 - mpscintr : the phandle for the MPSCINTR node used by this port
2113 - cell-index : the hardware index of this cell in the MPSC core
2114 - max_idle : value needed for MPSC CHR3 (Maximum Frame Length)
2115 register
2116 - interrupts : <a> where a is the interrupt number for the MPSC.
2117 - interrupt-parent : the phandle for the interrupt controller
2118 that services interrupts for this device.
2119
2120 Example Discovery MPSCINTR node:
2121 mpsc@8000 {
2122 device_type = "serial";
2123 compatible = "marvell,mv64360-mpsc";
2124 reg = <0x8000 0x38>;
2125 virtual-reg = <0xf1008000>;
2126 sdma = <&SDMA0>;
2127 brg = <&BRG0>;
2128 cunit = <&CUNIT>;
2129 mpscrouting = <&MPSCROUTING>;
2130 mpscintr = <&MPSCINTR>;
2131 cell-index = <0>;
2132 max_idle = <40>;
2133 interrupts = <40>;
2134 interrupt-parent = <&PIC>;
2135 };
2136
2137
2138 j) Marvell Discovery Watch Dog Timer nodes
2139
2140 Represent the Discovery's watchdog timer hardware
2141
2142 Required properties:
2143 - compatible : "marvell,mv64360-wdt"
2144 - reg : Offset and length of the register set for this device
2145
2146 Example Discovery Watch Dog Timer node:
2147 wdt@b410 {
2148 compatible = "marvell,mv64360-wdt";
2149 reg = <0xb410 0x8>;
2150 };
2151
2152
2153 k) Marvell Discovery I2C nodes
2154
2155 Represent the Discovery's I2C hardware
2156
2157 Required properties:
2158 - device_type : "i2c"
2159 - compatible : "marvell,mv64360-i2c"
2160 - reg : Offset and length of the register set for this device
2161 - interrupts : <a> where a is the interrupt number for the I2C.
2162 - interrupt-parent : the phandle for the interrupt controller
2163 that services interrupts for this device.
2164
2165 Example Discovery I2C node:
2166 compatible = "marvell,mv64360-i2c";
2167 reg = <0xc000 0x20>;
2168 virtual-reg = <0xf100c000>;
2169 interrupts = <37>;
2170 interrupt-parent = <&PIC>;
2171 };
2172
2173
2174 l) Marvell Discovery PIC (Programmable Interrupt Controller) nodes
2175
2176 Represent the Discovery's PIC hardware
2177
2178 Required properties:
2179 - #interrupt-cells : <1>
2180 - #address-cells : <0>
2181 - compatible : "marvell,mv64360-pic"
2182 - reg : Offset and length of the register set for this device
2183 - interrupt-controller
2184
2185 Example Discovery PIC node:
2186 pic {
2187 #interrupt-cells = <1>;
2188 #address-cells = <0>;
2189 compatible = "marvell,mv64360-pic";
2190 reg = <0x0 0x88>;
2191 interrupt-controller;
2192 };
2193
2194
2195 m) Marvell Discovery MPP (Multipurpose Pins) multiplexing nodes
2196
2197 Represent the Discovery's MPP hardware
2198
2199 Required properties:
2200 - compatible : "marvell,mv64360-mpp"
2201 - reg : Offset and length of the register set for this device
2202
2203 Example Discovery MPP node:
2204 mpp@f000 {
2205 compatible = "marvell,mv64360-mpp";
2206 reg = <0xf000 0x10>;
2207 };
2208
2209
2210 n) Marvell Discovery GPP (General Purpose Pins) nodes
2211
2212 Represent the Discovery's GPP hardware
2213
2214 Required properties:
2215 - compatible : "marvell,mv64360-gpp"
2216 - reg : Offset and length of the register set for this device
2217
2218 Example Discovery GPP node:
2219 gpp@f000 {
2220 compatible = "marvell,mv64360-gpp";
2221 reg = <0xf100 0x20>;
2222 };
2223
2224
2225 o) Marvell Discovery PCI host bridge node
2226
2227 Represents the Discovery's PCI host bridge device. The properties
2228 for this node conform to Rev 2.1 of the PCI Bus Binding to IEEE
2229 1275-1994. A typical value for the compatible property is
2230 "marvell,mv64360-pci".
2231
2232 Example Discovery PCI host bridge node
2233 pci@80000000 {
2234 #address-cells = <3>;
2235 #size-cells = <2>;
2236 #interrupt-cells = <1>;
2237 device_type = "pci";
2238 compatible = "marvell,mv64360-pci";
2239 reg = <0xcf8 0x8>;
2240 ranges = <0x01000000 0x0 0x0
2241 0x88000000 0x0 0x01000000
2242 0x02000000 0x0 0x80000000
2243 0x80000000 0x0 0x08000000>;
2244 bus-range = <0 255>;
2245 clock-frequency = <66000000>;
2246 interrupt-parent = <&PIC>;
2247 interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
2248 interrupt-map = <
2249 /* IDSEL 0x0a */
2250 0x5000 0 0 1 &PIC 80
2251 0x5000 0 0 2 &PIC 81
2252 0x5000 0 0 3 &PIC 91
2253 0x5000 0 0 4 &PIC 93
2254
2255 /* IDSEL 0x0b */
2256 0x5800 0 0 1 &PIC 91
2257 0x5800 0 0 2 &PIC 93
2258 0x5800 0 0 3 &PIC 80
2259 0x5800 0 0 4 &PIC 81
2260
2261 /* IDSEL 0x0c */
2262 0x6000 0 0 1 &PIC 91
2263 0x6000 0 0 2 &PIC 93
2264 0x6000 0 0 3 &PIC 80
2265 0x6000 0 0 4 &PIC 81
2266
2267 /* IDSEL 0x0d */
2268 0x6800 0 0 1 &PIC 93
2269 0x6800 0 0 2 &PIC 80
2270 0x6800 0 0 3 &PIC 81
2271 0x6800 0 0 4 &PIC 91
2272 >;
2273 };
2274
2275
2276 p) Marvell Discovery CPU Error nodes
2277
2278 Represent the Discovery's CPU error handler device.
2279
2280 Required properties:
2281 - compatible : "marvell,mv64360-cpu-error"
2282 - reg : Offset and length of the register set for this device
2283 - interrupts : the interrupt number for this device
2284 - interrupt-parent : the phandle for the interrupt controller
2285 that services interrupts for this device.
2286
2287 Example Discovery CPU Error node:
2288 cpu-error@0070 {
2289 compatible = "marvell,mv64360-cpu-error";
2290 reg = <0x70 0x10 0x128 0x28>;
2291 interrupts = <3>;
2292 interrupt-parent = <&PIC>;
2293 };
2294
2295
2296 q) Marvell Discovery SRAM Controller nodes
2297
2298 Represent the Discovery's SRAM controller device.
2299
2300 Required properties:
2301 - compatible : "marvell,mv64360-sram-ctrl"
2302 - reg : Offset and length of the register set for this device
2303 - interrupts : the interrupt number for this device
2304 - interrupt-parent : the phandle for the interrupt controller
2305 that services interrupts for this device.
2306
2307 Example Discovery SRAM Controller node:
2308 sram-ctrl@0380 {
2309 compatible = "marvell,mv64360-sram-ctrl";
2310 reg = <0x380 0x80>;
2311 interrupts = <13>;
2312 interrupt-parent = <&PIC>;
2313 };
2314
2315
2316 r) Marvell Discovery PCI Error Handler nodes
2317
2318 Represent the Discovery's PCI error handler device.
2319
2320 Required properties:
2321 - compatible : "marvell,mv64360-pci-error"
2322 - reg : Offset and length of the register set for this device
2323 - interrupts : the interrupt number for this device
2324 - interrupt-parent : the phandle for the interrupt controller
2325 that services interrupts for this device.
2326
2327 Example Discovery PCI Error Handler node:
2328 pci-error@1d40 {
2329 compatible = "marvell,mv64360-pci-error";
2330 reg = <0x1d40 0x40 0xc28 0x4>;
2331 interrupts = <12>;
2332 interrupt-parent = <&PIC>;
2333 };
2334
2335
2336 s) Marvell Discovery Memory Controller nodes
2337
2338 Represent the Discovery's memory controller device.
2339
2340 Required properties:
2341 - compatible : "marvell,mv64360-mem-ctrl"
2342 - reg : Offset and length of the register set for this device
2343 - interrupts : the interrupt number for this device
2344 - interrupt-parent : the phandle for the interrupt controller
2345 that services interrupts for this device.
2346
2347 Example Discovery Memory Controller node:
2348 mem-ctrl@1400 {
2349 compatible = "marvell,mv64360-mem-ctrl";
2350 reg = <0x1400 0x60>;
2351 interrupts = <17>;
2352 interrupt-parent = <&PIC>;
2353 };
2354
2355
2356VIII - Specifying interrupt information for devices
2357=================================================== 1242===================================================
2358 1243
2359The device tree represents the busses and devices of a hardware 1244The device tree represents the busses and devices of a hardware
@@ -2439,56 +1324,7 @@ encodings listed below:
2439 2 = high to low edge sensitive type enabled 1324 2 = high to low edge sensitive type enabled
2440 3 = low to high edge sensitive type enabled 1325 3 = low to high edge sensitive type enabled
2441 1326
2442IX - Specifying GPIO information for devices 1327VIII - Specifying Device Power Management Information (sleep property)
2443============================================
2444
24451) gpios property
2446-----------------
2447
2448Nodes that makes use of GPIOs should define them using `gpios' property,
2449format of which is: <&gpio-controller1-phandle gpio1-specifier
2450 &gpio-controller2-phandle gpio2-specifier
2451 0 /* holes are permitted, means no GPIO 3 */
2452 &gpio-controller4-phandle gpio4-specifier
2453 ...>;
2454
2455Note that gpio-specifier length is controller dependent.
2456
2457gpio-specifier may encode: bank, pin position inside the bank,
2458whether pin is open-drain and whether pin is logically inverted.
2459
2460Example of the node using GPIOs:
2461
2462 node {
2463 gpios = <&qe_pio_e 18 0>;
2464 };
2465
2466In this example gpio-specifier is "18 0" and encodes GPIO pin number,
2467and empty GPIO flags as accepted by the "qe_pio_e" gpio-controller.
2468
24692) gpio-controller nodes
2470------------------------
2471
2472Every GPIO controller node must have #gpio-cells property defined,
2473this information will be used to translate gpio-specifiers.
2474
2475Example of two SOC GPIO banks defined as gpio-controller nodes:
2476
2477 qe_pio_a: gpio-controller@1400 {
2478 #gpio-cells = <2>;
2479 compatible = "fsl,qe-pario-bank-a", "fsl,qe-pario-bank";
2480 reg = <0x1400 0x18>;
2481 gpio-controller;
2482 };
2483
2484 qe_pio_e: gpio-controller@1460 {
2485 #gpio-cells = <2>;
2486 compatible = "fsl,qe-pario-bank-e", "fsl,qe-pario-bank";
2487 reg = <0x1460 0x18>;
2488 gpio-controller;
2489 };
2490
2491X - Specifying Device Power Management Information (sleep property)
2492=================================================================== 1328===================================================================
2493 1329
2494Devices on SOCs often have mechanisms for placing devices into low-power 1330Devices on SOCs often have mechanisms for placing devices into low-power
diff --git a/Documentation/powerpc/dts-bindings/4xx/emac.txt b/Documentation/powerpc/dts-bindings/4xx/emac.txt
new file mode 100644
index 000000000000..2161334a7ca5
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/4xx/emac.txt
@@ -0,0 +1,148 @@
1 4xx/Axon EMAC ethernet nodes
2
3 The EMAC ethernet controller in IBM and AMCC 4xx chips, and also
4 the Axon bridge. To operate this needs to interact with a ths
5 special McMAL DMA controller, and sometimes an RGMII or ZMII
6 interface. In addition to the nodes and properties described
7 below, the node for the OPB bus on which the EMAC sits must have a
8 correct clock-frequency property.
9
10 i) The EMAC node itself
11
12 Required properties:
13 - device_type : "network"
14
15 - compatible : compatible list, contains 2 entries, first is
16 "ibm,emac-CHIP" where CHIP is the host ASIC (440gx,
17 405gp, Axon) and second is either "ibm,emac" or
18 "ibm,emac4". For Axon, thus, we have: "ibm,emac-axon",
19 "ibm,emac4"
20 - interrupts : <interrupt mapping for EMAC IRQ and WOL IRQ>
21 - interrupt-parent : optional, if needed for interrupt mapping
22 - reg : <registers mapping>
23 - local-mac-address : 6 bytes, MAC address
24 - mal-device : phandle of the associated McMAL node
25 - mal-tx-channel : 1 cell, index of the tx channel on McMAL associated
26 with this EMAC
27 - mal-rx-channel : 1 cell, index of the rx channel on McMAL associated
28 with this EMAC
29 - cell-index : 1 cell, hardware index of the EMAC cell on a given
30 ASIC (typically 0x0 and 0x1 for EMAC0 and EMAC1 on
31 each Axon chip)
32 - max-frame-size : 1 cell, maximum frame size supported in bytes
33 - rx-fifo-size : 1 cell, Rx fifo size in bytes for 10 and 100 Mb/sec
34 operations.
35 For Axon, 2048
36 - tx-fifo-size : 1 cell, Tx fifo size in bytes for 10 and 100 Mb/sec
37 operations.
38 For Axon, 2048.
39 - fifo-entry-size : 1 cell, size of a fifo entry (used to calculate
40 thresholds).
41 For Axon, 0x00000010
42 - mal-burst-size : 1 cell, MAL burst size (used to calculate thresholds)
43 in bytes.
44 For Axon, 0x00000100 (I think ...)
45 - phy-mode : string, mode of operations of the PHY interface.
46 Supported values are: "mii", "rmii", "smii", "rgmii",
47 "tbi", "gmii", rtbi", "sgmii".
48 For Axon on CAB, it is "rgmii"
49 - mdio-device : 1 cell, required iff using shared MDIO registers
50 (440EP). phandle of the EMAC to use to drive the
51 MDIO lines for the PHY used by this EMAC.
52 - zmii-device : 1 cell, required iff connected to a ZMII. phandle of
53 the ZMII device node
54 - zmii-channel : 1 cell, required iff connected to a ZMII. Which ZMII
55 channel or 0xffffffff if ZMII is only used for MDIO.
56 - rgmii-device : 1 cell, required iff connected to an RGMII. phandle
57 of the RGMII device node.
58 For Axon: phandle of plb5/plb4/opb/rgmii
59 - rgmii-channel : 1 cell, required iff connected to an RGMII. Which
60 RGMII channel is used by this EMAC.
61 Fox Axon: present, whatever value is appropriate for each
62 EMAC, that is the content of the current (bogus) "phy-port"
63 property.
64
65 Optional properties:
66 - phy-address : 1 cell, optional, MDIO address of the PHY. If absent,
67 a search is performed.
68 - phy-map : 1 cell, optional, bitmap of addresses to probe the PHY
69 for, used if phy-address is absent. bit 0x00000001 is
70 MDIO address 0.
71 For Axon it can be absent, though my current driver
72 doesn't handle phy-address yet so for now, keep
73 0x00ffffff in it.
74 - rx-fifo-size-gige : 1 cell, Rx fifo size in bytes for 1000 Mb/sec
75 operations (if absent the value is the same as
76 rx-fifo-size). For Axon, either absent or 2048.
77 - tx-fifo-size-gige : 1 cell, Tx fifo size in bytes for 1000 Mb/sec
78 operations (if absent the value is the same as
79 tx-fifo-size). For Axon, either absent or 2048.
80 - tah-device : 1 cell, optional. If connected to a TAH engine for
81 offload, phandle of the TAH device node.
82 - tah-channel : 1 cell, optional. If appropriate, channel used on the
83 TAH engine.
84
85 Example:
86
87 EMAC0: ethernet@40000800 {
88 device_type = "network";
89 compatible = "ibm,emac-440gp", "ibm,emac";
90 interrupt-parent = <&UIC1>;
91 interrupts = <1c 4 1d 4>;
92 reg = <40000800 70>;
93 local-mac-address = [00 04 AC E3 1B 1E];
94 mal-device = <&MAL0>;
95 mal-tx-channel = <0 1>;
96 mal-rx-channel = <0>;
97 cell-index = <0>;
98 max-frame-size = <5dc>;
99 rx-fifo-size = <1000>;
100 tx-fifo-size = <800>;
101 phy-mode = "rmii";
102 phy-map = <00000001>;
103 zmii-device = <&ZMII0>;
104 zmii-channel = <0>;
105 };
106
107 ii) McMAL node
108
109 Required properties:
110 - device_type : "dma-controller"
111 - compatible : compatible list, containing 2 entries, first is
112 "ibm,mcmal-CHIP" where CHIP is the host ASIC (like
113 emac) and the second is either "ibm,mcmal" or
114 "ibm,mcmal2".
115 For Axon, "ibm,mcmal-axon","ibm,mcmal2"
116 - interrupts : <interrupt mapping for the MAL interrupts sources:
117 5 sources: tx_eob, rx_eob, serr, txde, rxde>.
118 For Axon: This is _different_ from the current
119 firmware. We use the "delayed" interrupts for txeob
120 and rxeob. Thus we end up with mapping those 5 MPIC
121 interrupts, all level positive sensitive: 10, 11, 32,
122 33, 34 (in decimal)
123 - dcr-reg : < DCR registers range >
124 - dcr-parent : if needed for dcr-reg
125 - num-tx-chans : 1 cell, number of Tx channels
126 - num-rx-chans : 1 cell, number of Rx channels
127
128 iii) ZMII node
129
130 Required properties:
131 - compatible : compatible list, containing 2 entries, first is
132 "ibm,zmii-CHIP" where CHIP is the host ASIC (like
133 EMAC) and the second is "ibm,zmii".
134 For Axon, there is no ZMII node.
135 - reg : <registers mapping>
136
137 iv) RGMII node
138
139 Required properties:
140 - compatible : compatible list, containing 2 entries, first is
141 "ibm,rgmii-CHIP" where CHIP is the host ASIC (like
142 EMAC) and the second is "ibm,rgmii".
143 For Axon, "ibm,rgmii-axon","ibm,rgmii"
144 - reg : <registers mapping>
145 - revision : as provided by the RGMII new version register if
146 available.
147 For Axon: 0x0000012a
148
diff --git a/Documentation/powerpc/dts-bindings/fsl/esdhc.txt b/Documentation/powerpc/dts-bindings/fsl/esdhc.txt
index 5093ddf900da..3ed3797b5086 100644
--- a/Documentation/powerpc/dts-bindings/fsl/esdhc.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/esdhc.txt
@@ -10,6 +10,8 @@ Required properties:
10 - interrupts : should contain eSDHC interrupt. 10 - interrupts : should contain eSDHC interrupt.
11 - interrupt-parent : interrupt source phandle. 11 - interrupt-parent : interrupt source phandle.
12 - clock-frequency : specifies eSDHC base clock frequency. 12 - clock-frequency : specifies eSDHC base clock frequency.
13 - sdhci,1-bit-only : (optional) specifies that a controller can
14 only handle 1-bit data transfers.
13 15
14Example: 16Example:
15 17
diff --git a/Documentation/powerpc/dts-bindings/gpio/gpio.txt b/Documentation/powerpc/dts-bindings/gpio/gpio.txt
new file mode 100644
index 000000000000..edaa84d288a1
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/gpio/gpio.txt
@@ -0,0 +1,50 @@
1Specifying GPIO information for devices
2============================================
3
41) gpios property
5-----------------
6
7Nodes that makes use of GPIOs should define them using `gpios' property,
8format of which is: <&gpio-controller1-phandle gpio1-specifier
9 &gpio-controller2-phandle gpio2-specifier
10 0 /* holes are permitted, means no GPIO 3 */
11 &gpio-controller4-phandle gpio4-specifier
12 ...>;
13
14Note that gpio-specifier length is controller dependent.
15
16gpio-specifier may encode: bank, pin position inside the bank,
17whether pin is open-drain and whether pin is logically inverted.
18
19Example of the node using GPIOs:
20
21 node {
22 gpios = <&qe_pio_e 18 0>;
23 };
24
25In this example gpio-specifier is "18 0" and encodes GPIO pin number,
26and empty GPIO flags as accepted by the "qe_pio_e" gpio-controller.
27
282) gpio-controller nodes
29------------------------
30
31Every GPIO controller node must have #gpio-cells property defined,
32this information will be used to translate gpio-specifiers.
33
34Example of two SOC GPIO banks defined as gpio-controller nodes:
35
36 qe_pio_a: gpio-controller@1400 {
37 #gpio-cells = <2>;
38 compatible = "fsl,qe-pario-bank-a", "fsl,qe-pario-bank";
39 reg = <0x1400 0x18>;
40 gpio-controller;
41 };
42
43 qe_pio_e: gpio-controller@1460 {
44 #gpio-cells = <2>;
45 compatible = "fsl,qe-pario-bank-e", "fsl,qe-pario-bank";
46 reg = <0x1460 0x18>;
47 gpio-controller;
48 };
49
50
diff --git a/Documentation/powerpc/dts-bindings/gpio/led.txt b/Documentation/powerpc/dts-bindings/gpio/led.txt
index 4fe14deedc0a..064db928c3c1 100644
--- a/Documentation/powerpc/dts-bindings/gpio/led.txt
+++ b/Documentation/powerpc/dts-bindings/gpio/led.txt
@@ -16,10 +16,17 @@ LED sub-node properties:
16 string defining the trigger assigned to the LED. Current triggers are: 16 string defining the trigger assigned to the LED. Current triggers are:
17 "backlight" - LED will act as a back-light, controlled by the framebuffer 17 "backlight" - LED will act as a back-light, controlled by the framebuffer
18 system 18 system
19 "default-on" - LED will turn on 19 "default-on" - LED will turn on, but see "default-state" below
20 "heartbeat" - LED "double" flashes at a load average based rate 20 "heartbeat" - LED "double" flashes at a load average based rate
21 "ide-disk" - LED indicates disk activity 21 "ide-disk" - LED indicates disk activity
22 "timer" - LED flashes at a fixed, configurable rate 22 "timer" - LED flashes at a fixed, configurable rate
23- default-state: (optional) The initial state of the LED. Valid
24 values are "on", "off", and "keep". If the LED is already on or off
25 and the default-state property is set the to same value, then no
26 glitch should be produced where the LED momentarily turns off (or
27 on). The "keep" setting will keep the LED at whatever its current
28 state is, without producing a glitch. The default is off if this
29 property is not present.
23 30
24Examples: 31Examples:
25 32
@@ -30,14 +37,22 @@ leds {
30 gpios = <&mcu_pio 0 1>; /* Active low */ 37 gpios = <&mcu_pio 0 1>; /* Active low */
31 linux,default-trigger = "ide-disk"; 38 linux,default-trigger = "ide-disk";
32 }; 39 };
40
41 fault {
42 gpios = <&mcu_pio 1 0>;
43 /* Keep LED on if BIOS detected hardware fault */
44 default-state = "keep";
45 };
33}; 46};
34 47
35run-control { 48run-control {
36 compatible = "gpio-leds"; 49 compatible = "gpio-leds";
37 red { 50 red {
38 gpios = <&mpc8572 6 0>; 51 gpios = <&mpc8572 6 0>;
52 default-state = "off";
39 }; 53 };
40 green { 54 green {
41 gpios = <&mpc8572 7 0>; 55 gpios = <&mpc8572 7 0>;
56 default-state = "on";
42 }; 57 };
43} 58}
diff --git a/Documentation/powerpc/dts-bindings/gpio/mdio.txt b/Documentation/powerpc/dts-bindings/gpio/mdio.txt
new file mode 100644
index 000000000000..bc9549529014
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/gpio/mdio.txt
@@ -0,0 +1,19 @@
1MDIO on GPIOs
2
3Currently defined compatibles:
4- virtual,gpio-mdio
5
6MDC and MDIO lines connected to GPIO controllers are listed in the
7gpios property as described in section VIII.1 in the following order:
8
9MDC, MDIO.
10
11Example:
12
13mdio {
14 compatible = "virtual,mdio-gpio";
15 #address-cells = <1>;
16 #size-cells = <0>;
17 gpios = <&qe_pio_a 11
18 &qe_pio_c 6>;
19};
diff --git a/Documentation/powerpc/dts-bindings/marvell.txt b/Documentation/powerpc/dts-bindings/marvell.txt
new file mode 100644
index 000000000000..3708a2fd4747
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/marvell.txt
@@ -0,0 +1,521 @@
1Marvell Discovery mv64[345]6x System Controller chips
2===========================================================
3
4The Marvell mv64[345]60 series of system controller chips contain
5many of the peripherals needed to implement a complete computer
6system. In this section, we define device tree nodes to describe
7the system controller chip itself and each of the peripherals
8which it contains. Compatible string values for each node are
9prefixed with the string "marvell,", for Marvell Technology Group Ltd.
10
111) The /system-controller node
12
13 This node is used to represent the system-controller and must be
14 present when the system uses a system controller chip. The top-level
15 system-controller node contains information that is global to all
16 devices within the system controller chip. The node name begins
17 with "system-controller" followed by the unit address, which is
18 the base address of the memory-mapped register set for the system
19 controller chip.
20
21 Required properties:
22
23 - ranges : Describes the translation of system controller addresses
24 for memory mapped registers.
25 - clock-frequency: Contains the main clock frequency for the system
26 controller chip.
27 - reg : This property defines the address and size of the
28 memory-mapped registers contained within the system controller
29 chip. The address specified in the "reg" property should match
30 the unit address of the system-controller node.
31 - #address-cells : Address representation for system controller
32 devices. This field represents the number of cells needed to
33 represent the address of the memory-mapped registers of devices
34 within the system controller chip.
35 - #size-cells : Size representation for for the memory-mapped
36 registers within the system controller chip.
37 - #interrupt-cells : Defines the width of cells used to represent
38 interrupts.
39
40 Optional properties:
41
42 - model : The specific model of the system controller chip. Such
43 as, "mv64360", "mv64460", or "mv64560".
44 - compatible : A string identifying the compatibility identifiers
45 of the system controller chip.
46
47 The system-controller node contains child nodes for each system
48 controller device that the platform uses. Nodes should not be created
49 for devices which exist on the system controller chip but are not used
50
51 Example Marvell Discovery mv64360 system-controller node:
52
53 system-controller@f1000000 { /* Marvell Discovery mv64360 */
54 #address-cells = <1>;
55 #size-cells = <1>;
56 model = "mv64360"; /* Default */
57 compatible = "marvell,mv64360";
58 clock-frequency = <133333333>;
59 reg = <0xf1000000 0x10000>;
60 virtual-reg = <0xf1000000>;
61 ranges = <0x88000000 0x88000000 0x1000000 /* PCI 0 I/O Space */
62 0x80000000 0x80000000 0x8000000 /* PCI 0 MEM Space */
63 0xa0000000 0xa0000000 0x4000000 /* User FLASH */
64 0x00000000 0xf1000000 0x0010000 /* Bridge's regs */
65 0xf2000000 0xf2000000 0x0040000>;/* Integrated SRAM */
66
67 [ child node definitions... ]
68 }
69
702) Child nodes of /system-controller
71
72 a) Marvell Discovery MDIO bus
73
74 The MDIO is a bus to which the PHY devices are connected. For each
75 device that exists on this bus, a child node should be created. See
76 the definition of the PHY node below for an example of how to define
77 a PHY.
78
79 Required properties:
80 - #address-cells : Should be <1>
81 - #size-cells : Should be <0>
82 - device_type : Should be "mdio"
83 - compatible : Should be "marvell,mv64360-mdio"
84
85 Example:
86
87 mdio {
88 #address-cells = <1>;
89 #size-cells = <0>;
90 device_type = "mdio";
91 compatible = "marvell,mv64360-mdio";
92
93 ethernet-phy@0 {
94 ......
95 };
96 };
97
98
99 b) Marvell Discovery ethernet controller
100
101 The Discover ethernet controller is described with two levels
102 of nodes. The first level describes an ethernet silicon block
103 and the second level describes up to 3 ethernet nodes within
104 that block. The reason for the multiple levels is that the
105 registers for the node are interleaved within a single set
106 of registers. The "ethernet-block" level describes the
107 shared register set, and the "ethernet" nodes describe ethernet
108 port-specific properties.
109
110 Ethernet block node
111
112 Required properties:
113 - #address-cells : <1>
114 - #size-cells : <0>
115 - compatible : "marvell,mv64360-eth-block"
116 - reg : Offset and length of the register set for this block
117
118 Example Discovery Ethernet block node:
119 ethernet-block@2000 {
120 #address-cells = <1>;
121 #size-cells = <0>;
122 compatible = "marvell,mv64360-eth-block";
123 reg = <0x2000 0x2000>;
124 ethernet@0 {
125 .......
126 };
127 };
128
129 Ethernet port node
130
131 Required properties:
132 - device_type : Should be "network".
133 - compatible : Should be "marvell,mv64360-eth".
134 - reg : Should be <0>, <1>, or <2>, according to which registers
135 within the silicon block the device uses.
136 - interrupts : <a> where a is the interrupt number for the port.
137 - interrupt-parent : the phandle for the interrupt controller
138 that services interrupts for this device.
139 - phy : the phandle for the PHY connected to this ethernet
140 controller.
141 - local-mac-address : 6 bytes, MAC address
142
143 Example Discovery Ethernet port node:
144 ethernet@0 {
145 device_type = "network";
146 compatible = "marvell,mv64360-eth";
147 reg = <0>;
148 interrupts = <32>;
149 interrupt-parent = <&PIC>;
150 phy = <&PHY0>;
151 local-mac-address = [ 00 00 00 00 00 00 ];
152 };
153
154
155
156 c) Marvell Discovery PHY nodes
157
158 Required properties:
159 - device_type : Should be "ethernet-phy"
160 - interrupts : <a> where a is the interrupt number for this phy.
161 - interrupt-parent : the phandle for the interrupt controller that
162 services interrupts for this device.
163 - reg : The ID number for the phy, usually a small integer
164
165 Example Discovery PHY node:
166 ethernet-phy@1 {
167 device_type = "ethernet-phy";
168 compatible = "broadcom,bcm5421";
169 interrupts = <76>; /* GPP 12 */
170 interrupt-parent = <&PIC>;
171 reg = <1>;
172 };
173
174
175 d) Marvell Discovery SDMA nodes
176
177 Represent DMA hardware associated with the MPSC (multiprotocol
178 serial controllers).
179
180 Required properties:
181 - compatible : "marvell,mv64360-sdma"
182 - reg : Offset and length of the register set for this device
183 - interrupts : <a> where a is the interrupt number for the DMA
184 device.
185 - interrupt-parent : the phandle for the interrupt controller
186 that services interrupts for this device.
187
188 Example Discovery SDMA node:
189 sdma@4000 {
190 compatible = "marvell,mv64360-sdma";
191 reg = <0x4000 0xc18>;
192 virtual-reg = <0xf1004000>;
193 interrupts = <36>;
194 interrupt-parent = <&PIC>;
195 };
196
197
198 e) Marvell Discovery BRG nodes
199
200 Represent baud rate generator hardware associated with the MPSC
201 (multiprotocol serial controllers).
202
203 Required properties:
204 - compatible : "marvell,mv64360-brg"
205 - reg : Offset and length of the register set for this device
206 - clock-src : A value from 0 to 15 which selects the clock
207 source for the baud rate generator. This value corresponds
208 to the CLKS value in the BRGx configuration register. See
209 the mv64x60 User's Manual.
210 - clock-frequence : The frequency (in Hz) of the baud rate
211 generator's input clock.
212 - current-speed : The current speed setting (presumably by
213 firmware) of the baud rate generator.
214
215 Example Discovery BRG node:
216 brg@b200 {
217 compatible = "marvell,mv64360-brg";
218 reg = <0xb200 0x8>;
219 clock-src = <8>;
220 clock-frequency = <133333333>;
221 current-speed = <9600>;
222 };
223
224
225 f) Marvell Discovery CUNIT nodes
226
227 Represent the Serial Communications Unit device hardware.
228
229 Required properties:
230 - reg : Offset and length of the register set for this device
231
232 Example Discovery CUNIT node:
233 cunit@f200 {
234 reg = <0xf200 0x200>;
235 };
236
237
238 g) Marvell Discovery MPSCROUTING nodes
239
240 Represent the Discovery's MPSC routing hardware
241
242 Required properties:
243 - reg : Offset and length of the register set for this device
244
245 Example Discovery CUNIT node:
246 mpscrouting@b500 {
247 reg = <0xb400 0xc>;
248 };
249
250
251 h) Marvell Discovery MPSCINTR nodes
252
253 Represent the Discovery's MPSC DMA interrupt hardware registers
254 (SDMA cause and mask registers).
255
256 Required properties:
257 - reg : Offset and length of the register set for this device
258
259 Example Discovery MPSCINTR node:
260 mpsintr@b800 {
261 reg = <0xb800 0x100>;
262 };
263
264
265 i) Marvell Discovery MPSC nodes
266
267 Represent the Discovery's MPSC (Multiprotocol Serial Controller)
268 serial port.
269
270 Required properties:
271 - device_type : "serial"
272 - compatible : "marvell,mv64360-mpsc"
273 - reg : Offset and length of the register set for this device
274 - sdma : the phandle for the SDMA node used by this port
275 - brg : the phandle for the BRG node used by this port
276 - cunit : the phandle for the CUNIT node used by this port
277 - mpscrouting : the phandle for the MPSCROUTING node used by this port
278 - mpscintr : the phandle for the MPSCINTR node used by this port
279 - cell-index : the hardware index of this cell in the MPSC core
280 - max_idle : value needed for MPSC CHR3 (Maximum Frame Length)
281 register
282 - interrupts : <a> where a is the interrupt number for the MPSC.
283 - interrupt-parent : the phandle for the interrupt controller
284 that services interrupts for this device.
285
286 Example Discovery MPSCINTR node:
287 mpsc@8000 {
288 device_type = "serial";
289 compatible = "marvell,mv64360-mpsc";
290 reg = <0x8000 0x38>;
291 virtual-reg = <0xf1008000>;
292 sdma = <&SDMA0>;
293 brg = <&BRG0>;
294 cunit = <&CUNIT>;
295 mpscrouting = <&MPSCROUTING>;
296 mpscintr = <&MPSCINTR>;
297 cell-index = <0>;
298 max_idle = <40>;
299 interrupts = <40>;
300 interrupt-parent = <&PIC>;
301 };
302
303
304 j) Marvell Discovery Watch Dog Timer nodes
305
306 Represent the Discovery's watchdog timer hardware
307
308 Required properties:
309 - compatible : "marvell,mv64360-wdt"
310 - reg : Offset and length of the register set for this device
311
312 Example Discovery Watch Dog Timer node:
313 wdt@b410 {
314 compatible = "marvell,mv64360-wdt";
315 reg = <0xb410 0x8>;
316 };
317
318
319 k) Marvell Discovery I2C nodes
320
321 Represent the Discovery's I2C hardware
322
323 Required properties:
324 - device_type : "i2c"
325 - compatible : "marvell,mv64360-i2c"
326 - reg : Offset and length of the register set for this device
327 - interrupts : <a> where a is the interrupt number for the I2C.
328 - interrupt-parent : the phandle for the interrupt controller
329 that services interrupts for this device.
330
331 Example Discovery I2C node:
332 compatible = "marvell,mv64360-i2c";
333 reg = <0xc000 0x20>;
334 virtual-reg = <0xf100c000>;
335 interrupts = <37>;
336 interrupt-parent = <&PIC>;
337 };
338
339
340 l) Marvell Discovery PIC (Programmable Interrupt Controller) nodes
341
342 Represent the Discovery's PIC hardware
343
344 Required properties:
345 - #interrupt-cells : <1>
346 - #address-cells : <0>
347 - compatible : "marvell,mv64360-pic"
348 - reg : Offset and length of the register set for this device
349 - interrupt-controller
350
351 Example Discovery PIC node:
352 pic {
353 #interrupt-cells = <1>;
354 #address-cells = <0>;
355 compatible = "marvell,mv64360-pic";
356 reg = <0x0 0x88>;
357 interrupt-controller;
358 };
359
360
361 m) Marvell Discovery MPP (Multipurpose Pins) multiplexing nodes
362
363 Represent the Discovery's MPP hardware
364
365 Required properties:
366 - compatible : "marvell,mv64360-mpp"
367 - reg : Offset and length of the register set for this device
368
369 Example Discovery MPP node:
370 mpp@f000 {
371 compatible = "marvell,mv64360-mpp";
372 reg = <0xf000 0x10>;
373 };
374
375
376 n) Marvell Discovery GPP (General Purpose Pins) nodes
377
378 Represent the Discovery's GPP hardware
379
380 Required properties:
381 - compatible : "marvell,mv64360-gpp"
382 - reg : Offset and length of the register set for this device
383
384 Example Discovery GPP node:
385 gpp@f000 {
386 compatible = "marvell,mv64360-gpp";
387 reg = <0xf100 0x20>;
388 };
389
390
391 o) Marvell Discovery PCI host bridge node
392
393 Represents the Discovery's PCI host bridge device. The properties
394 for this node conform to Rev 2.1 of the PCI Bus Binding to IEEE
395 1275-1994. A typical value for the compatible property is
396 "marvell,mv64360-pci".
397
398 Example Discovery PCI host bridge node
399 pci@80000000 {
400 #address-cells = <3>;
401 #size-cells = <2>;
402 #interrupt-cells = <1>;
403 device_type = "pci";
404 compatible = "marvell,mv64360-pci";
405 reg = <0xcf8 0x8>;
406 ranges = <0x01000000 0x0 0x0
407 0x88000000 0x0 0x01000000
408 0x02000000 0x0 0x80000000
409 0x80000000 0x0 0x08000000>;
410 bus-range = <0 255>;
411 clock-frequency = <66000000>;
412 interrupt-parent = <&PIC>;
413 interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
414 interrupt-map = <
415 /* IDSEL 0x0a */
416 0x5000 0 0 1 &PIC 80
417 0x5000 0 0 2 &PIC 81
418 0x5000 0 0 3 &PIC 91
419 0x5000 0 0 4 &PIC 93
420
421 /* IDSEL 0x0b */
422 0x5800 0 0 1 &PIC 91
423 0x5800 0 0 2 &PIC 93
424 0x5800 0 0 3 &PIC 80
425 0x5800 0 0 4 &PIC 81
426
427 /* IDSEL 0x0c */
428 0x6000 0 0 1 &PIC 91
429 0x6000 0 0 2 &PIC 93
430 0x6000 0 0 3 &PIC 80
431 0x6000 0 0 4 &PIC 81
432
433 /* IDSEL 0x0d */
434 0x6800 0 0 1 &PIC 93
435 0x6800 0 0 2 &PIC 80
436 0x6800 0 0 3 &PIC 81
437 0x6800 0 0 4 &PIC 91
438 >;
439 };
440
441
442 p) Marvell Discovery CPU Error nodes
443
444 Represent the Discovery's CPU error handler device.
445
446 Required properties:
447 - compatible : "marvell,mv64360-cpu-error"
448 - reg : Offset and length of the register set for this device
449 - interrupts : the interrupt number for this device
450 - interrupt-parent : the phandle for the interrupt controller
451 that services interrupts for this device.
452
453 Example Discovery CPU Error node:
454 cpu-error@0070 {
455 compatible = "marvell,mv64360-cpu-error";
456 reg = <0x70 0x10 0x128 0x28>;
457 interrupts = <3>;
458 interrupt-parent = <&PIC>;
459 };
460
461
462 q) Marvell Discovery SRAM Controller nodes
463
464 Represent the Discovery's SRAM controller device.
465
466 Required properties:
467 - compatible : "marvell,mv64360-sram-ctrl"
468 - reg : Offset and length of the register set for this device
469 - interrupts : the interrupt number for this device
470 - interrupt-parent : the phandle for the interrupt controller
471 that services interrupts for this device.
472
473 Example Discovery SRAM Controller node:
474 sram-ctrl@0380 {
475 compatible = "marvell,mv64360-sram-ctrl";
476 reg = <0x380 0x80>;
477 interrupts = <13>;
478 interrupt-parent = <&PIC>;
479 };
480
481
482 r) Marvell Discovery PCI Error Handler nodes
483
484 Represent the Discovery's PCI error handler device.
485
486 Required properties:
487 - compatible : "marvell,mv64360-pci-error"
488 - reg : Offset and length of the register set for this device
489 - interrupts : the interrupt number for this device
490 - interrupt-parent : the phandle for the interrupt controller
491 that services interrupts for this device.
492
493 Example Discovery PCI Error Handler node:
494 pci-error@1d40 {
495 compatible = "marvell,mv64360-pci-error";
496 reg = <0x1d40 0x40 0xc28 0x4>;
497 interrupts = <12>;
498 interrupt-parent = <&PIC>;
499 };
500
501
502 s) Marvell Discovery Memory Controller nodes
503
504 Represent the Discovery's memory controller device.
505
506 Required properties:
507 - compatible : "marvell,mv64360-mem-ctrl"
508 - reg : Offset and length of the register set for this device
509 - interrupts : the interrupt number for this device
510 - interrupt-parent : the phandle for the interrupt controller
511 that services interrupts for this device.
512
513 Example Discovery Memory Controller node:
514 mem-ctrl@1400 {
515 compatible = "marvell,mv64360-mem-ctrl";
516 reg = <0x1400 0x60>;
517 interrupts = <17>;
518 interrupt-parent = <&PIC>;
519 };
520
521
diff --git a/Documentation/powerpc/dts-bindings/phy.txt b/Documentation/powerpc/dts-bindings/phy.txt
new file mode 100644
index 000000000000..bb8c742eb8c5
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/phy.txt
@@ -0,0 +1,25 @@
1PHY nodes
2
3Required properties:
4
5 - device_type : Should be "ethernet-phy"
6 - interrupts : <a b> where a is the interrupt number and b is a
7 field that represents an encoding of the sense and level
8 information for the interrupt. This should be encoded based on
9 the information in section 2) depending on the type of interrupt
10 controller you have.
11 - interrupt-parent : the phandle for the interrupt controller that
12 services interrupts for this device.
13 - reg : The ID number for the phy, usually a small integer
14 - linux,phandle : phandle for this node; likely referenced by an
15 ethernet controller node.
16
17Example:
18
19ethernet-phy@0 {
20 linux,phandle = <2452000>
21 interrupt-parent = <40000>;
22 interrupts = <35 1>;
23 reg = <0>;
24 device_type = "ethernet-phy";
25};
diff --git a/Documentation/powerpc/dts-bindings/spi-bus.txt b/Documentation/powerpc/dts-bindings/spi-bus.txt
new file mode 100644
index 000000000000..e782add2e457
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/spi-bus.txt
@@ -0,0 +1,57 @@
1SPI (Serial Peripheral Interface) busses
2
3SPI busses can be described with a node for the SPI master device
4and a set of child nodes for each SPI slave on the bus. For this
5discussion, it is assumed that the system's SPI controller is in
6SPI master mode. This binding does not describe SPI controllers
7in slave mode.
8
9The SPI master node requires the following properties:
10- #address-cells - number of cells required to define a chip select
11 address on the SPI bus.
12- #size-cells - should be zero.
13- compatible - name of SPI bus controller following generic names
14 recommended practice.
15No other properties are required in the SPI bus node. It is assumed
16that a driver for an SPI bus device will understand that it is an SPI bus.
17However, the binding does not attempt to define the specific method for
18assigning chip select numbers. Since SPI chip select configuration is
19flexible and non-standardized, it is left out of this binding with the
20assumption that board specific platform code will be used to manage
21chip selects. Individual drivers can define additional properties to
22support describing the chip select layout.
23
24SPI slave nodes must be children of the SPI master node and can
25contain the following properties.
26- reg - (required) chip select address of device.
27- compatible - (required) name of SPI device following generic names
28 recommended practice
29- spi-max-frequency - (required) Maximum SPI clocking speed of device in Hz
30- spi-cpol - (optional) Empty property indicating device requires
31 inverse clock polarity (CPOL) mode
32- spi-cpha - (optional) Empty property indicating device requires
33 shifted clock phase (CPHA) mode
34- spi-cs-high - (optional) Empty property indicating device requires
35 chip select active high
36
37SPI example for an MPC5200 SPI bus:
38 spi@f00 {
39 #address-cells = <1>;
40 #size-cells = <0>;
41 compatible = "fsl,mpc5200b-spi","fsl,mpc5200-spi";
42 reg = <0xf00 0x20>;
43 interrupts = <2 13 0 2 14 0>;
44 interrupt-parent = <&mpc5200_pic>;
45
46 ethernet-switch@0 {
47 compatible = "micrel,ks8995m";
48 spi-max-frequency = <1000000>;
49 reg = <0>;
50 };
51
52 codec@1 {
53 compatible = "ti,tlv320aic26";
54 spi-max-frequency = <100000>;
55 reg = <1>;
56 };
57 };
diff --git a/Documentation/powerpc/dts-bindings/usb-ehci.txt b/Documentation/powerpc/dts-bindings/usb-ehci.txt
new file mode 100644
index 000000000000..fa18612f757b
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/usb-ehci.txt
@@ -0,0 +1,25 @@
1USB EHCI controllers
2
3Required properties:
4 - compatible : should be "usb-ehci".
5 - reg : should contain at least address and length of the standard EHCI
6 register set for the device. Optional platform-dependent registers
7 (debug-port or other) can be also specified here, but only after
8 definition of standard EHCI registers.
9 - interrupts : one EHCI interrupt should be described here.
10If device registers are implemented in big endian mode, the device
11node should have "big-endian-regs" property.
12If controller implementation operates with big endian descriptors,
13"big-endian-desc" property should be specified.
14If both big endian registers and descriptors are used by the controller
15implementation, "big-endian" property can be specified instead of having
16both "big-endian-regs" and "big-endian-desc".
17
18Example (Sequoia 440EPx):
19 ehci@e0000300 {
20 compatible = "ibm,usb-ehci-440epx", "usb-ehci";
21 interrupt-parent = <&UIC0>;
22 interrupts = <1a 4>;
23 reg = <0 e0000300 90 0 e0000390 70>;
24 big-endian;
25 };
diff --git a/Documentation/powerpc/dts-bindings/xilinx.txt b/Documentation/powerpc/dts-bindings/xilinx.txt
new file mode 100644
index 000000000000..80339fe4300b
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/xilinx.txt
@@ -0,0 +1,295 @@
1 d) Xilinx IP cores
2
3 The Xilinx EDK toolchain ships with a set of IP cores (devices) for use
4 in Xilinx Spartan and Virtex FPGAs. The devices cover the whole range
5 of standard device types (network, serial, etc.) and miscellaneous
6 devices (gpio, LCD, spi, etc). Also, since these devices are
7 implemented within the fpga fabric every instance of the device can be
8 synthesised with different options that change the behaviour.
9
10 Each IP-core has a set of parameters which the FPGA designer can use to
11 control how the core is synthesized. Historically, the EDK tool would
12 extract the device parameters relevant to device drivers and copy them
13 into an 'xparameters.h' in the form of #define symbols. This tells the
14 device drivers how the IP cores are configured, but it requres the kernel
15 to be recompiled every time the FPGA bitstream is resynthesized.
16
17 The new approach is to export the parameters into the device tree and
18 generate a new device tree each time the FPGA bitstream changes. The
19 parameters which used to be exported as #defines will now become
20 properties of the device node. In general, device nodes for IP-cores
21 will take the following form:
22
23 (name): (generic-name)@(base-address) {
24 compatible = "xlnx,(ip-core-name)-(HW_VER)"
25 [, (list of compatible devices), ...];
26 reg = <(baseaddr) (size)>;
27 interrupt-parent = <&interrupt-controller-phandle>;
28 interrupts = < ... >;
29 xlnx,(parameter1) = "(string-value)";
30 xlnx,(parameter2) = <(int-value)>;
31 };
32
33 (generic-name): an open firmware-style name that describes the
34 generic class of device. Preferably, this is one word, such
35 as 'serial' or 'ethernet'.
36 (ip-core-name): the name of the ip block (given after the BEGIN
37 directive in system.mhs). Should be in lowercase
38 and all underscores '_' converted to dashes '-'.
39 (name): is derived from the "PARAMETER INSTANCE" value.
40 (parameter#): C_* parameters from system.mhs. The C_ prefix is
41 dropped from the parameter name, the name is converted
42 to lowercase and all underscore '_' characters are
43 converted to dashes '-'.
44 (baseaddr): the baseaddr parameter value (often named C_BASEADDR).
45 (HW_VER): from the HW_VER parameter.
46 (size): the address range size (often C_HIGHADDR - C_BASEADDR + 1).
47
48 Typically, the compatible list will include the exact IP core version
49 followed by an older IP core version which implements the same
50 interface or any other device with the same interface.
51
52 'reg', 'interrupt-parent' and 'interrupts' are all optional properties.
53
54 For example, the following block from system.mhs:
55
56 BEGIN opb_uartlite
57 PARAMETER INSTANCE = opb_uartlite_0
58 PARAMETER HW_VER = 1.00.b
59 PARAMETER C_BAUDRATE = 115200
60 PARAMETER C_DATA_BITS = 8
61 PARAMETER C_ODD_PARITY = 0
62 PARAMETER C_USE_PARITY = 0
63 PARAMETER C_CLK_FREQ = 50000000
64 PARAMETER C_BASEADDR = 0xEC100000
65 PARAMETER C_HIGHADDR = 0xEC10FFFF
66 BUS_INTERFACE SOPB = opb_7
67 PORT OPB_Clk = CLK_50MHz
68 PORT Interrupt = opb_uartlite_0_Interrupt
69 PORT RX = opb_uartlite_0_RX
70 PORT TX = opb_uartlite_0_TX
71 PORT OPB_Rst = sys_bus_reset_0
72 END
73
74 becomes the following device tree node:
75
76 opb_uartlite_0: serial@ec100000 {
77 device_type = "serial";
78 compatible = "xlnx,opb-uartlite-1.00.b";
79 reg = <ec100000 10000>;
80 interrupt-parent = <&opb_intc_0>;
81 interrupts = <1 0>; // got this from the opb_intc parameters
82 current-speed = <d#115200>; // standard serial device prop
83 clock-frequency = <d#50000000>; // standard serial device prop
84 xlnx,data-bits = <8>;
85 xlnx,odd-parity = <0>;
86 xlnx,use-parity = <0>;
87 };
88
89 Some IP cores actually implement 2 or more logical devices. In
90 this case, the device should still describe the whole IP core with
91 a single node and add a child node for each logical device. The
92 ranges property can be used to translate from parent IP-core to the
93 registers of each device. In addition, the parent node should be
94 compatible with the bus type 'xlnx,compound', and should contain
95 #address-cells and #size-cells, as with any other bus. (Note: this
96 makes the assumption that both logical devices have the same bus
97 binding. If this is not true, then separate nodes should be used
98 for each logical device). The 'cell-index' property can be used to
99 enumerate logical devices within an IP core. For example, the
100 following is the system.mhs entry for the dual ps2 controller found
101 on the ml403 reference design.
102
103 BEGIN opb_ps2_dual_ref
104 PARAMETER INSTANCE = opb_ps2_dual_ref_0
105 PARAMETER HW_VER = 1.00.a
106 PARAMETER C_BASEADDR = 0xA9000000
107 PARAMETER C_HIGHADDR = 0xA9001FFF
108 BUS_INTERFACE SOPB = opb_v20_0
109 PORT Sys_Intr1 = ps2_1_intr
110 PORT Sys_Intr2 = ps2_2_intr
111 PORT Clkin1 = ps2_clk_rx_1
112 PORT Clkin2 = ps2_clk_rx_2
113 PORT Clkpd1 = ps2_clk_tx_1
114 PORT Clkpd2 = ps2_clk_tx_2
115 PORT Rx1 = ps2_d_rx_1
116 PORT Rx2 = ps2_d_rx_2
117 PORT Txpd1 = ps2_d_tx_1
118 PORT Txpd2 = ps2_d_tx_2
119 END
120
121 It would result in the following device tree nodes:
122
123 opb_ps2_dual_ref_0: opb-ps2-dual-ref@a9000000 {
124 #address-cells = <1>;
125 #size-cells = <1>;
126 compatible = "xlnx,compound";
127 ranges = <0 a9000000 2000>;
128 // If this device had extra parameters, then they would
129 // go here.
130 ps2@0 {
131 compatible = "xlnx,opb-ps2-dual-ref-1.00.a";
132 reg = <0 40>;
133 interrupt-parent = <&opb_intc_0>;
134 interrupts = <3 0>;
135 cell-index = <0>;
136 };
137 ps2@1000 {
138 compatible = "xlnx,opb-ps2-dual-ref-1.00.a";
139 reg = <1000 40>;
140 interrupt-parent = <&opb_intc_0>;
141 interrupts = <3 0>;
142 cell-index = <0>;
143 };
144 };
145
146 Also, the system.mhs file defines bus attachments from the processor
147 to the devices. The device tree structure should reflect the bus
148 attachments. Again an example; this system.mhs fragment:
149
150 BEGIN ppc405_virtex4
151 PARAMETER INSTANCE = ppc405_0
152 PARAMETER HW_VER = 1.01.a
153 BUS_INTERFACE DPLB = plb_v34_0
154 BUS_INTERFACE IPLB = plb_v34_0
155 END
156
157 BEGIN opb_intc
158 PARAMETER INSTANCE = opb_intc_0
159 PARAMETER HW_VER = 1.00.c
160 PARAMETER C_BASEADDR = 0xD1000FC0
161 PARAMETER C_HIGHADDR = 0xD1000FDF
162 BUS_INTERFACE SOPB = opb_v20_0
163 END
164
165 BEGIN opb_uart16550
166 PARAMETER INSTANCE = opb_uart16550_0
167 PARAMETER HW_VER = 1.00.d
168 PARAMETER C_BASEADDR = 0xa0000000
169 PARAMETER C_HIGHADDR = 0xa0001FFF
170 BUS_INTERFACE SOPB = opb_v20_0
171 END
172
173 BEGIN plb_v34
174 PARAMETER INSTANCE = plb_v34_0
175 PARAMETER HW_VER = 1.02.a
176 END
177
178 BEGIN plb_bram_if_cntlr
179 PARAMETER INSTANCE = plb_bram_if_cntlr_0
180 PARAMETER HW_VER = 1.00.b
181 PARAMETER C_BASEADDR = 0xFFFF0000
182 PARAMETER C_HIGHADDR = 0xFFFFFFFF
183 BUS_INTERFACE SPLB = plb_v34_0
184 END
185
186 BEGIN plb2opb_bridge
187 PARAMETER INSTANCE = plb2opb_bridge_0
188 PARAMETER HW_VER = 1.01.a
189 PARAMETER C_RNG0_BASEADDR = 0x20000000
190 PARAMETER C_RNG0_HIGHADDR = 0x3FFFFFFF
191 PARAMETER C_RNG1_BASEADDR = 0x60000000
192 PARAMETER C_RNG1_HIGHADDR = 0x7FFFFFFF
193 PARAMETER C_RNG2_BASEADDR = 0x80000000
194 PARAMETER C_RNG2_HIGHADDR = 0xBFFFFFFF
195 PARAMETER C_RNG3_BASEADDR = 0xC0000000
196 PARAMETER C_RNG3_HIGHADDR = 0xDFFFFFFF
197 BUS_INTERFACE SPLB = plb_v34_0
198 BUS_INTERFACE MOPB = opb_v20_0
199 END
200
201 Gives this device tree (some properties removed for clarity):
202
203 plb@0 {
204 #address-cells = <1>;
205 #size-cells = <1>;
206 compatible = "xlnx,plb-v34-1.02.a";
207 device_type = "ibm,plb";
208 ranges; // 1:1 translation
209
210 plb_bram_if_cntrl_0: bram@ffff0000 {
211 reg = <ffff0000 10000>;
212 }
213
214 opb@20000000 {
215 #address-cells = <1>;
216 #size-cells = <1>;
217 ranges = <20000000 20000000 20000000
218 60000000 60000000 20000000
219 80000000 80000000 40000000
220 c0000000 c0000000 20000000>;
221
222 opb_uart16550_0: serial@a0000000 {
223 reg = <a00000000 2000>;
224 };
225
226 opb_intc_0: interrupt-controller@d1000fc0 {
227 reg = <d1000fc0 20>;
228 };
229 };
230 };
231
232 That covers the general approach to binding xilinx IP cores into the
233 device tree. The following are bindings for specific devices:
234
235 i) Xilinx ML300 Framebuffer
236
237 Simple framebuffer device from the ML300 reference design (also on the
238 ML403 reference design as well as others).
239
240 Optional properties:
241 - resolution = <xres yres> : pixel resolution of framebuffer. Some
242 implementations use a different resolution.
243 Default is <d#640 d#480>
244 - virt-resolution = <xvirt yvirt> : Size of framebuffer in memory.
245 Default is <d#1024 d#480>.
246 - rotate-display (empty) : rotate display 180 degrees.
247
248 ii) Xilinx SystemACE
249
250 The Xilinx SystemACE device is used to program FPGAs from an FPGA
251 bitstream stored on a CF card. It can also be used as a generic CF
252 interface device.
253
254 Optional properties:
255 - 8-bit (empty) : Set this property for SystemACE in 8 bit mode
256
257 iii) Xilinx EMAC and Xilinx TEMAC
258
259 Xilinx Ethernet devices. In addition to general xilinx properties
260 listed above, nodes for these devices should include a phy-handle
261 property, and may include other common network device properties
262 like local-mac-address.
263
264 iv) Xilinx Uartlite
265
266 Xilinx uartlite devices are simple fixed speed serial ports.
267
268 Required properties:
269 - current-speed : Baud rate of uartlite
270
271 v) Xilinx hwicap
272
273 Xilinx hwicap devices provide access to the configuration logic
274 of the FPGA through the Internal Configuration Access Port
275 (ICAP). The ICAP enables partial reconfiguration of the FPGA,
276 readback of the configuration information, and some control over
277 'warm boots' of the FPGA fabric.
278
279 Required properties:
280 - xlnx,family : The family of the FPGA, necessary since the
281 capabilities of the underlying ICAP hardware
282 differ between different families. May be
283 'virtex2p', 'virtex4', or 'virtex5'.
284
285 vi) Xilinx Uart 16550
286
287 Xilinx UART 16550 devices are very similar to the NS16550 but with
288 different register spacing and an offset from the base address.
289
290 Required properties:
291 - clock-frequency : Frequency of the clock input
292 - reg-offset : A value of 3 is required
293 - reg-shift : A value of 2 is required
294
295
diff --git a/Documentation/pps/pps.txt b/Documentation/pps/pps.txt
new file mode 100644
index 000000000000..125f4ab48998
--- /dev/null
+++ b/Documentation/pps/pps.txt
@@ -0,0 +1,172 @@
1
2 PPS - Pulse Per Second
3 ----------------------
4
5(C) Copyright 2007 Rodolfo Giometti <giometti@enneenne.com>
6
7This program is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2 of the License, or
10(at your option) any later version.
11
12This program is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
17
18
19Overview
20--------
21
22LinuxPPS provides a programming interface (API) to define in the
23system several PPS sources.
24
25PPS means "pulse per second" and a PPS source is just a device which
26provides a high precision signal each second so that an application
27can use it to adjust system clock time.
28
29A PPS source can be connected to a serial port (usually to the Data
30Carrier Detect pin) or to a parallel port (ACK-pin) or to a special
31CPU's GPIOs (this is the common case in embedded systems) but in each
32case when a new pulse arrives the system must apply to it a timestamp
33and record it for userland.
34
35Common use is the combination of the NTPD as userland program, with a
36GPS receiver as PPS source, to obtain a wallclock-time with
37sub-millisecond synchronisation to UTC.
38
39
40RFC considerations
41------------------
42
43While implementing a PPS API as RFC 2783 defines and using an embedded
44CPU GPIO-Pin as physical link to the signal, I encountered a deeper
45problem:
46
47 At startup it needs a file descriptor as argument for the function
48 time_pps_create().
49
50This implies that the source has a /dev/... entry. This assumption is
51ok for the serial and parallel port, where you can do something
52useful besides(!) the gathering of timestamps as it is the central
53task for a PPS-API. But this assumption does not work for a single
54purpose GPIO line. In this case even basic file-related functionality
55(like read() and write()) makes no sense at all and should not be a
56precondition for the use of a PPS-API.
57
58The problem can be simply solved if you consider that a PPS source is
59not always connected with a GPS data source.
60
61So your programs should check if the GPS data source (the serial port
62for instance) is a PPS source too, and if not they should provide the
63possibility to open another device as PPS source.
64
65In LinuxPPS the PPS sources are simply char devices usually mapped
66into files /dev/pps0, /dev/pps1, etc..
67
68
69Coding example
70--------------
71
72To register a PPS source into the kernel you should define a struct
73pps_source_info_s as follows:
74
75 static struct pps_source_info pps_ktimer_info = {
76 .name = "ktimer",
77 .path = "",
78 .mode = PPS_CAPTUREASSERT | PPS_OFFSETASSERT | \
79 PPS_ECHOASSERT | \
80 PPS_CANWAIT | PPS_TSFMT_TSPEC,
81 .echo = pps_ktimer_echo,
82 .owner = THIS_MODULE,
83 };
84
85and then calling the function pps_register_source() in your
86intialization routine as follows:
87
88 source = pps_register_source(&pps_ktimer_info,
89 PPS_CAPTUREASSERT | PPS_OFFSETASSERT);
90
91The pps_register_source() prototype is:
92
93 int pps_register_source(struct pps_source_info_s *info, int default_params)
94
95where "info" is a pointer to a structure that describes a particular
96PPS source, "default_params" tells the system what the initial default
97parameters for the device should be (it is obvious that these parameters
98must be a subset of ones defined in the struct
99pps_source_info_s which describe the capabilities of the driver).
100
101Once you have registered a new PPS source into the system you can
102signal an assert event (for example in the interrupt handler routine)
103just using:
104
105 pps_event(source, &ts, PPS_CAPTUREASSERT, ptr)
106
107where "ts" is the event's timestamp.
108
109The same function may also run the defined echo function
110(pps_ktimer_echo(), passing to it the "ptr" pointer) if the user
111asked for that... etc..
112
113Please see the file drivers/pps/clients/ktimer.c for example code.
114
115
116SYSFS support
117-------------
118
119If the SYSFS filesystem is enabled in the kernel it provides a new class:
120
121 $ ls /sys/class/pps/
122 pps0/ pps1/ pps2/
123
124Every directory is the ID of a PPS sources defined in the system and
125inside you find several files:
126
127 $ ls /sys/class/pps/pps0/
128 assert clear echo mode name path subsystem@ uevent
129
130Inside each "assert" and "clear" file you can find the timestamp and a
131sequence number:
132
133 $ cat /sys/class/pps/pps0/assert
134 1170026870.983207967#8
135
136Where before the "#" is the timestamp in seconds; after it is the
137sequence number. Other files are:
138
139* echo: reports if the PPS source has an echo function or not;
140
141* mode: reports available PPS functioning modes;
142
143* name: reports the PPS source's name;
144
145* path: reports the PPS source's device path, that is the device the
146 PPS source is connected to (if it exists).
147
148
149Testing the PPS support
150-----------------------
151
152In order to test the PPS support even without specific hardware you can use
153the ktimer driver (see the client subsection in the PPS configuration menu)
154and the userland tools provided into Documentaion/pps/ directory.
155
156Once you have enabled the compilation of ktimer just modprobe it (if
157not statically compiled):
158
159 # modprobe ktimer
160
161and the run ppstest as follow:
162
163 $ ./ppstest /dev/pps0
164 trying PPS source "/dev/pps1"
165 found PPS source "/dev/pps1"
166 ok, found 1 source(s), now start fetching data...
167 source 0 - assert 1186592699.388832443, sequence: 364 - clear 0.000000000, sequence: 0
168 source 0 - assert 1186592700.388931295, sequence: 365 - clear 0.000000000, sequence: 0
169 source 0 - assert 1186592701.389032765, sequence: 366 - clear 0.000000000, sequence: 0
170
171Please, note that to compile userland programs you need the file timepps.h
172(see Documentation/pps/).
diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt
index 1b74b5f30af4..b4860509c319 100644
--- a/Documentation/rfkill.txt
+++ b/Documentation/rfkill.txt
@@ -3,9 +3,8 @@ rfkill - RF kill switch support
3 3
41. Introduction 41. Introduction
52. Implementation details 52. Implementation details
63. Kernel driver guidelines 63. Kernel API
74. Kernel API 74. Userspace support
85. Userspace support
9 8
10 9
111. Introduction 101. Introduction
@@ -19,82 +18,62 @@ disable all transmitters of a certain type (or all). This is intended for
19situations where transmitters need to be turned off, for example on 18situations where transmitters need to be turned off, for example on
20aircraft. 19aircraft.
21 20
21The rfkill subsystem has a concept of "hard" and "soft" block, which
22differ little in their meaning (block == transmitters off) but rather in
23whether they can be changed or not:
24 - hard block: read-only radio block that cannot be overriden by software
25 - soft block: writable radio block (need not be readable) that is set by
26 the system software.
22 27
23 28
242. Implementation details 292. Implementation details
25 30
26The rfkill subsystem is composed of various components: the rfkill class, the 31The rfkill subsystem is composed of three main components:
27rfkill-input module (an input layer handler), and some specific input layer 32 * the rfkill core,
28events. 33 * the deprecated rfkill-input module (an input layer handler, being
29 34 replaced by userspace policy code) and
30The rfkill class is provided for kernel drivers to register their radio 35 * the rfkill drivers.
31transmitter with the kernel, provide methods for turning it on and off and,
32optionally, letting the system know about hardware-disabled states that may
33be implemented on the device. This code is enabled with the CONFIG_RFKILL
34Kconfig option, which drivers can "select".
35
36The rfkill class code also notifies userspace of state changes, this is
37achieved via uevents. It also provides some sysfs files for userspace to
38check the status of radio transmitters. See the "Userspace support" section
39below.
40 36
37The rfkill core provides API for kernel drivers to register their radio
38transmitter with the kernel, methods for turning it on and off and, letting
39the system know about hardware-disabled states that may be implemented on
40the device.
41 41
42The rfkill-input code implements a basic response to rfkill buttons -- it 42The rfkill core code also notifies userspace of state changes, and provides
43implements turning on/off all devices of a certain class (or all). 43ways for userspace to query the current states. See the "Userspace support"
44section below.
44 45
45When the device is hard-blocked (either by a call to rfkill_set_hw_state() 46When the device is hard-blocked (either by a call to rfkill_set_hw_state()
46or from query_hw_block) set_block() will be invoked but drivers can well 47or from query_hw_block) set_block() will be invoked for additional software
47ignore the method call since they can use the return value of the function 48block, but drivers can ignore the method call since they can use the return
48rfkill_set_hw_state() to sync the software state instead of keeping track 49value of the function rfkill_set_hw_state() to sync the software state
49of calls to set_block(). 50instead of keeping track of calls to set_block(). In fact, drivers should
50 51use the return value of rfkill_set_hw_state() unless the hardware actually
51 52keeps track of soft and hard block separately.
52The entire functionality is spread over more than one subsystem:
53
54 * The kernel input layer generates KEY_WWAN, KEY_WLAN etc. and
55 SW_RFKILL_ALL -- when the user presses a button. Drivers for radio
56 transmitters generally do not register to the input layer, unless the
57 device really provides an input device (i.e. a button that has no
58 effect other than generating a button press event)
59
60 * The rfkill-input code hooks up to these events and switches the soft-block
61 of the various radio transmitters, depending on the button type.
62
63 * The rfkill drivers turn off/on their transmitters as requested.
64
65 * The rfkill class will generate userspace notifications (uevents) to tell
66 userspace what the current state is.
67 53
68 54
553. Kernel API
69 56
703. Kernel driver guidelines
71 57
72 58Drivers for radio transmitters normally implement an rfkill driver.
73Drivers for radio transmitters normally implement only the rfkill class.
74These drivers may not unblock the transmitter based on own decisions, they
75should act on information provided by the rfkill class only.
76 59
77Platform drivers might implement input devices if the rfkill button is just 60Platform drivers might implement input devices if the rfkill button is just
78that, a button. If that button influences the hardware then you need to 61that, a button. If that button influences the hardware then you need to
79implement an rfkill class instead. This also applies if the platform provides 62implement an rfkill driver instead. This also applies if the platform provides
80a way to turn on/off the transmitter(s). 63a way to turn on/off the transmitter(s).
81 64
82During suspend/hibernation, transmitters should only be left enabled when 65For some platforms, it is possible that the hardware state changes during
83wake-on wlan or similar functionality requires it and the device wasn't 66suspend/hibernation, in which case it will be necessary to update the rfkill
84blocked before suspend/hibernate. Note that it may be necessary to update 67core with the current state is at resume time.
85the rfkill subsystem's idea of what the current state is at resume time if
86the state may have changed over suspend.
87
88 68
69To create an rfkill driver, driver's Kconfig needs to have
89 70
904. Kernel API 71 depends on RFKILL || !RFKILL
91 72
92To build a driver with rfkill subsystem support, the driver should depend on 73to ensure the driver cannot be built-in when rfkill is modular. The !RFKILL
93(or select) the Kconfig symbol RFKILL. 74case allows the driver to be built when rfkill is not configured, which which
94 75case all rfkill API can still be used but will be provided by static inlines
95The hardware the driver talks to may be write-only (where the current state 76which compile to almost nothing.
96of the hardware is unknown), or read-write (where the hardware can be queried
97about its current state).
98 77
99Calling rfkill_set_hw_state() when a state change happens is required from 78Calling rfkill_set_hw_state() when a state change happens is required from
100rfkill drivers that control devices that can be hard-blocked unless they also 79rfkill drivers that control devices that can be hard-blocked unless they also
@@ -105,10 +84,35 @@ device). Don't do this unless you cannot get the event in any other way.
105 84
1065. Userspace support 855. Userspace support
107 86
108The following sysfs entries exist for every rfkill device: 87The recommended userspace interface to use is /dev/rfkill, which is a misc
88character device that allows userspace to obtain and set the state of rfkill
89devices and sets of devices. It also notifies userspace about device addition
90and removal. The API is a simple read/write API that is defined in
91linux/rfkill.h, with one ioctl that allows turning off the deprecated input
92handler in the kernel for the transition period.
93
94Except for the one ioctl, communication with the kernel is done via read()
95and write() of instances of 'struct rfkill_event'. In this structure, the
96soft and hard block are properly separated (unlike sysfs, see below) and
97userspace is able to get a consistent snapshot of all rfkill devices in the
98system. Also, it is possible to switch all rfkill drivers (or all drivers of
99a specified type) into a state which also updates the default state for
100hotplugged devices.
101
102After an application opens /dev/rfkill, it can read the current state of
103all devices, and afterwards can poll the descriptor for hotplug or state
104change events.
105
106Applications must ignore operations (the "op" field) they do not handle,
107this allows the API to be extended in the future.
108
109Additionally, each rfkill device is registered in sysfs and there has the
110following attributes:
109 111
110 name: Name assigned by driver to this key (interface or driver name). 112 name: Name assigned by driver to this key (interface or driver name).
111 type: Name of the key type ("wlan", "bluetooth", etc). 113 type: Driver type string ("wlan", "bluetooth", etc).
114 persistent: Whether the soft blocked state is initialised from
115 non-volatile storage at startup.
112 state: Current state of the transmitter 116 state: Current state of the transmitter
113 0: RFKILL_STATE_SOFT_BLOCKED 117 0: RFKILL_STATE_SOFT_BLOCKED
114 transmitter is turned off by software 118 transmitter is turned off by software
@@ -117,7 +121,12 @@ The following sysfs entries exist for every rfkill device:
117 2: RFKILL_STATE_HARD_BLOCKED 121 2: RFKILL_STATE_HARD_BLOCKED
118 transmitter is forced off by something outside of 122 transmitter is forced off by something outside of
119 the driver's control. 123 the driver's control.
120 claim: 0: Kernel handles events (currently always reads that value) 124 This file is deprecated because it can only properly show
125 three of the four possible states, soft-and-hard-blocked is
126 missing.
127 claim: 0: Kernel handles events
128 This file is deprecated because there no longer is a way to
129 claim just control over a single rfkill instance.
121 130
122rfkill devices also issue uevents (with an action of "change"), with the 131rfkill devices also issue uevents (with an action of "change"), with the
123following environment variables set: 132following environment variables set:
@@ -128,9 +137,3 @@ RFKILL_TYPE
128 137
129The contents of these variables corresponds to the "name", "state" and 138The contents of these variables corresponds to the "name", "state" and
130"type" sysfs files explained above. 139"type" sysfs files explained above.
131
132An alternative userspace interface exists as a misc device /dev/rfkill,
133which allows userspace to obtain and set the state of rfkill devices and
134sets of devices. It also notifies userspace about device addition and
135removal. The API is a simple read/write API that is defined in
136linux/rfkill.h.
diff --git a/Documentation/robust-futex-ABI.txt b/Documentation/robust-futex-ABI.txt
index 535f69fab45f..fd1cd8aae4eb 100644
--- a/Documentation/robust-futex-ABI.txt
+++ b/Documentation/robust-futex-ABI.txt
@@ -135,7 +135,7 @@ manipulating this list), the user code must observe the following
135protocol on 'lock entry' insertion and removal: 135protocol on 'lock entry' insertion and removal:
136 136
137On insertion: 137On insertion:
138 1) set the 'list_op_pending' word to the address of the 'lock word' 138 1) set the 'list_op_pending' word to the address of the 'lock entry'
139 to be inserted, 139 to be inserted,
140 2) acquire the futex lock, 140 2) acquire the futex lock,
141 3) add the lock entry, with its thread id (TID) in the bottom 29 bits 141 3) add the lock entry, with its thread id (TID) in the bottom 29 bits
@@ -143,7 +143,7 @@ On insertion:
143 4) clear the 'list_op_pending' word. 143 4) clear the 'list_op_pending' word.
144 144
145On removal: 145On removal:
146 1) set the 'list_op_pending' word to the address of the 'lock word' 146 1) set the 'list_op_pending' word to the address of the 'lock entry'
147 to be removed, 147 to be removed,
148 2) remove the lock entry for this lock from the 'head' list, 148 2) remove the lock entry for this lock from the 'head' list,
149 2) release the futex lock, and 149 2) release the futex lock, and
diff --git a/Documentation/s390/s390dbf.txt b/Documentation/s390/s390dbf.txt
index 2d10053dd97e..ae66f9b90a25 100644
--- a/Documentation/s390/s390dbf.txt
+++ b/Documentation/s390/s390dbf.txt
@@ -495,6 +495,13 @@ and for each vararg a long value. So e.g. for a debug entry with a format
495string plus two varargs one would need to allocate a (3 * sizeof(long)) 495string plus two varargs one would need to allocate a (3 * sizeof(long))
496byte data area in the debug_register() function. 496byte data area in the debug_register() function.
497 497
498IMPORTANT: Using "%s" in sprintf event functions is dangerous. You can only
499use "%s" in the sprintf event functions, if the memory for the passed string is
500available as long as the debug feature exists. The reason behind this is that
501due to performance considerations only a pointer to the string is stored in
502the debug feature. If you log a string that is freed afterwards, you will get
503an OOPS when inspecting the debug feature, because then the debug feature will
504access the already freed memory.
498 505
499NOTE: If using the sprintf view do NOT use other event/exception functions 506NOTE: If using the sprintf view do NOT use other event/exception functions
500than the sprintf-event and -exception functions. 507than the sprintf-event and -exception functions.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 1df7f9cdab05..86eabe6c3419 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -73,7 +73,7 @@ The remaining CPU time will be used for user input and other tasks. Because
73realtime tasks have explicitly allocated the CPU time they need to perform 73realtime tasks have explicitly allocated the CPU time they need to perform
74their tasks, buffer underruns in the graphics or audio can be eliminated. 74their tasks, buffer underruns in the graphics or audio can be eliminated.
75 75
76NOTE: the above example is not fully implemented as of yet (2.6.25). We still 76NOTE: the above example is not fully implemented yet. We still
77lack an EDF scheduler to make non-uniform periods usable. 77lack an EDF scheduler to make non-uniform periods usable.
78 78
79 79
@@ -140,14 +140,15 @@ The other option is:
140 140
141.o CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups") 141.o CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups")
142 142
143This uses the /cgroup virtual file system and "/cgroup/<cgroup>/cpu.rt_runtime_us" 143This uses the /cgroup virtual file system and
144to control the CPU time reserved for each control group instead. 144"/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each
145control group instead.
145 146
146For more information on working with control groups, you should read 147For more information on working with control groups, you should read
147Documentation/cgroups/cgroups.txt as well. 148Documentation/cgroups/cgroups.txt as well.
148 149
149Group settings are checked against the following limits in order to keep the configuration 150Group settings are checked against the following limits in order to keep the
150schedulable: 151configuration schedulable:
151 152
152 \Sum_{i} runtime_{i} / global_period <= global_runtime / global_period 153 \Sum_{i} runtime_{i} / global_period <= global_runtime / global_period
153 154
@@ -189,7 +190,7 @@ Implementing SCHED_EDF might take a while to complete. Priority Inheritance is
189the biggest challenge as the current linux PI infrastructure is geared towards 190the biggest challenge as the current linux PI infrastructure is geared towards
190the limited static priority levels 0-99. With deadline scheduling you need to 191the limited static priority levels 0-99. With deadline scheduling you need to
191do deadline inheritance (since priority is inversely proportional to the 192do deadline inheritance (since priority is inversely proportional to the
192deadline delta (deadline - now). 193deadline delta (deadline - now)).
193 194
194This means the whole PI machinery will have to be reworked - and that is one of 195This means the whole PI machinery will have to be reworked - and that is one of
195the most complex pieces of code we have. 196the most complex pieces of code we have.
diff --git a/Documentation/scsi/scsi_fc_transport.txt b/Documentation/scsi/scsi_fc_transport.txt
index e5b071d46619..d7f181701dc2 100644
--- a/Documentation/scsi/scsi_fc_transport.txt
+++ b/Documentation/scsi/scsi_fc_transport.txt
@@ -1,10 +1,11 @@
1 SCSI FC Tansport 1 SCSI FC Tansport
2 ============================================= 2 =============================================
3 3
4Date: 4/12/2007 4Date: 11/18/2008
5Kernel Revisions for features: 5Kernel Revisions for features:
6 rports : <<TBS>> 6 rports : <<TBS>>
7 vports : 2.6.22 (? TBD) 7 vports : 2.6.22
8 bsg support : 2.6.30 (?TBD?)
8 9
9 10
10Introduction 11Introduction
@@ -15,6 +16,7 @@ The FC transport can be found at:
15 drivers/scsi/scsi_transport_fc.c 16 drivers/scsi/scsi_transport_fc.c
16 include/scsi/scsi_transport_fc.h 17 include/scsi/scsi_transport_fc.h
17 include/scsi/scsi_netlink_fc.h 18 include/scsi/scsi_netlink_fc.h
19 include/scsi/scsi_bsg_fc.h
18 20
19This file is found at Documentation/scsi/scsi_fc_transport.txt 21This file is found at Documentation/scsi/scsi_fc_transport.txt
20 22
@@ -472,6 +474,14 @@ int
472fc_vport_terminate(struct fc_vport *vport) 474fc_vport_terminate(struct fc_vport *vport)
473 475
474 476
477FC BSG support (CT & ELS passthru, and more)
478========================================================================
479<< To Be Supplied >>
480
481
482
483
484
475Credits 485Credits
476======= 486=======
477The following people have contributed to this document: 487The following people have contributed to this document:
diff --git a/Documentation/scsi/scsi_mid_low_api.txt b/Documentation/scsi/scsi_mid_low_api.txt
index a6d5354639b2..de67229251d8 100644
--- a/Documentation/scsi/scsi_mid_low_api.txt
+++ b/Documentation/scsi/scsi_mid_low_api.txt
@@ -1271,6 +1271,11 @@ of interest:
1271 hostdata[0] - area reserved for LLD at end of struct Scsi_Host. Size 1271 hostdata[0] - area reserved for LLD at end of struct Scsi_Host. Size
1272 is set by the second argument (named 'xtr_bytes') to 1272 is set by the second argument (named 'xtr_bytes') to
1273 scsi_host_alloc() or scsi_register(). 1273 scsi_host_alloc() or scsi_register().
1274 vendor_id - a unique value that identifies the vendor supplying
1275 the LLD for the Scsi_Host. Used most often in validating
1276 vendor-specific message requests. Value consists of an
1277 identifier type and a vendor-specific value.
1278 See scsi_netlink.h for a description of valid formats.
1274 1279
1275The scsi_host structure is defined in include/scsi/scsi_host.h 1280The scsi_host structure is defined in include/scsi/scsi_host.h
1276 1281
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
index 4252697a95d6..1c8eb4518ce0 100644
--- a/Documentation/sound/alsa/ALSA-Configuration.txt
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -60,6 +60,12 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
60 slots - Reserve the slot index for the given driver. 60 slots - Reserve the slot index for the given driver.
61 This option takes multiple strings. 61 This option takes multiple strings.
62 See "Module Autoloading Support" section for details. 62 See "Module Autoloading Support" section for details.
63 debug - Specifies the debug message level
64 (0 = disable debug prints, 1 = normal debug messages,
65 2 = verbose debug messages)
66 This option appears only when CONFIG_SND_DEBUG=y.
67 This option can be dynamically changed via sysfs
68 /sys/modules/snd/parameters/debug file.
63 69
64 Module snd-pcm-oss 70 Module snd-pcm-oss
65 ------------------ 71 ------------------
@@ -513,6 +519,26 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
513 or input, but you may use this module for any application which 519 or input, but you may use this module for any application which
514 requires a sound card (like RealPlayer). 520 requires a sound card (like RealPlayer).
515 521
522 pcm_devs - Number of PCM devices assigned to each card
523 (default = 1, up to 4)
524 pcm_substreams - Number of PCM substreams assigned to each PCM
525 (default = 8, up to 16)
526 hrtimer - Use hrtimer (=1, default) or system timer (=0)
527 fake_buffer - Fake buffer allocations (default = 1)
528
529 When multiple PCM devices are created, snd-dummy gives different
530 behavior to each PCM device:
531 0 = interleaved with mmap support
532 1 = non-interleaved with mmap support
533 2 = interleaved without mmap
534 3 = non-interleaved without mmap
535
536 As default, snd-dummy drivers doesn't allocate the real buffers
537 but either ignores read/write or mmap a single dummy page to all
538 buffer pages, in order to save the resouces. If your apps need
539 the read/ written buffer data to be consistent, pass fake_buffer=0
540 option.
541
516 The power-management is supported. 542 The power-management is supported.
517 543
518 Module snd-echo3g 544 Module snd-echo3g
@@ -768,6 +794,10 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
768 bdl_pos_adj - Specifies the DMA IRQ timing delay in samples. 794 bdl_pos_adj - Specifies the DMA IRQ timing delay in samples.
769 Passing -1 will make the driver to choose the appropriate 795 Passing -1 will make the driver to choose the appropriate
770 value based on the controller chip. 796 value based on the controller chip.
797 patch - Specifies the early "patch" files to modify the HD-audio
798 setup before initializing the codecs. This option is
799 available only when CONFIG_SND_HDA_PATCH_LOADER=y is set.
800 See HD-Audio.txt for details.
771 801
772 [Single (global) options] 802 [Single (global) options]
773 single_cmd - Use single immediate commands to communicate with 803 single_cmd - Use single immediate commands to communicate with
diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt
index de8e10a94103..97eebd63bedc 100644
--- a/Documentation/sound/alsa/HD-Audio-Models.txt
+++ b/Documentation/sound/alsa/HD-Audio-Models.txt
@@ -114,8 +114,8 @@ ALC662/663/272
114 samsung-nc10 Samsung NC10 mini notebook 114 samsung-nc10 Samsung NC10 mini notebook
115 auto auto-config reading BIOS (default) 115 auto auto-config reading BIOS (default)
116 116
117ALC882/885 117ALC882/883/885/888/889
118========== 118======================
119 3stack-dig 3-jack with SPDIF I/O 119 3stack-dig 3-jack with SPDIF I/O
120 6stack-dig 6-jack digital with SPDIF I/O 120 6stack-dig 6-jack digital with SPDIF I/O
121 arima Arima W820Di1 121 arima Arima W820Di1
@@ -127,18 +127,16 @@ ALC882/885
127 mbp3 Macbook Pro rev3 127 mbp3 Macbook Pro rev3
128 imac24 iMac 24'' with jack detection 128 imac24 iMac 24'' with jack detection
129 w2jc ASUS W2JC 129 w2jc ASUS W2JC
130 auto auto-config reading BIOS (default) 130 3stack-2ch-dig 3-jack with SPDIF I/O (ALC883)
131 131 alc883-6stack-dig 6-jack digital with SPDIF I/O (ALC883)
132ALC883/888
133==========
134 3stack-dig 3-jack with SPDIF I/O
135 6stack-dig 6-jack digital with SPDIF I/O
136 3stack-6ch 3-jack 6-channel 132 3stack-6ch 3-jack 6-channel
137 3stack-6ch-dig 3-jack 6-channel with SPDIF I/O 133 3stack-6ch-dig 3-jack 6-channel with SPDIF I/O
138 6stack-dig-demo 6-jack digital for Intel demo board 134 6stack-dig-demo 6-jack digital for Intel demo board
139 acer Acer laptops (Travelmate 3012WTMi, Aspire 5600, etc) 135 acer Acer laptops (Travelmate 3012WTMi, Aspire 5600, etc)
140 acer-aspire Acer Aspire 9810 136 acer-aspire Acer Aspire 9810
141 acer-aspire-4930g Acer Aspire 4930G 137 acer-aspire-4930g Acer Aspire 4930G
138 acer-aspire-6530g Acer Aspire 6530G
139 acer-aspire-7730g Acer Aspire 7730G
142 acer-aspire-8930g Acer Aspire 8930G 140 acer-aspire-8930g Acer Aspire 8930G
143 medion Medion Laptops 141 medion Medion Laptops
144 medion-md2 Medion MD2 142 medion-md2 Medion MD2
@@ -154,10 +152,13 @@ ALC883/888
154 3stack-hp HP machines with 3stack (Lucknow, Samba boards) 152 3stack-hp HP machines with 3stack (Lucknow, Samba boards)
155 6stack-dell Dell machines with 6stack (Inspiron 530) 153 6stack-dell Dell machines with 6stack (Inspiron 530)
156 mitac Mitac 8252D 154 mitac Mitac 8252D
155 clevo-m540r Clevo M540R (6ch + digital)
157 clevo-m720 Clevo M720 laptop series 156 clevo-m720 Clevo M720 laptop series
158 fujitsu-pi2515 Fujitsu AMILO Pi2515 157 fujitsu-pi2515 Fujitsu AMILO Pi2515
159 fujitsu-xa3530 Fujitsu AMILO XA3530 158 fujitsu-xa3530 Fujitsu AMILO XA3530
160 3stack-6ch-intel Intel DG33* boards 159 3stack-6ch-intel Intel DG33* boards
160 intel-alc889a Intel IbexPeak with ALC889A
161 intel-x58 Intel DX58 with ALC889
161 asus-p5q ASUS P5Q-EM boards 162 asus-p5q ASUS P5Q-EM boards
162 mb31 MacBook 3,1 163 mb31 MacBook 3,1
163 sony-vaio-tt Sony VAIO TT 164 sony-vaio-tt Sony VAIO TT
@@ -228,7 +229,7 @@ AD1984
228====== 229======
229 basic default configuration 230 basic default configuration
230 thinkpad Lenovo Thinkpad T61/X61 231 thinkpad Lenovo Thinkpad T61/X61
231 dell Dell T3400 232 dell_desktop Dell T3400
232 233
233AD1986A 234AD1986A
234======= 235=======
@@ -239,6 +240,7 @@ AD1986A
239 laptop-automute 2-channel with EAPD and HP-automute (Lenovo N100) 240 laptop-automute 2-channel with EAPD and HP-automute (Lenovo N100)
240 ultra 2-channel with EAPD (Samsung Ultra tablet PC) 241 ultra 2-channel with EAPD (Samsung Ultra tablet PC)
241 samsung 2-channel with EAPD (Samsung R65) 242 samsung 2-channel with EAPD (Samsung R65)
243 samsung-p50 2-channel with HP-automute (Samsung P50)
242 244
243AD1988/AD1988B/AD1989A/AD1989B 245AD1988/AD1988B/AD1989A/AD1989B
244============================== 246==============================
@@ -256,6 +258,7 @@ Conexant 5045
256 laptop-micsense Laptop with Mic sense (old model fujitsu) 258 laptop-micsense Laptop with Mic sense (old model fujitsu)
257 laptop-hpmicsense Laptop with HP and Mic senses 259 laptop-hpmicsense Laptop with HP and Mic senses
258 benq Benq R55E 260 benq Benq R55E
261 laptop-hp530 HP 530 laptop
259 test for testing/debugging purpose, almost all controls 262 test for testing/debugging purpose, almost all controls
260 can be adjusted. Appearing only when compiled with 263 can be adjusted. Appearing only when compiled with
261 $CONFIG_SND_DEBUG=y 264 $CONFIG_SND_DEBUG=y
@@ -276,9 +279,16 @@ Conexant 5051
276 hp-dv6736 HP dv6736 279 hp-dv6736 HP dv6736
277 lenovo-x200 Lenovo X200 laptop 280 lenovo-x200 Lenovo X200 laptop
278 281
282Conexant 5066
283=============
284 laptop Basic Laptop config (default)
285 dell-laptop Dell laptops
286 olpc-xo-1_5 OLPC XO 1.5
287
279STAC9200 288STAC9200
280======== 289========
281 ref Reference board 290 ref Reference board
291 oqo OQO Model 2
282 dell-d21 Dell (unknown) 292 dell-d21 Dell (unknown)
283 dell-d22 Dell (unknown) 293 dell-d22 Dell (unknown)
284 dell-d23 Dell (unknown) 294 dell-d23 Dell (unknown)
@@ -366,10 +376,12 @@ STAC92HD73*
366=========== 376===========
367 ref Reference board 377 ref Reference board
368 no-jd BIOS setup but without jack-detection 378 no-jd BIOS setup but without jack-detection
379 intel Intel DG45* mobos
369 dell-m6-amic Dell desktops/laptops with analog mics 380 dell-m6-amic Dell desktops/laptops with analog mics
370 dell-m6-dmic Dell desktops/laptops with digital mics 381 dell-m6-dmic Dell desktops/laptops with digital mics
371 dell-m6 Dell desktops/laptops with both type of mics 382 dell-m6 Dell desktops/laptops with both type of mics
372 dell-eq Dell desktops/laptops 383 dell-eq Dell desktops/laptops
384 alienware Alienware M17x
373 auto BIOS setup (default) 385 auto BIOS setup (default)
374 386
375STAC92HD83* 387STAC92HD83*
@@ -383,3 +395,8 @@ STAC9872
383======== 395========
384 vaio VAIO laptop without SPDIF 396 vaio VAIO laptop without SPDIF
385 auto BIOS setup (default) 397 auto BIOS setup (default)
398
399Cirrus Logic CS4206/4207
400========================
401 mbp55 MacBook Pro 5,5
402 auto BIOS setup (default)
diff --git a/Documentation/sound/alsa/HD-Audio.txt b/Documentation/sound/alsa/HD-Audio.txt
index 71ac995b1915..7b8a5f947d1d 100644
--- a/Documentation/sound/alsa/HD-Audio.txt
+++ b/Documentation/sound/alsa/HD-Audio.txt
@@ -139,6 +139,10 @@ The driver checks PCI SSID and looks through the static configuration
139table until any matching entry is found. If you have a new machine, 139table until any matching entry is found. If you have a new machine,
140you may see a message like below: 140you may see a message like below:
141------------------------------------------------------------------------ 141------------------------------------------------------------------------
142 hda_codec: ALC880: BIOS auto-probing.
143------------------------------------------------------------------------
144Meanwhile, in the earlier versions, you would see a message like:
145------------------------------------------------------------------------
142 hda_codec: Unknown model for ALC880, trying auto-probe from BIOS... 146 hda_codec: Unknown model for ALC880, trying auto-probe from BIOS...
143------------------------------------------------------------------------ 147------------------------------------------------------------------------
144Even if you see such a message, DON'T PANIC. Take a deep breath and 148Even if you see such a message, DON'T PANIC. Take a deep breath and
@@ -403,6 +407,66 @@ re-configure based on that state, run like below:
403------------------------------------------------------------------------ 407------------------------------------------------------------------------
404 408
405 409
410Early Patching
411~~~~~~~~~~~~~~
412When CONFIG_SND_HDA_PATCH_LOADER=y is set, you can pass a "patch" as a
413firmware file for modifying the HD-audio setup before initializing the
414codec. This can work basically like the reconfiguration via sysfs in
415the above, but it does it before the first codec configuration.
416
417A patch file is a plain text file which looks like below:
418
419------------------------------------------------------------------------
420 [codec]
421 0x12345678 0xabcd1234 2
422
423 [model]
424 auto
425
426 [pincfg]
427 0x12 0x411111f0
428
429 [verb]
430 0x20 0x500 0x03
431 0x20 0x400 0xff
432
433 [hint]
434 hp_detect = yes
435------------------------------------------------------------------------
436
437The file needs to have a line `[codec]`. The next line should contain
438three numbers indicating the codec vendor-id (0x12345678 in the
439example), the codec subsystem-id (0xabcd1234) and the address (2) of
440the codec. The rest patch entries are applied to this specified codec
441until another codec entry is given.
442
443The `[model]` line allows to change the model name of the each codec.
444In the example above, it will be changed to model=auto.
445Note that this overrides the module option.
446
447After the `[pincfg]` line, the contents are parsed as the initial
448default pin-configurations just like `user_pin_configs` sysfs above.
449The values can be shown in user_pin_configs sysfs file, too.
450
451Similarly, the lines after `[verb]` are parsed as `init_verbs`
452sysfs entries, and the lines after `[hint]` are parsed as `hints`
453sysfs entries, respectively.
454
455The hd-audio driver reads the file via request_firmware(). Thus,
456a patch file has to be located on the appropriate firmware path,
457typically, /lib/firmware. For example, when you pass the option
458`patch=hda-init.fw`, the file /lib/firmware/hda-init-fw must be
459present.
460
461The patch module option is specific to each card instance, and you
462need to give one file name for each instance, separated by commas.
463For example, if you have two cards, one for an on-board analog and one
464for an HDMI video board, you may pass patch option like below:
465------------------------------------------------------------------------
466 options snd-hda-intel patch=on-board-patch,hdmi-patch
467------------------------------------------------------------------------
468
469
406Power-Saving 470Power-Saving
407~~~~~~~~~~~~ 471~~~~~~~~~~~~
408The power-saving is a kind of auto-suspend of the device. When the 472The power-saving is a kind of auto-suspend of the device. When the
diff --git a/Documentation/sound/alsa/Procfile.txt b/Documentation/sound/alsa/Procfile.txt
index 381908d8ca42..719a819f8cc2 100644
--- a/Documentation/sound/alsa/Procfile.txt
+++ b/Documentation/sound/alsa/Procfile.txt
@@ -101,6 +101,8 @@ card*/pcm*/xrun_debug
101 bit 0 = Enable XRUN/jiffies debug messages 101 bit 0 = Enable XRUN/jiffies debug messages
102 bit 1 = Show stack trace at XRUN / jiffies check 102 bit 1 = Show stack trace at XRUN / jiffies check
103 bit 2 = Enable additional jiffies check 103 bit 2 = Enable additional jiffies check
104 bit 3 = Log hwptr update at each period interrupt
105 bit 4 = Log hwptr update at each snd_pcm_update_hw_ptr()
104 106
105 When the bit 0 is set, the driver will show the messages to 107 When the bit 0 is set, the driver will show the messages to
106 kernel log when an xrun is detected. The debug message is 108 kernel log when an xrun is detected. The debug message is
@@ -117,6 +119,9 @@ card*/pcm*/xrun_debug
117 buggy) hardware that doesn't give smooth pointer updates. 119 buggy) hardware that doesn't give smooth pointer updates.
118 This feature is enabled via the bit 2. 120 This feature is enabled via the bit 2.
119 121
122 Bits 3 and 4 are for logging the hwptr records. Note that
123 these will give flood of kernel messages.
124
120card*/pcm*/sub*/info 125card*/pcm*/sub*/info
121 The general information of this PCM sub-stream. 126 The general information of this PCM sub-stream.
122 127
diff --git a/Documentation/spi/spidev_test.c b/Documentation/spi/spidev_test.c
index cf0e3ce0d526..c1a5aad3c75a 100644
--- a/Documentation/spi/spidev_test.c
+++ b/Documentation/spi/spidev_test.c
@@ -99,11 +99,13 @@ void parse_opts(int argc, char *argv[])
99 { "lsb", 0, 0, 'L' }, 99 { "lsb", 0, 0, 'L' },
100 { "cs-high", 0, 0, 'C' }, 100 { "cs-high", 0, 0, 'C' },
101 { "3wire", 0, 0, '3' }, 101 { "3wire", 0, 0, '3' },
102 { "no-cs", 0, 0, 'N' },
103 { "ready", 0, 0, 'R' },
102 { NULL, 0, 0, 0 }, 104 { NULL, 0, 0, 0 },
103 }; 105 };
104 int c; 106 int c;
105 107
106 c = getopt_long(argc, argv, "D:s:d:b:lHOLC3", lopts, NULL); 108 c = getopt_long(argc, argv, "D:s:d:b:lHOLC3NR", lopts, NULL);
107 109
108 if (c == -1) 110 if (c == -1)
109 break; 111 break;
@@ -139,6 +141,12 @@ void parse_opts(int argc, char *argv[])
139 case '3': 141 case '3':
140 mode |= SPI_3WIRE; 142 mode |= SPI_3WIRE;
141 break; 143 break;
144 case 'N':
145 mode |= SPI_NO_CS;
146 break;
147 case 'R':
148 mode |= SPI_READY;
149 break;
142 default: 150 default:
143 print_usage(argv[0]); 151 print_usage(argv[0]);
144 break; 152 break;
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 322a00bb99d9..2dbff53369d0 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -19,6 +19,7 @@ Currently, these files might (depending on your configuration)
19show up in /proc/sys/kernel: 19show up in /proc/sys/kernel:
20- acpi_video_flags 20- acpi_video_flags
21- acct 21- acct
22- callhome [ S390 only ]
22- auto_msgmni 23- auto_msgmni
23- core_pattern 24- core_pattern
24- core_uses_pid 25- core_uses_pid
@@ -91,6 +92,21 @@ valid for 30 seconds.
91 92
92============================================================== 93==============================================================
93 94
95callhome:
96
97Controls the kernel's callhome behavior in case of a kernel panic.
98
99The s390 hardware allows an operating system to send a notification
100to a service organization (callhome) in case of an operating system panic.
101
102When the value in this file is 0 (which is the default behavior)
103nothing happens in case of a kernel panic. If this value is set to "1"
104the complete kernel oops message is send to the IBM customer service
105organization in case the mainframe the Linux operating system is running
106on has a service contract with IBM.
107
108==============================================================
109
94core_pattern: 110core_pattern:
95 111
96core_pattern is used to specify a core dumpfile pattern name. 112core_pattern is used to specify a core dumpfile pattern name.
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 6fab2dcbb4d3..c4de6359d440 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -233,8 +233,8 @@ These protections are added to score to judge whether this zone should be used
233for page allocation or should be reclaimed. 233for page allocation or should be reclaimed.
234 234
235In this example, if normal pages (index=2) are required to this DMA zone and 235In this example, if normal pages (index=2) are required to this DMA zone and
236pages_high is used for watermark, the kernel judges this zone should not be 236watermark[WMARK_HIGH] is used for watermark, the kernel judges this zone should
237used because pages_free(1355) is smaller than watermark + protection[2] 237not be used because pages_free(1355) is smaller than watermark + protection[2]
238(4 + 2004 = 2008). If this protection value is 0, this zone would be used for 238(4 + 2004 = 2008). If this protection value is 0, this zone would be used for
239normal page requirement. If requirement is DMA zone(index=0), protection[0] 239normal page requirement. If requirement is DMA zone(index=0), protection[0]
240(=0) is used. 240(=0) is used.
@@ -280,9 +280,10 @@ The default value is 65536.
280min_free_kbytes: 280min_free_kbytes:
281 281
282This is used to force the Linux VM to keep a minimum number 282This is used to force the Linux VM to keep a minimum number
283of kilobytes free. The VM uses this number to compute a pages_min 283of kilobytes free. The VM uses this number to compute a
284value for each lowmem zone in the system. Each lowmem zone gets 284watermark[WMARK_MIN] value for each lowmem zone in the system.
285a number of reserved free pages based proportionally on its size. 285Each lowmem zone gets a number of reserved free pages based
286proportionally on its size.
286 287
287Some minimal amount of memory is needed to satisfy PF_MEMALLOC 288Some minimal amount of memory is needed to satisfy PF_MEMALLOC
288allocations; if you set this to lower than 1024KB, your system will 289allocations; if you set this to lower than 1024KB, your system will
@@ -314,10 +315,14 @@ min_unmapped_ratio:
314 315
315This is available only on NUMA kernels. 316This is available only on NUMA kernels.
316 317
317A percentage of the total pages in each zone. Zone reclaim will only 318This is a percentage of the total pages in each zone. Zone reclaim will
318occur if more than this percentage of pages are file backed and unmapped. 319only occur if more than this percentage of pages are in a state that
319This is to insure that a minimal amount of local pages is still available for 320zone_reclaim_mode allows to be reclaimed.
320file I/O even if the node is overallocated. 321
322If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared
323against all file-backed unmapped pages including swapcache pages and tmpfs
324files. Otherwise, only unmapped pages backed by normal files but not tmpfs
325files and similar are considered.
321 326
322The default is 1 percent. 327The default is 1 percent.
323 328
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index cf42b820ff9d..d56a01775423 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -66,7 +66,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
66'b' - Will immediately reboot the system without syncing or unmounting 66'b' - Will immediately reboot the system without syncing or unmounting
67 your disks. 67 your disks.
68 68
69'c' - Will perform a kexec reboot in order to take a crashdump. 69'c' - Will perform a system crash by a NULL pointer dereference.
70 A crashdump will be taken if configured.
70 71
71'd' - Shows all locks that are held. 72'd' - Shows all locks that are held.
72 73
@@ -141,8 +142,8 @@ useful when you want to exit a program that will not let you switch consoles.
141re'B'oot is good when you're unable to shut down. But you should also 'S'ync 142re'B'oot is good when you're unable to shut down. But you should also 'S'ync
142and 'U'mount first. 143and 'U'mount first.
143 144
144'C'rashdump can be used to manually trigger a crashdump when the system is hung. 145'C'rash can be used to manually trigger a crashdump when the system is hung.
145The kernel needs to have been built with CONFIG_KEXEC enabled. 146Note that this just triggers a crash if there is no dump mechanism available.
146 147
147'S'ync is great when your system is locked up, it allows you to sync your 148'S'ync is great when your system is locked up, it allows you to sync your
148disks and will certainly lessen the chance of data loss and fscking. Note 149disks and will certainly lessen the chance of data loss and fscking. Note
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index f157d7594ea7..78c45a87be57 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -1,7 +1,7 @@
1 Event Tracing 1 Event Tracing
2 2
3 Documentation written by Theodore Ts'o 3 Documentation written by Theodore Ts'o
4 Updated by Li Zefan 4 Updated by Li Zefan and Tom Zanussi
5 5
61. Introduction 61. Introduction
7=============== 7===============
@@ -22,12 +22,12 @@ tracing information should be printed.
22--------------------------------- 22---------------------------------
23 23
24The events which are available for tracing can be found in the file 24The events which are available for tracing can be found in the file
25/debug/tracing/available_events. 25/sys/kernel/debug/tracing/available_events.
26 26
27To enable a particular event, such as 'sched_wakeup', simply echo it 27To enable a particular event, such as 'sched_wakeup', simply echo it
28to /debug/tracing/set_event. For example: 28to /sys/kernel/debug/tracing/set_event. For example:
29 29
30 # echo sched_wakeup >> /debug/tracing/set_event 30 # echo sched_wakeup >> /sys/kernel/debug/tracing/set_event
31 31
32[ Note: '>>' is necessary, otherwise it will firstly disable 32[ Note: '>>' is necessary, otherwise it will firstly disable
33 all the events. ] 33 all the events. ]
@@ -35,15 +35,15 @@ to /debug/tracing/set_event. For example:
35To disable an event, echo the event name to the set_event file prefixed 35To disable an event, echo the event name to the set_event file prefixed
36with an exclamation point: 36with an exclamation point:
37 37
38 # echo '!sched_wakeup' >> /debug/tracing/set_event 38 # echo '!sched_wakeup' >> /sys/kernel/debug/tracing/set_event
39 39
40To disable all events, echo an empty line to the set_event file: 40To disable all events, echo an empty line to the set_event file:
41 41
42 # echo > /debug/tracing/set_event 42 # echo > /sys/kernel/debug/tracing/set_event
43 43
44To enable all events, echo '*:*' or '*:' to the set_event file: 44To enable all events, echo '*:*' or '*:' to the set_event file:
45 45
46 # echo *:* > /debug/tracing/set_event 46 # echo *:* > /sys/kernel/debug/tracing/set_event
47 47
48The events are organized into subsystems, such as ext4, irq, sched, 48The events are organized into subsystems, such as ext4, irq, sched,
49etc., and a full event name looks like this: <subsystem>:<event>. The 49etc., and a full event name looks like this: <subsystem>:<event>. The
@@ -52,29 +52,29 @@ file. All of the events in a subsystem can be specified via the syntax
52"<subsystem>:*"; for example, to enable all irq events, you can use the 52"<subsystem>:*"; for example, to enable all irq events, you can use the
53command: 53command:
54 54
55 # echo 'irq:*' > /debug/tracing/set_event 55 # echo 'irq:*' > /sys/kernel/debug/tracing/set_event
56 56
572.2 Via the 'enable' toggle 572.2 Via the 'enable' toggle
58--------------------------- 58---------------------------
59 59
60The events available are also listed in /debug/tracing/events/ hierarchy 60The events available are also listed in /sys/kernel/debug/tracing/events/ hierarchy
61of directories. 61of directories.
62 62
63To enable event 'sched_wakeup': 63To enable event 'sched_wakeup':
64 64
65 # echo 1 > /debug/tracing/events/sched/sched_wakeup/enable 65 # echo 1 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable
66 66
67To disable it: 67To disable it:
68 68
69 # echo 0 > /debug/tracing/events/sched/sched_wakeup/enable 69 # echo 0 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable
70 70
71To enable all events in sched subsystem: 71To enable all events in sched subsystem:
72 72
73 # echo 1 > /debug/tracing/events/sched/enable 73 # echo 1 > /sys/kernel/debug/tracing/events/sched/enable
74 74
75To eanble all events: 75To eanble all events:
76 76
77 # echo 1 > /debug/tracing/events/enable 77 # echo 1 > /sys/kernel/debug/tracing/events/enable
78 78
79When reading one of these enable files, there are four results: 79When reading one of these enable files, there are four results:
80 80
@@ -83,8 +83,199 @@ When reading one of these enable files, there are four results:
83 X - there is a mixture of events enabled and disabled 83 X - there is a mixture of events enabled and disabled
84 ? - this file does not affect any event 84 ? - this file does not affect any event
85 85
862.3 Boot option
87---------------
88
89In order to facilitate early boot debugging, use boot option:
90
91 trace_event=[event-list]
92
93The format of this boot option is the same as described in section 2.1.
94
863. Defining an event-enabled tracepoint 953. Defining an event-enabled tracepoint
87======================================= 96=======================================
88 97
89See The example provided in samples/trace_events 98See The example provided in samples/trace_events
90 99
1004. Event formats
101================
102
103Each trace event has a 'format' file associated with it that contains
104a description of each field in a logged event. This information can
105be used to parse the binary trace stream, and is also the place to
106find the field names that can be used in event filters (see section 5).
107
108It also displays the format string that will be used to print the
109event in text mode, along with the event name and ID used for
110profiling.
111
112Every event has a set of 'common' fields associated with it; these are
113the fields prefixed with 'common_'. The other fields vary between
114events and correspond to the fields defined in the TRACE_EVENT
115definition for that event.
116
117Each field in the format has the form:
118
119 field:field-type field-name; offset:N; size:N;
120
121where offset is the offset of the field in the trace record and size
122is the size of the data item, in bytes.
123
124For example, here's the information displayed for the 'sched_wakeup'
125event:
126
127# cat /debug/tracing/events/sched/sched_wakeup/format
128
129name: sched_wakeup
130ID: 60
131format:
132 field:unsigned short common_type; offset:0; size:2;
133 field:unsigned char common_flags; offset:2; size:1;
134 field:unsigned char common_preempt_count; offset:3; size:1;
135 field:int common_pid; offset:4; size:4;
136 field:int common_tgid; offset:8; size:4;
137
138 field:char comm[TASK_COMM_LEN]; offset:12; size:16;
139 field:pid_t pid; offset:28; size:4;
140 field:int prio; offset:32; size:4;
141 field:int success; offset:36; size:4;
142 field:int cpu; offset:40; size:4;
143
144print fmt: "task %s:%d [%d] success=%d [%03d]", REC->comm, REC->pid,
145 REC->prio, REC->success, REC->cpu
146
147This event contains 10 fields, the first 5 common and the remaining 5
148event-specific. All the fields for this event are numeric, except for
149'comm' which is a string, a distinction important for event filtering.
150
1515. Event filtering
152==================
153
154Trace events can be filtered in the kernel by associating boolean
155'filter expressions' with them. As soon as an event is logged into
156the trace buffer, its fields are checked against the filter expression
157associated with that event type. An event with field values that
158'match' the filter will appear in the trace output, and an event whose
159values don't match will be discarded. An event with no filter
160associated with it matches everything, and is the default when no
161filter has been set for an event.
162
1635.1 Expression syntax
164---------------------
165
166A filter expression consists of one or more 'predicates' that can be
167combined using the logical operators '&&' and '||'. A predicate is
168simply a clause that compares the value of a field contained within a
169logged event with a constant value and returns either 0 or 1 depending
170on whether the field value matched (1) or didn't match (0):
171
172 field-name relational-operator value
173
174Parentheses can be used to provide arbitrary logical groupings and
175double-quotes can be used to prevent the shell from interpreting
176operators as shell metacharacters.
177
178The field-names available for use in filters can be found in the
179'format' files for trace events (see section 4).
180
181The relational-operators depend on the type of the field being tested:
182
183The operators available for numeric fields are:
184
185==, !=, <, <=, >, >=
186
187And for string fields they are:
188
189==, !=
190
191Currently, only exact string matches are supported.
192
193Currently, the maximum number of predicates in a filter is 16.
194
1955.2 Setting filters
196-------------------
197
198A filter for an individual event is set by writing a filter expression
199to the 'filter' file for the given event.
200
201For example:
202
203# cd /debug/tracing/events/sched/sched_wakeup
204# echo "common_preempt_count > 4" > filter
205
206A slightly more involved example:
207
208# cd /debug/tracing/events/sched/sched_signal_send
209# echo "((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter
210
211If there is an error in the expression, you'll get an 'Invalid
212argument' error when setting it, and the erroneous string along with
213an error message can be seen by looking at the filter e.g.:
214
215# cd /debug/tracing/events/sched/sched_signal_send
216# echo "((sig >= 10 && sig < 15) || dsig == 17) && comm != bash" > filter
217-bash: echo: write error: Invalid argument
218# cat filter
219((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
220^
221parse_error: Field not found
222
223Currently the caret ('^') for an error always appears at the beginning of
224the filter string; the error message should still be useful though
225even without more accurate position info.
226
2275.3 Clearing filters
228--------------------
229
230To clear the filter for an event, write a '0' to the event's filter
231file.
232
233To clear the filters for all events in a subsystem, write a '0' to the
234subsystem's filter file.
235
2365.3 Subsystem filters
237---------------------
238
239For convenience, filters for every event in a subsystem can be set or
240cleared as a group by writing a filter expression into the filter file
241at the root of the subsytem. Note however, that if a filter for any
242event within the subsystem lacks a field specified in the subsystem
243filter, or if the filter can't be applied for any other reason, the
244filter for that event will retain its previous setting. This can
245result in an unintended mixture of filters which could lead to
246confusing (to the user who might think different filters are in
247effect) trace output. Only filters that reference just the common
248fields can be guaranteed to propagate successfully to all events.
249
250Here are a few subsystem filter examples that also illustrate the
251above points:
252
253Clear the filters on all events in the sched subsytem:
254
255# cd /sys/kernel/debug/tracing/events/sched
256# echo 0 > filter
257# cat sched_switch/filter
258none
259# cat sched_wakeup/filter
260none
261
262Set a filter using only common fields for all events in the sched
263subsytem (all events end up with the same filter):
264
265# cd /sys/kernel/debug/tracing/events/sched
266# echo common_pid == 0 > filter
267# cat sched_switch/filter
268common_pid == 0
269# cat sched_wakeup/filter
270common_pid == 0
271
272Attempt to set a filter using a non-common field for all events in the
273sched subsytem (all events but those that have a prev_pid field retain
274their old filters):
275
276# cd /sys/kernel/debug/tracing/events/sched
277# echo prev_pid == 0 > filter
278# cat sched_switch/filter
279prev_pid == 0
280# cat sched_wakeup/filter
281common_pid == 0
diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt
new file mode 100644
index 000000000000..7003e10f10f5
--- /dev/null
+++ b/Documentation/trace/ftrace-design.txt
@@ -0,0 +1,233 @@
1 function tracer guts
2 ====================
3
4Introduction
5------------
6
7Here we will cover the architecture pieces that the common function tracing
8code relies on for proper functioning. Things are broken down into increasing
9complexity so that you can start simple and at least get basic functionality.
10
11Note that this focuses on architecture implementation details only. If you
12want more explanation of a feature in terms of common code, review the common
13ftrace.txt file.
14
15
16Prerequisites
17-------------
18
19Ftrace relies on these features being implemented:
20 STACKTRACE_SUPPORT - implement save_stack_trace()
21 TRACE_IRQFLAGS_SUPPORT - implement include/asm/irqflags.h
22
23
24HAVE_FUNCTION_TRACER
25--------------------
26
27You will need to implement the mcount and the ftrace_stub functions.
28
29The exact mcount symbol name will depend on your toolchain. Some call it
30"mcount", "_mcount", or even "__mcount". You can probably figure it out by
31running something like:
32 $ echo 'main(){}' | gcc -x c -S -o - - -pg | grep mcount
33 call mcount
34We'll make the assumption below that the symbol is "mcount" just to keep things
35nice and simple in the examples.
36
37Keep in mind that the ABI that is in effect inside of the mcount function is
38*highly* architecture/toolchain specific. We cannot help you in this regard,
39sorry. Dig up some old documentation and/or find someone more familiar than
40you to bang ideas off of. Typically, register usage (argument/scratch/etc...)
41is a major issue at this point, especially in relation to the location of the
42mcount call (before/after function prologue). You might also want to look at
43how glibc has implemented the mcount function for your architecture. It might
44be (semi-)relevant.
45
46The mcount function should check the function pointer ftrace_trace_function
47to see if it is set to ftrace_stub. If it is, there is nothing for you to do,
48so return immediately. If it isn't, then call that function in the same way
49the mcount function normally calls __mcount_internal -- the first argument is
50the "frompc" while the second argument is the "selfpc" (adjusted to remove the
51size of the mcount call that is embedded in the function).
52
53For example, if the function foo() calls bar(), when the bar() function calls
54mcount(), the arguments mcount() will pass to the tracer are:
55 "frompc" - the address bar() will use to return to foo()
56 "selfpc" - the address bar() (with _mcount() size adjustment)
57
58Also keep in mind that this mcount function will be called *a lot*, so
59optimizing for the default case of no tracer will help the smooth running of
60your system when tracing is disabled. So the start of the mcount function is
61typically the bare min with checking things before returning. That also means
62the code flow should usually kept linear (i.e. no branching in the nop case).
63This is of course an optimization and not a hard requirement.
64
65Here is some pseudo code that should help (these functions should actually be
66implemented in assembly):
67
68void ftrace_stub(void)
69{
70 return;
71}
72
73void mcount(void)
74{
75 /* save any bare state needed in order to do initial checking */
76
77 extern void (*ftrace_trace_function)(unsigned long, unsigned long);
78 if (ftrace_trace_function != ftrace_stub)
79 goto do_trace;
80
81 /* restore any bare state */
82
83 return;
84
85do_trace:
86
87 /* save all state needed by the ABI (see paragraph above) */
88
89 unsigned long frompc = ...;
90 unsigned long selfpc = <return address> - MCOUNT_INSN_SIZE;
91 ftrace_trace_function(frompc, selfpc);
92
93 /* restore all state needed by the ABI */
94}
95
96Don't forget to export mcount for modules !
97extern void mcount(void);
98EXPORT_SYMBOL(mcount);
99
100
101HAVE_FUNCTION_TRACE_MCOUNT_TEST
102-------------------------------
103
104This is an optional optimization for the normal case when tracing is turned off
105in the system. If you do not enable this Kconfig option, the common ftrace
106code will take care of doing the checking for you.
107
108To support this feature, you only need to check the function_trace_stop
109variable in the mcount function. If it is non-zero, there is no tracing to be
110done at all, so you can return.
111
112This additional pseudo code would simply be:
113void mcount(void)
114{
115 /* save any bare state needed in order to do initial checking */
116
117+ if (function_trace_stop)
118+ return;
119
120 extern void (*ftrace_trace_function)(unsigned long, unsigned long);
121 if (ftrace_trace_function != ftrace_stub)
122...
123
124
125HAVE_FUNCTION_GRAPH_TRACER
126--------------------------
127
128Deep breath ... time to do some real work. Here you will need to update the
129mcount function to check ftrace graph function pointers, as well as implement
130some functions to save (hijack) and restore the return address.
131
132The mcount function should check the function pointers ftrace_graph_return
133(compare to ftrace_stub) and ftrace_graph_entry (compare to
134ftrace_graph_entry_stub). If either of those are not set to the relevant stub
135function, call the arch-specific function ftrace_graph_caller which in turn
136calls the arch-specific function prepare_ftrace_return. Neither of these
137function names are strictly required, but you should use them anyways to stay
138consistent across the architecture ports -- easier to compare & contrast
139things.
140
141The arguments to prepare_ftrace_return are slightly different than what are
142passed to ftrace_trace_function. The second argument "selfpc" is the same,
143but the first argument should be a pointer to the "frompc". Typically this is
144located on the stack. This allows the function to hijack the return address
145temporarily to have it point to the arch-specific function return_to_handler.
146That function will simply call the common ftrace_return_to_handler function and
147that will return the original return address with which, you can return to the
148original call site.
149
150Here is the updated mcount pseudo code:
151void mcount(void)
152{
153...
154 if (ftrace_trace_function != ftrace_stub)
155 goto do_trace;
156
157+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
158+ extern void (*ftrace_graph_return)(...);
159+ extern void (*ftrace_graph_entry)(...);
160+ if (ftrace_graph_return != ftrace_stub ||
161+ ftrace_graph_entry != ftrace_graph_entry_stub)
162+ ftrace_graph_caller();
163+#endif
164
165 /* restore any bare state */
166...
167
168Here is the pseudo code for the new ftrace_graph_caller assembly function:
169#ifdef CONFIG_FUNCTION_GRAPH_TRACER
170void ftrace_graph_caller(void)
171{
172 /* save all state needed by the ABI */
173
174 unsigned long *frompc = &...;
175 unsigned long selfpc = <return address> - MCOUNT_INSN_SIZE;
176 prepare_ftrace_return(frompc, selfpc);
177
178 /* restore all state needed by the ABI */
179}
180#endif
181
182For information on how to implement prepare_ftrace_return(), simply look at
183the x86 version. The only architecture-specific piece in it is the setup of
184the fault recovery table (the asm(...) code). The rest should be the same
185across architectures.
186
187Here is the pseudo code for the new return_to_handler assembly function. Note
188that the ABI that applies here is different from what applies to the mcount
189code. Since you are returning from a function (after the epilogue), you might
190be able to skimp on things saved/restored (usually just registers used to pass
191return values).
192
193#ifdef CONFIG_FUNCTION_GRAPH_TRACER
194void return_to_handler(void)
195{
196 /* save all state needed by the ABI (see paragraph above) */
197
198 void (*original_return_point)(void) = ftrace_return_to_handler();
199
200 /* restore all state needed by the ABI */
201
202 /* this is usually either a return or a jump */
203 original_return_point();
204}
205#endif
206
207
208HAVE_FTRACE_NMI_ENTER
209---------------------
210
211If you can't trace NMI functions, then skip this option.
212
213<details to be filled>
214
215
216HAVE_FTRACE_SYSCALLS
217---------------------
218
219<details to be filled>
220
221
222HAVE_FTRACE_MCOUNT_RECORD
223-------------------------
224
225See scripts/recordmcount.pl for more info.
226
227<details to be filled>
228
229
230HAVE_DYNAMIC_FTRACE
231---------------------
232
233<details to be filled>
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 7bd27f0e2880..1b6292bbdd6d 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -7,7 +7,6 @@ Copyright 2008 Red Hat Inc.
7 (dual licensed under the GPL v2) 7 (dual licensed under the GPL v2)
8Reviewers: Elias Oltmanns, Randy Dunlap, Andrew Morton, 8Reviewers: Elias Oltmanns, Randy Dunlap, Andrew Morton,
9 John Kacur, and David Teigland. 9 John Kacur, and David Teigland.
10
11Written for: 2.6.28-rc2 10Written for: 2.6.28-rc2
12 11
13Introduction 12Introduction
@@ -27,19 +26,38 @@ disabled, and more (ftrace allows for tracer plugins, which
27means that the list of tracers can always grow). 26means that the list of tracers can always grow).
28 27
29 28
29Implementation Details
30----------------------
31
32See ftrace-design.txt for details for arch porters and such.
33
34
30The File System 35The File System
31--------------- 36---------------
32 37
33Ftrace uses the debugfs file system to hold the control files as 38Ftrace uses the debugfs file system to hold the control files as
34well as the files to display output. 39well as the files to display output.
35 40
36To mount the debugfs system: 41When debugfs is configured into the kernel (which selecting any ftrace
42option will do) the directory /sys/kernel/debug will be created. To mount
43this directory, you can add to your /etc/fstab file:
44
45 debugfs /sys/kernel/debug debugfs defaults 0 0
37 46
38 # mkdir /debug 47Or you can mount it at run time with:
39 # mount -t debugfs nodev /debug
40 48
41( Note: it is more common to mount at /sys/kernel/debug, but for 49 mount -t debugfs nodev /sys/kernel/debug
42 simplicity this document will use /debug) 50
51For quicker access to that directory you may want to make a soft link to
52it:
53
54 ln -s /sys/kernel/debug /debug
55
56Any selected ftrace option will also create a directory called tracing
57within the debugfs. The rest of the document will assume that you are in
58the ftrace directory (cd /sys/kernel/debug/tracing) and will only concentrate
59on the files within that directory and not distract from the content with
60the extended "/sys/kernel/debug/tracing" path name.
43 61
44That's it! (assuming that you have ftrace configured into your kernel) 62That's it! (assuming that you have ftrace configured into your kernel)
45 63
@@ -73,26 +91,19 @@ of ftrace. Here is a list of some of the key files:
73 This file holds the output of the trace in a human 91 This file holds the output of the trace in a human
74 readable format (described below). 92 readable format (described below).
75 93
76 latency_trace:
77
78 This file shows the same trace but the information
79 is organized more to display possible latencies
80 in the system (described below).
81
82 trace_pipe: 94 trace_pipe:
83 95
84 The output is the same as the "trace" file but this 96 The output is the same as the "trace" file but this
85 file is meant to be streamed with live tracing. 97 file is meant to be streamed with live tracing.
86 Reads from this file will block until new data 98 Reads from this file will block until new data is
87 is retrieved. Unlike the "trace" and "latency_trace" 99 retrieved. Unlike the "trace" file, this file is a
88 files, this file is a consumer. This means reading 100 consumer. This means reading from this file causes
89 from this file causes sequential reads to display 101 sequential reads to display more current data. Once
90 more current data. Once data is read from this 102 data is read from this file, it is consumed, and
91 file, it is consumed, and will not be read 103 will not be read again with a sequential read. The
92 again with a sequential read. The "trace" and 104 "trace" file is static, and if the tracer is not
93 "latency_trace" files are static, and if the 105 adding more data,they will display the same
94 tracer is not adding more data, they will display 106 information every time they are read.
95 the same information every time they are read.
96 107
97 trace_options: 108 trace_options:
98 109
@@ -105,10 +116,10 @@ of ftrace. Here is a list of some of the key files:
105 Some of the tracers record the max latency. 116 Some of the tracers record the max latency.
106 For example, the time interrupts are disabled. 117 For example, the time interrupts are disabled.
107 This time is saved in this file. The max trace 118 This time is saved in this file. The max trace
108 will also be stored, and displayed by either 119 will also be stored, and displayed by "trace".
109 "trace" or "latency_trace". A new max trace will 120 A new max trace will only be recorded if the
110 only be recorded if the latency is greater than 121 latency is greater than the value in this
111 the value in this file. (in microseconds) 122 file. (in microseconds)
112 123
113 buffer_size_kb: 124 buffer_size_kb:
114 125
@@ -198,7 +209,7 @@ Here is the list of current tracers that may be configured.
198 the trace with the longest max latency. 209 the trace with the longest max latency.
199 See tracing_max_latency. When a new max is recorded, 210 See tracing_max_latency. When a new max is recorded,
200 it replaces the old trace. It is best to view this 211 it replaces the old trace. It is best to view this
201 trace via the latency_trace file. 212 trace with the latency-format option enabled.
202 213
203 "preemptoff" 214 "preemptoff"
204 215
@@ -295,8 +306,8 @@ the lowest priority thread (pid 0).
295Latency trace format 306Latency trace format
296-------------------- 307--------------------
297 308
298For traces that display latency times, the latency_trace file 309When the latency-format option is enabled, the trace file gives
299gives somewhat more information to see why a latency happened. 310somewhat more information to see why a latency happened.
300Here is a typical trace. 311Here is a typical trace.
301 312
302# tracer: irqsoff 313# tracer: irqsoff
@@ -368,9 +379,10 @@ explains which is which.
368 379
369The above is mostly meaningful for kernel developers. 380The above is mostly meaningful for kernel developers.
370 381
371 time: This differs from the trace file output. The trace file output 382 time: When the latency-format option is enabled, the trace file
372 includes an absolute timestamp. The timestamp used by the 383 output includes a timestamp relative to the start of the
373 latency_trace file is relative to the start of the trace. 384 trace. This differs from the output when latency-format
385 is disabled, which includes an absolute timestamp.
374 386
375 delay: This is just to help catch your eye a bit better. And 387 delay: This is just to help catch your eye a bit better. And
376 needs to be fixed to be only relative to the same CPU. 388 needs to be fixed to be only relative to the same CPU.
@@ -389,18 +401,18 @@ trace_options
389The trace_options file is used to control what gets printed in 401The trace_options file is used to control what gets printed in
390the trace output. To see what is available, simply cat the file: 402the trace output. To see what is available, simply cat the file:
391 403
392 cat /debug/tracing/trace_options 404 cat trace_options
393 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ 405 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
394 noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj 406 noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj
395 407
396To disable one of the options, echo in the option prepended with 408To disable one of the options, echo in the option prepended with
397"no". 409"no".
398 410
399 echo noprint-parent > /debug/tracing/trace_options 411 echo noprint-parent > trace_options
400 412
401To enable an option, leave off the "no". 413To enable an option, leave off the "no".
402 414
403 echo sym-offset > /debug/tracing/trace_options 415 echo sym-offset > trace_options
404 416
405Here are the available options: 417Here are the available options:
406 418
@@ -428,7 +440,8 @@ Here are the available options:
428 sym-addr: 440 sym-addr:
429 bash-4000 [01] 1477.606694: simple_strtoul <c0339346> 441 bash-4000 [01] 1477.606694: simple_strtoul <c0339346>
430 442
431 verbose - This deals with the latency_trace file. 443 verbose - This deals with the trace file when the
444 latency-format option is enabled.
432 445
433 bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \ 446 bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \
434 (+0.000ms): simple_strtoul (strict_strtoul) 447 (+0.000ms): simple_strtoul (strict_strtoul)
@@ -460,7 +473,7 @@ Here are the available options:
460 the app is no longer running 473 the app is no longer running
461 474
462 The lookup is performed when you read 475 The lookup is performed when you read
463 trace,trace_pipe,latency_trace. Example: 476 trace,trace_pipe. Example:
464 477
465 a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 478 a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0
466x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] 479x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
@@ -469,6 +482,11 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
469 every scheduling event. Will add overhead if 482 every scheduling event. Will add overhead if
470 there's a lot of tasks running at once. 483 there's a lot of tasks running at once.
471 484
485 latency-format - This option changes the trace. When
486 it is enabled, the trace displays
487 additional information about the
488 latencies, as described in "Latency
489 trace format".
472 490
473sched_switch 491sched_switch
474------------ 492------------
@@ -476,11 +494,11 @@ sched_switch
476This tracer simply records schedule switches. Here is an example 494This tracer simply records schedule switches. Here is an example
477of how to use it. 495of how to use it.
478 496
479 # echo sched_switch > /debug/tracing/current_tracer 497 # echo sched_switch > current_tracer
480 # echo 1 > /debug/tracing/tracing_enabled 498 # echo 1 > tracing_enabled
481 # sleep 1 499 # sleep 1
482 # echo 0 > /debug/tracing/tracing_enabled 500 # echo 0 > tracing_enabled
483 # cat /debug/tracing/trace 501 # cat trace
484 502
485# tracer: sched_switch 503# tracer: sched_switch
486# 504#
@@ -583,13 +601,14 @@ new trace is saved.
583To reset the maximum, echo 0 into tracing_max_latency. Here is 601To reset the maximum, echo 0 into tracing_max_latency. Here is
584an example: 602an example:
585 603
586 # echo irqsoff > /debug/tracing/current_tracer 604 # echo irqsoff > current_tracer
587 # echo 0 > /debug/tracing/tracing_max_latency 605 # echo latency-format > trace_options
588 # echo 1 > /debug/tracing/tracing_enabled 606 # echo 0 > tracing_max_latency
607 # echo 1 > tracing_enabled
589 # ls -ltr 608 # ls -ltr
590 [...] 609 [...]
591 # echo 0 > /debug/tracing/tracing_enabled 610 # echo 0 > tracing_enabled
592 # cat /debug/tracing/latency_trace 611 # cat trace
593# tracer: irqsoff 612# tracer: irqsoff
594# 613#
595irqsoff latency trace v1.1.5 on 2.6.26 614irqsoff latency trace v1.1.5 on 2.6.26
@@ -690,13 +709,14 @@ Like the irqsoff tracer, it records the maximum latency for
690which preemption was disabled. The control of preemptoff tracer 709which preemption was disabled. The control of preemptoff tracer
691is much like the irqsoff tracer. 710is much like the irqsoff tracer.
692 711
693 # echo preemptoff > /debug/tracing/current_tracer 712 # echo preemptoff > current_tracer
694 # echo 0 > /debug/tracing/tracing_max_latency 713 # echo latency-format > trace_options
695 # echo 1 > /debug/tracing/tracing_enabled 714 # echo 0 > tracing_max_latency
715 # echo 1 > tracing_enabled
696 # ls -ltr 716 # ls -ltr
697 [...] 717 [...]
698 # echo 0 > /debug/tracing/tracing_enabled 718 # echo 0 > tracing_enabled
699 # cat /debug/tracing/latency_trace 719 # cat trace
700# tracer: preemptoff 720# tracer: preemptoff
701# 721#
702preemptoff latency trace v1.1.5 on 2.6.26-rc8 722preemptoff latency trace v1.1.5 on 2.6.26-rc8
@@ -837,13 +857,14 @@ tracer.
837Again, using this trace is much like the irqsoff and preemptoff 857Again, using this trace is much like the irqsoff and preemptoff
838tracers. 858tracers.
839 859
840 # echo preemptirqsoff > /debug/tracing/current_tracer 860 # echo preemptirqsoff > current_tracer
841 # echo 0 > /debug/tracing/tracing_max_latency 861 # echo latency-format > trace_options
842 # echo 1 > /debug/tracing/tracing_enabled 862 # echo 0 > tracing_max_latency
863 # echo 1 > tracing_enabled
843 # ls -ltr 864 # ls -ltr
844 [...] 865 [...]
845 # echo 0 > /debug/tracing/tracing_enabled 866 # echo 0 > tracing_enabled
846 # cat /debug/tracing/latency_trace 867 # cat trace
847# tracer: preemptirqsoff 868# tracer: preemptirqsoff
848# 869#
849preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 870preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
@@ -999,12 +1020,13 @@ slightly differently than we did with the previous tracers.
999Instead of performing an 'ls', we will run 'sleep 1' under 1020Instead of performing an 'ls', we will run 'sleep 1' under
1000'chrt' which changes the priority of the task. 1021'chrt' which changes the priority of the task.
1001 1022
1002 # echo wakeup > /debug/tracing/current_tracer 1023 # echo wakeup > current_tracer
1003 # echo 0 > /debug/tracing/tracing_max_latency 1024 # echo latency-format > trace_options
1004 # echo 1 > /debug/tracing/tracing_enabled 1025 # echo 0 > tracing_max_latency
1026 # echo 1 > tracing_enabled
1005 # chrt -f 5 sleep 1 1027 # chrt -f 5 sleep 1
1006 # echo 0 > /debug/tracing/tracing_enabled 1028 # echo 0 > tracing_enabled
1007 # cat /debug/tracing/latency_trace 1029 # cat trace
1008# tracer: wakeup 1030# tracer: wakeup
1009# 1031#
1010wakeup latency trace v1.1.5 on 2.6.26-rc8 1032wakeup latency trace v1.1.5 on 2.6.26-rc8
@@ -1114,11 +1136,11 @@ can be done from the debug file system. Make sure the
1114ftrace_enabled is set; otherwise this tracer is a nop. 1136ftrace_enabled is set; otherwise this tracer is a nop.
1115 1137
1116 # sysctl kernel.ftrace_enabled=1 1138 # sysctl kernel.ftrace_enabled=1
1117 # echo function > /debug/tracing/current_tracer 1139 # echo function > current_tracer
1118 # echo 1 > /debug/tracing/tracing_enabled 1140 # echo 1 > tracing_enabled
1119 # usleep 1 1141 # usleep 1
1120 # echo 0 > /debug/tracing/tracing_enabled 1142 # echo 0 > tracing_enabled
1121 # cat /debug/tracing/trace 1143 # cat trace
1122# tracer: function 1144# tracer: function
1123# 1145#
1124# TASK-PID CPU# TIMESTAMP FUNCTION 1146# TASK-PID CPU# TIMESTAMP FUNCTION
@@ -1155,7 +1177,7 @@ int trace_fd;
1155[...] 1177[...]
1156int main(int argc, char *argv[]) { 1178int main(int argc, char *argv[]) {
1157 [...] 1179 [...]
1158 trace_fd = open("/debug/tracing/tracing_enabled", O_WRONLY); 1180 trace_fd = open(tracing_file("tracing_enabled"), O_WRONLY);
1159 [...] 1181 [...]
1160 if (condition_hit()) { 1182 if (condition_hit()) {
1161 write(trace_fd, "0", 1); 1183 write(trace_fd, "0", 1);
@@ -1163,26 +1185,20 @@ int main(int argc, char *argv[]) {
1163 [...] 1185 [...]
1164} 1186}
1165 1187
1166Note: Here we hard coded the path name. The debugfs mount is not
1167guaranteed to be at /debug (and is more commonly at
1168/sys/kernel/debug). For simple one time traces, the above is
1169sufficent. For anything else, a search through /proc/mounts may
1170be needed to find where the debugfs file-system is mounted.
1171
1172 1188
1173Single thread tracing 1189Single thread tracing
1174--------------------- 1190---------------------
1175 1191
1176By writing into /debug/tracing/set_ftrace_pid you can trace a 1192By writing into set_ftrace_pid you can trace a
1177single thread. For example: 1193single thread. For example:
1178 1194
1179# cat /debug/tracing/set_ftrace_pid 1195# cat set_ftrace_pid
1180no pid 1196no pid
1181# echo 3111 > /debug/tracing/set_ftrace_pid 1197# echo 3111 > set_ftrace_pid
1182# cat /debug/tracing/set_ftrace_pid 1198# cat set_ftrace_pid
11833111 11993111
1184# echo function > /debug/tracing/current_tracer 1200# echo function > current_tracer
1185# cat /debug/tracing/trace | head 1201# cat trace | head
1186 # tracer: function 1202 # tracer: function
1187 # 1203 #
1188 # TASK-PID CPU# TIMESTAMP FUNCTION 1204 # TASK-PID CPU# TIMESTAMP FUNCTION
@@ -1193,8 +1209,8 @@ no pid
1193 yum-updatesd-3111 [003] 1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel 1209 yum-updatesd-3111 [003] 1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel
1194 yum-updatesd-3111 [003] 1637.254685: fget_light <-do_sys_poll 1210 yum-updatesd-3111 [003] 1637.254685: fget_light <-do_sys_poll
1195 yum-updatesd-3111 [003] 1637.254686: pipe_poll <-do_sys_poll 1211 yum-updatesd-3111 [003] 1637.254686: pipe_poll <-do_sys_poll
1196# echo -1 > /debug/tracing/set_ftrace_pid 1212# echo -1 > set_ftrace_pid
1197# cat /debug/tracing/trace |head 1213# cat trace |head
1198 # tracer: function 1214 # tracer: function
1199 # 1215 #
1200 # TASK-PID CPU# TIMESTAMP FUNCTION 1216 # TASK-PID CPU# TIMESTAMP FUNCTION
@@ -1216,6 +1232,51 @@ something like this simple program:
1216#include <fcntl.h> 1232#include <fcntl.h>
1217#include <unistd.h> 1233#include <unistd.h>
1218 1234
1235#define _STR(x) #x
1236#define STR(x) _STR(x)
1237#define MAX_PATH 256
1238
1239const char *find_debugfs(void)
1240{
1241 static char debugfs[MAX_PATH+1];
1242 static int debugfs_found;
1243 char type[100];
1244 FILE *fp;
1245
1246 if (debugfs_found)
1247 return debugfs;
1248
1249 if ((fp = fopen("/proc/mounts","r")) == NULL) {
1250 perror("/proc/mounts");
1251 return NULL;
1252 }
1253
1254 while (fscanf(fp, "%*s %"
1255 STR(MAX_PATH)
1256 "s %99s %*s %*d %*d\n",
1257 debugfs, type) == 2) {
1258 if (strcmp(type, "debugfs") == 0)
1259 break;
1260 }
1261 fclose(fp);
1262
1263 if (strcmp(type, "debugfs") != 0) {
1264 fprintf(stderr, "debugfs not mounted");
1265 return NULL;
1266 }
1267
1268 debugfs_found = 1;
1269
1270 return debugfs;
1271}
1272
1273const char *tracing_file(const char *file_name)
1274{
1275 static char trace_file[MAX_PATH+1];
1276 snprintf(trace_file, MAX_PATH, "%s/%s", find_debugfs(), file_name);
1277 return trace_file;
1278}
1279
1219int main (int argc, char **argv) 1280int main (int argc, char **argv)
1220{ 1281{
1221 if (argc < 1) 1282 if (argc < 1)
@@ -1226,12 +1287,12 @@ int main (int argc, char **argv)
1226 char line[64]; 1287 char line[64];
1227 int s; 1288 int s;
1228 1289
1229 ffd = open("/debug/tracing/current_tracer", O_WRONLY); 1290 ffd = open(tracing_file("current_tracer"), O_WRONLY);
1230 if (ffd < 0) 1291 if (ffd < 0)
1231 exit(-1); 1292 exit(-1);
1232 write(ffd, "nop", 3); 1293 write(ffd, "nop", 3);
1233 1294
1234 fd = open("/debug/tracing/set_ftrace_pid", O_WRONLY); 1295 fd = open(tracing_file("set_ftrace_pid"), O_WRONLY);
1235 s = sprintf(line, "%d\n", getpid()); 1296 s = sprintf(line, "%d\n", getpid());
1236 write(fd, line, s); 1297 write(fd, line, s);
1237 1298
@@ -1383,22 +1444,22 @@ want, depending on your needs.
1383 tracing_cpu_mask file) or you might sometimes see unordered 1444 tracing_cpu_mask file) or you might sometimes see unordered
1384 function calls while cpu tracing switch. 1445 function calls while cpu tracing switch.
1385 1446
1386 hide: echo nofuncgraph-cpu > /debug/tracing/trace_options 1447 hide: echo nofuncgraph-cpu > trace_options
1387 show: echo funcgraph-cpu > /debug/tracing/trace_options 1448 show: echo funcgraph-cpu > trace_options
1388 1449
1389- The duration (function's time of execution) is displayed on 1450- The duration (function's time of execution) is displayed on
1390 the closing bracket line of a function or on the same line 1451 the closing bracket line of a function or on the same line
1391 than the current function in case of a leaf one. It is default 1452 than the current function in case of a leaf one. It is default
1392 enabled. 1453 enabled.
1393 1454
1394 hide: echo nofuncgraph-duration > /debug/tracing/trace_options 1455 hide: echo nofuncgraph-duration > trace_options
1395 show: echo funcgraph-duration > /debug/tracing/trace_options 1456 show: echo funcgraph-duration > trace_options
1396 1457
1397- The overhead field precedes the duration field in case of 1458- The overhead field precedes the duration field in case of
1398 reached duration thresholds. 1459 reached duration thresholds.
1399 1460
1400 hide: echo nofuncgraph-overhead > /debug/tracing/trace_options 1461 hide: echo nofuncgraph-overhead > trace_options
1401 show: echo funcgraph-overhead > /debug/tracing/trace_options 1462 show: echo funcgraph-overhead > trace_options
1402 depends on: funcgraph-duration 1463 depends on: funcgraph-duration
1403 1464
1404 ie: 1465 ie:
@@ -1427,8 +1488,8 @@ want, depending on your needs.
1427- The task/pid field displays the thread cmdline and pid which 1488- The task/pid field displays the thread cmdline and pid which
1428 executed the function. It is default disabled. 1489 executed the function. It is default disabled.
1429 1490
1430 hide: echo nofuncgraph-proc > /debug/tracing/trace_options 1491 hide: echo nofuncgraph-proc > trace_options
1431 show: echo funcgraph-proc > /debug/tracing/trace_options 1492 show: echo funcgraph-proc > trace_options
1432 1493
1433 ie: 1494 ie:
1434 1495
@@ -1451,8 +1512,8 @@ want, depending on your needs.
1451 system clock since it started. A snapshot of this time is 1512 system clock since it started. A snapshot of this time is
1452 given on each entry/exit of functions 1513 given on each entry/exit of functions
1453 1514
1454 hide: echo nofuncgraph-abstime > /debug/tracing/trace_options 1515 hide: echo nofuncgraph-abstime > trace_options
1455 show: echo funcgraph-abstime > /debug/tracing/trace_options 1516 show: echo funcgraph-abstime > trace_options
1456 1517
1457 ie: 1518 ie:
1458 1519
@@ -1549,7 +1610,7 @@ listed in:
1549 1610
1550 available_filter_functions 1611 available_filter_functions
1551 1612
1552 # cat /debug/tracing/available_filter_functions 1613 # cat available_filter_functions
1553put_prev_task_idle 1614put_prev_task_idle
1554kmem_cache_create 1615kmem_cache_create
1555pick_next_task_rt 1616pick_next_task_rt
@@ -1561,12 +1622,12 @@ mutex_lock
1561If I am only interested in sys_nanosleep and hrtimer_interrupt: 1622If I am only interested in sys_nanosleep and hrtimer_interrupt:
1562 1623
1563 # echo sys_nanosleep hrtimer_interrupt \ 1624 # echo sys_nanosleep hrtimer_interrupt \
1564 > /debug/tracing/set_ftrace_filter 1625 > set_ftrace_filter
1565 # echo ftrace > /debug/tracing/current_tracer 1626 # echo ftrace > current_tracer
1566 # echo 1 > /debug/tracing/tracing_enabled 1627 # echo 1 > tracing_enabled
1567 # usleep 1 1628 # usleep 1
1568 # echo 0 > /debug/tracing/tracing_enabled 1629 # echo 0 > tracing_enabled
1569 # cat /debug/tracing/trace 1630 # cat trace
1570# tracer: ftrace 1631# tracer: ftrace
1571# 1632#
1572# TASK-PID CPU# TIMESTAMP FUNCTION 1633# TASK-PID CPU# TIMESTAMP FUNCTION
@@ -1577,7 +1638,7 @@ If I am only interested in sys_nanosleep and hrtimer_interrupt:
1577 1638
1578To see which functions are being traced, you can cat the file: 1639To see which functions are being traced, you can cat the file:
1579 1640
1580 # cat /debug/tracing/set_ftrace_filter 1641 # cat set_ftrace_filter
1581hrtimer_interrupt 1642hrtimer_interrupt
1582sys_nanosleep 1643sys_nanosleep
1583 1644
@@ -1597,7 +1658,7 @@ Note: It is better to use quotes to enclose the wild cards,
1597 otherwise the shell may expand the parameters into names 1658 otherwise the shell may expand the parameters into names
1598 of files in the local directory. 1659 of files in the local directory.
1599 1660
1600 # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter 1661 # echo 'hrtimer_*' > set_ftrace_filter
1601 1662
1602Produces: 1663Produces:
1603 1664
@@ -1618,7 +1679,7 @@ Produces:
1618 1679
1619Notice that we lost the sys_nanosleep. 1680Notice that we lost the sys_nanosleep.
1620 1681
1621 # cat /debug/tracing/set_ftrace_filter 1682 # cat set_ftrace_filter
1622hrtimer_run_queues 1683hrtimer_run_queues
1623hrtimer_run_pending 1684hrtimer_run_pending
1624hrtimer_init 1685hrtimer_init
@@ -1644,17 +1705,17 @@ To append to the filters, use '>>'
1644To clear out a filter so that all functions will be recorded 1705To clear out a filter so that all functions will be recorded
1645again: 1706again:
1646 1707
1647 # echo > /debug/tracing/set_ftrace_filter 1708 # echo > set_ftrace_filter
1648 # cat /debug/tracing/set_ftrace_filter 1709 # cat set_ftrace_filter
1649 # 1710 #
1650 1711
1651Again, now we want to append. 1712Again, now we want to append.
1652 1713
1653 # echo sys_nanosleep > /debug/tracing/set_ftrace_filter 1714 # echo sys_nanosleep > set_ftrace_filter
1654 # cat /debug/tracing/set_ftrace_filter 1715 # cat set_ftrace_filter
1655sys_nanosleep 1716sys_nanosleep
1656 # echo 'hrtimer_*' >> /debug/tracing/set_ftrace_filter 1717 # echo 'hrtimer_*' >> set_ftrace_filter
1657 # cat /debug/tracing/set_ftrace_filter 1718 # cat set_ftrace_filter
1658hrtimer_run_queues 1719hrtimer_run_queues
1659hrtimer_run_pending 1720hrtimer_run_pending
1660hrtimer_init 1721hrtimer_init
@@ -1677,7 +1738,7 @@ hrtimer_init_sleeper
1677The set_ftrace_notrace prevents those functions from being 1738The set_ftrace_notrace prevents those functions from being
1678traced. 1739traced.
1679 1740
1680 # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace 1741 # echo '*preempt*' '*lock*' > set_ftrace_notrace
1681 1742
1682Produces: 1743Produces:
1683 1744
@@ -1767,13 +1828,13 @@ the effect on the tracing is different. Every read from
1767trace_pipe is consumed. This means that subsequent reads will be 1828trace_pipe is consumed. This means that subsequent reads will be
1768different. The trace is live. 1829different. The trace is live.
1769 1830
1770 # echo function > /debug/tracing/current_tracer 1831 # echo function > current_tracer
1771 # cat /debug/tracing/trace_pipe > /tmp/trace.out & 1832 # cat trace_pipe > /tmp/trace.out &
1772[1] 4153 1833[1] 4153
1773 # echo 1 > /debug/tracing/tracing_enabled 1834 # echo 1 > tracing_enabled
1774 # usleep 1 1835 # usleep 1
1775 # echo 0 > /debug/tracing/tracing_enabled 1836 # echo 0 > tracing_enabled
1776 # cat /debug/tracing/trace 1837 # cat trace
1777# tracer: function 1838# tracer: function
1778# 1839#
1779# TASK-PID CPU# TIMESTAMP FUNCTION 1840# TASK-PID CPU# TIMESTAMP FUNCTION
@@ -1809,7 +1870,7 @@ number listed is the number of entries that can be recorded per
1809CPU. To know the full size, multiply the number of possible CPUS 1870CPU. To know the full size, multiply the number of possible CPUS
1810with the number of entries. 1871with the number of entries.
1811 1872
1812 # cat /debug/tracing/buffer_size_kb 1873 # cat buffer_size_kb
18131408 (units kilobytes) 18741408 (units kilobytes)
1814 1875
1815Note, to modify this, you must have tracing completely disabled. 1876Note, to modify this, you must have tracing completely disabled.
@@ -1817,18 +1878,18 @@ To do that, echo "nop" into the current_tracer. If the
1817current_tracer is not set to "nop", an EINVAL error will be 1878current_tracer is not set to "nop", an EINVAL error will be
1818returned. 1879returned.
1819 1880
1820 # echo nop > /debug/tracing/current_tracer 1881 # echo nop > current_tracer
1821 # echo 10000 > /debug/tracing/buffer_size_kb 1882 # echo 10000 > buffer_size_kb
1822 # cat /debug/tracing/buffer_size_kb 1883 # cat buffer_size_kb
182310000 (units kilobytes) 188410000 (units kilobytes)
1824 1885
1825The number of pages which will be allocated is limited to a 1886The number of pages which will be allocated is limited to a
1826percentage of available memory. Allocating too much will produce 1887percentage of available memory. Allocating too much will produce
1827an error. 1888an error.
1828 1889
1829 # echo 1000000000000 > /debug/tracing/buffer_size_kb 1890 # echo 1000000000000 > buffer_size_kb
1830-bash: echo: write error: Cannot allocate memory 1891-bash: echo: write error: Cannot allocate memory
1831 # cat /debug/tracing/buffer_size_kb 1892 # cat buffer_size_kb
183285 189385
1833 1894
1834----------- 1895-----------
diff --git a/Documentation/trace/function-graph-fold.vim b/Documentation/trace/function-graph-fold.vim
new file mode 100644
index 000000000000..0544b504c8b0
--- /dev/null
+++ b/Documentation/trace/function-graph-fold.vim
@@ -0,0 +1,42 @@
1" Enable folding for ftrace function_graph traces.
2"
3" To use, :source this file while viewing a function_graph trace, or use vim's
4" -S option to load from the command-line together with a trace. You can then
5" use the usual vim fold commands, such as "za", to open and close nested
6" functions. While closed, a fold will show the total time taken for a call,
7" as would normally appear on the line with the closing brace. Folded
8" functions will not include finish_task_switch(), so folding should remain
9" relatively sane even through a context switch.
10"
11" Note that this will almost certainly only work well with a
12" single-CPU trace (e.g. trace-cmd report --cpu 1).
13
14function! FunctionGraphFoldExpr(lnum)
15 let line = getline(a:lnum)
16 if line[-1:] == '{'
17 if line =~ 'finish_task_switch() {$'
18 return '>1'
19 endif
20 return 'a1'
21 elseif line[-1:] == '}'
22 return 's1'
23 else
24 return '='
25 endif
26endfunction
27
28function! FunctionGraphFoldText()
29 let s = split(getline(v:foldstart), '|', 1)
30 if getline(v:foldend+1) =~ 'finish_task_switch() {$'
31 let s[2] = ' task switch '
32 else
33 let e = split(getline(v:foldend), '|', 1)
34 let s[2] = e[2]
35 endif
36 return join(s, '|')
37endfunction
38
39setlocal foldexpr=FunctionGraphFoldExpr(v:lnum)
40setlocal foldtext=FunctionGraphFoldText()
41setlocal foldcolumn=12
42setlocal foldmethod=expr
diff --git a/Documentation/trace/mmiotrace.txt b/Documentation/trace/mmiotrace.txt
index 5731c67abc55..162effbfbdec 100644
--- a/Documentation/trace/mmiotrace.txt
+++ b/Documentation/trace/mmiotrace.txt
@@ -32,41 +32,41 @@ is no way to automatically detect if you are losing events due to CPUs racing.
32Usage Quick Reference 32Usage Quick Reference
33--------------------- 33---------------------
34 34
35$ mount -t debugfs debugfs /debug 35$ mount -t debugfs debugfs /sys/kernel/debug
36$ echo mmiotrace > /debug/tracing/current_tracer 36$ echo mmiotrace > /sys/kernel/debug/tracing/current_tracer
37$ cat /debug/tracing/trace_pipe > mydump.txt & 37$ cat /sys/kernel/debug/tracing/trace_pipe > mydump.txt &
38Start X or whatever. 38Start X or whatever.
39$ echo "X is up" > /debug/tracing/trace_marker 39$ echo "X is up" > /sys/kernel/debug/tracing/trace_marker
40$ echo nop > /debug/tracing/current_tracer 40$ echo nop > /sys/kernel/debug/tracing/current_tracer
41Check for lost events. 41Check for lost events.
42 42
43 43
44Usage 44Usage
45----- 45-----
46 46
47Make sure debugfs is mounted to /debug. If not, (requires root privileges) 47Make sure debugfs is mounted to /sys/kernel/debug. If not, (requires root privileges)
48$ mount -t debugfs debugfs /debug 48$ mount -t debugfs debugfs /sys/kernel/debug
49 49
50Check that the driver you are about to trace is not loaded. 50Check that the driver you are about to trace is not loaded.
51 51
52Activate mmiotrace (requires root privileges): 52Activate mmiotrace (requires root privileges):
53$ echo mmiotrace > /debug/tracing/current_tracer 53$ echo mmiotrace > /sys/kernel/debug/tracing/current_tracer
54 54
55Start storing the trace: 55Start storing the trace:
56$ cat /debug/tracing/trace_pipe > mydump.txt & 56$ cat /sys/kernel/debug/tracing/trace_pipe > mydump.txt &
57The 'cat' process should stay running (sleeping) in the background. 57The 'cat' process should stay running (sleeping) in the background.
58 58
59Load the driver you want to trace and use it. Mmiotrace will only catch MMIO 59Load the driver you want to trace and use it. Mmiotrace will only catch MMIO
60accesses to areas that are ioremapped while mmiotrace is active. 60accesses to areas that are ioremapped while mmiotrace is active.
61 61
62During tracing you can place comments (markers) into the trace by 62During tracing you can place comments (markers) into the trace by
63$ echo "X is up" > /debug/tracing/trace_marker 63$ echo "X is up" > /sys/kernel/debug/tracing/trace_marker
64This makes it easier to see which part of the (huge) trace corresponds to 64This makes it easier to see which part of the (huge) trace corresponds to
65which action. It is recommended to place descriptive markers about what you 65which action. It is recommended to place descriptive markers about what you
66do. 66do.
67 67
68Shut down mmiotrace (requires root privileges): 68Shut down mmiotrace (requires root privileges):
69$ echo nop > /debug/tracing/current_tracer 69$ echo nop > /sys/kernel/debug/tracing/current_tracer
70The 'cat' process exits. If it does not, kill it by issuing 'fg' command and 70The 'cat' process exits. If it does not, kill it by issuing 'fg' command and
71pressing ctrl+c. 71pressing ctrl+c.
72 72
@@ -78,10 +78,10 @@ to view your kernel log and look for "mmiotrace has lost events" warning. If
78events were lost, the trace is incomplete. You should enlarge the buffers and 78events were lost, the trace is incomplete. You should enlarge the buffers and
79try again. Buffers are enlarged by first seeing how large the current buffers 79try again. Buffers are enlarged by first seeing how large the current buffers
80are: 80are:
81$ cat /debug/tracing/buffer_size_kb 81$ cat /sys/kernel/debug/tracing/buffer_size_kb
82gives you a number. Approximately double this number and write it back, for 82gives you a number. Approximately double this number and write it back, for
83instance: 83instance:
84$ echo 128000 > /debug/tracing/buffer_size_kb 84$ echo 128000 > /sys/kernel/debug/tracing/buffer_size_kb
85Then start again from the top. 85Then start again from the top.
86 86
87If you are doing a trace for a driver project, e.g. Nouveau, you should also 87If you are doing a trace for a driver project, e.g. Nouveau, you should also
diff --git a/Documentation/trace/ring-buffer-design.txt b/Documentation/trace/ring-buffer-design.txt
new file mode 100644
index 000000000000..5b1d23d604c5
--- /dev/null
+++ b/Documentation/trace/ring-buffer-design.txt
@@ -0,0 +1,955 @@
1 Lockless Ring Buffer Design
2 ===========================
3
4Copyright 2009 Red Hat Inc.
5 Author: Steven Rostedt <srostedt@redhat.com>
6 License: The GNU Free Documentation License, Version 1.2
7 (dual licensed under the GPL v2)
8Reviewers: Mathieu Desnoyers, Huang Ying, Hidetoshi Seto,
9 and Frederic Weisbecker.
10
11
12Written for: 2.6.31
13
14Terminology used in this Document
15---------------------------------
16
17tail - where new writes happen in the ring buffer.
18
19head - where new reads happen in the ring buffer.
20
21producer - the task that writes into the ring buffer (same as writer)
22
23writer - same as producer
24
25consumer - the task that reads from the buffer (same as reader)
26
27reader - same as consumer.
28
29reader_page - A page outside the ring buffer used solely (for the most part)
30 by the reader.
31
32head_page - a pointer to the page that the reader will use next
33
34tail_page - a pointer to the page that will be written to next
35
36commit_page - a pointer to the page with the last finished non nested write.
37
38cmpxchg - hardware assisted atomic transaction that performs the following:
39
40 A = B iff previous A == C
41
42 R = cmpxchg(A, C, B) is saying that we replace A with B if and only if
43 current A is equal to C, and we put the old (current) A into R
44
45 R gets the previous A regardless if A is updated with B or not.
46
47 To see if the update was successful a compare of R == C may be used.
48
49The Generic Ring Buffer
50-----------------------
51
52The ring buffer can be used in either an overwrite mode or in
53producer/consumer mode.
54
55Producer/consumer mode is where the producer were to fill up the
56buffer before the consumer could free up anything, the producer
57will stop writing to the buffer. This will lose most recent events.
58
59Overwrite mode is where the produce were to fill up the buffer
60before the consumer could free up anything, the producer will
61overwrite the older data. This will lose the oldest events.
62
63No two writers can write at the same time (on the same per cpu buffer),
64but a writer may interrupt another writer, but it must finish writing
65before the previous writer may continue. This is very important to the
66algorithm. The writers act like a "stack". The way interrupts works
67enforces this behavior.
68
69
70 writer1 start
71 <preempted> writer2 start
72 <preempted> writer3 start
73 writer3 finishes
74 writer2 finishes
75 writer1 finishes
76
77This is very much like a writer being preempted by an interrupt and
78the interrupt doing a write as well.
79
80Readers can happen at any time. But no two readers may run at the
81same time, nor can a reader preempt/interrupt another reader. A reader
82can not preempt/interrupt a writer, but it may read/consume from the
83buffer at the same time as a writer is writing, but the reader must be
84on another processor to do so. A reader may read on its own processor
85and can be preempted by a writer.
86
87A writer can preempt a reader, but a reader can not preempt a writer.
88But a reader can read the buffer at the same time (on another processor)
89as a writer.
90
91The ring buffer is made up of a list of pages held together by a link list.
92
93At initialization a reader page is allocated for the reader that is not
94part of the ring buffer.
95
96The head_page, tail_page and commit_page are all initialized to point
97to the same page.
98
99The reader page is initialized to have its next pointer pointing to
100the head page, and its previous pointer pointing to a page before
101the head page.
102
103The reader has its own page to use. At start up time, this page is
104allocated but is not attached to the list. When the reader wants
105to read from the buffer, if its page is empty (like it is on start up)
106it will swap its page with the head_page. The old reader page will
107become part of the ring buffer and the head_page will be removed.
108The page after the inserted page (old reader_page) will become the
109new head page.
110
111Once the new page is given to the reader, the reader could do what
112it wants with it, as long as a writer has left that page.
113
114A sample of how the reader page is swapped: Note this does not
115show the head page in the buffer, it is for demonstrating a swap
116only.
117
118 +------+
119 |reader| RING BUFFER
120 |page |
121 +------+
122 +---+ +---+ +---+
123 | |-->| |-->| |
124 | |<--| |<--| |
125 +---+ +---+ +---+
126 ^ | ^ |
127 | +-------------+ |
128 +-----------------+
129
130
131 +------+
132 |reader| RING BUFFER
133 |page |-------------------+
134 +------+ v
135 | +---+ +---+ +---+
136 | | |-->| |-->| |
137 | | |<--| |<--| |<-+
138 | +---+ +---+ +---+ |
139 | ^ | ^ | |
140 | | +-------------+ | |
141 | +-----------------+ |
142 +------------------------------------+
143
144 +------+
145 |reader| RING BUFFER
146 |page |-------------------+
147 +------+ <---------------+ v
148 | ^ +---+ +---+ +---+
149 | | | |-->| |-->| |
150 | | | | | |<--| |<-+
151 | | +---+ +---+ +---+ |
152 | | | ^ | |
153 | | +-------------+ | |
154 | +-----------------------------+ |
155 +------------------------------------+
156
157 +------+
158 |buffer| RING BUFFER
159 |page |-------------------+
160 +------+ <---------------+ v
161 | ^ +---+ +---+ +---+
162 | | | | | |-->| |
163 | | New | | | |<--| |<-+
164 | | Reader +---+ +---+ +---+ |
165 | | page ----^ | |
166 | | | |
167 | +-----------------------------+ |
168 +------------------------------------+
169
170
171
172It is possible that the page swapped is the commit page and the tail page,
173if what is in the ring buffer is less than what is held in a buffer page.
174
175
176 reader page commit page tail page
177 | | |
178 v | |
179 +---+ | |
180 | |<----------+ |
181 | |<------------------------+
182 | |------+
183 +---+ |
184 |
185 v
186 +---+ +---+ +---+ +---+
187<---| |--->| |--->| |--->| |--->
188--->| |<---| |<---| |<---| |<---
189 +---+ +---+ +---+ +---+
190
191This case is still valid for this algorithm.
192When the writer leaves the page, it simply goes into the ring buffer
193since the reader page still points to the next location in the ring
194buffer.
195
196
197The main pointers:
198
199 reader page - The page used solely by the reader and is not part
200 of the ring buffer (may be swapped in)
201
202 head page - the next page in the ring buffer that will be swapped
203 with the reader page.
204
205 tail page - the page where the next write will take place.
206
207 commit page - the page that last finished a write.
208
209The commit page only is updated by the outer most writer in the
210writer stack. A writer that preempts another writer will not move the
211commit page.
212
213When data is written into the ring buffer, a position is reserved
214in the ring buffer and passed back to the writer. When the writer
215is finished writing data into that position, it commits the write.
216
217Another write (or a read) may take place at anytime during this
218transaction. If another write happens it must finish before continuing
219with the previous write.
220
221
222 Write reserve:
223
224 Buffer page
225 +---------+
226 |written |
227 +---------+ <--- given back to writer (current commit)
228 |reserved |
229 +---------+ <--- tail pointer
230 | empty |
231 +---------+
232
233 Write commit:
234
235 Buffer page
236 +---------+
237 |written |
238 +---------+
239 |written |
240 +---------+ <--- next positon for write (current commit)
241 | empty |
242 +---------+
243
244
245 If a write happens after the first reserve:
246
247 Buffer page
248 +---------+
249 |written |
250 +---------+ <-- current commit
251 |reserved |
252 +---------+ <--- given back to second writer
253 |reserved |
254 +---------+ <--- tail pointer
255
256 After second writer commits:
257
258
259 Buffer page
260 +---------+
261 |written |
262 +---------+ <--(last full commit)
263 |reserved |
264 +---------+
265 |pending |
266 |commit |
267 +---------+ <--- tail pointer
268
269 When the first writer commits:
270
271 Buffer page
272 +---------+
273 |written |
274 +---------+
275 |written |
276 +---------+
277 |written |
278 +---------+ <--(last full commit and tail pointer)
279
280
281The commit pointer points to the last write location that was
282committed without preempting another write. When a write that
283preempted another write is committed, it only becomes a pending commit
284and will not be a full commit till all writes have been committed.
285
286The commit page points to the page that has the last full commit.
287The tail page points to the page with the last write (before
288committing).
289
290The tail page is always equal to or after the commit page. It may
291be several pages ahead. If the tail page catches up to the commit
292page then no more writes may take place (regardless of the mode
293of the ring buffer: overwrite and produce/consumer).
294
295The order of pages are:
296
297 head page
298 commit page
299 tail page
300
301Possible scenario:
302 tail page
303 head page commit page |
304 | | |
305 v v v
306 +---+ +---+ +---+ +---+
307<---| |--->| |--->| |--->| |--->
308--->| |<---| |<---| |<---| |<---
309 +---+ +---+ +---+ +---+
310
311There is a special case that the head page is after either the commit page
312and possibly the tail page. That is when the commit (and tail) page has been
313swapped with the reader page. This is because the head page is always
314part of the ring buffer, but the reader page is not. When ever there
315has been less than a full page that has been committed inside the ring buffer,
316and a reader swaps out a page, it will be swapping out the commit page.
317
318
319 reader page commit page tail page
320 | | |
321 v | |
322 +---+ | |
323 | |<----------+ |
324 | |<------------------------+
325 | |------+
326 +---+ |
327 |
328 v
329 +---+ +---+ +---+ +---+
330<---| |--->| |--->| |--->| |--->
331--->| |<---| |<---| |<---| |<---
332 +---+ +---+ +---+ +---+
333 ^
334 |
335 head page
336
337
338In this case, the head page will not move when the tail and commit
339move back into the ring buffer.
340
341The reader can not swap a page into the ring buffer if the commit page
342is still on that page. If the read meets the last commit (real commit
343not pending or reserved), then there is nothing more to read.
344The buffer is considered empty until another full commit finishes.
345
346When the tail meets the head page, if the buffer is in overwrite mode,
347the head page will be pushed ahead one. If the buffer is in producer/consumer
348mode, the write will fail.
349
350Overwrite mode:
351
352 tail page
353 |
354 v
355 +---+ +---+ +---+ +---+
356<---| |--->| |--->| |--->| |--->
357--->| |<---| |<---| |<---| |<---
358 +---+ +---+ +---+ +---+
359 ^
360 |
361 head page
362
363
364 tail page
365 |
366 v
367 +---+ +---+ +---+ +---+
368<---| |--->| |--->| |--->| |--->
369--->| |<---| |<---| |<---| |<---
370 +---+ +---+ +---+ +---+
371 ^
372 |
373 head page
374
375
376 tail page
377 |
378 v
379 +---+ +---+ +---+ +---+
380<---| |--->| |--->| |--->| |--->
381--->| |<---| |<---| |<---| |<---
382 +---+ +---+ +---+ +---+
383 ^
384 |
385 head page
386
387Note, the reader page will still point to the previous head page.
388But when a swap takes place, it will use the most recent head page.
389
390
391Making the Ring Buffer Lockless:
392--------------------------------
393
394The main idea behind the lockless algorithm is to combine the moving
395of the head_page pointer with the swapping of pages with the reader.
396State flags are placed inside the pointer to the page. To do this,
397each page must be aligned in memory by 4 bytes. This will allow the 2
398least significant bits of the address to be used as flags. Since
399they will always be zero for the address. To get the address,
400simply mask out the flags.
401
402 MASK = ~3
403
404 address & MASK
405
406Two flags will be kept by these two bits:
407
408 HEADER - the page being pointed to is a head page
409
410 UPDATE - the page being pointed to is being updated by a writer
411 and was or is about to be a head page.
412
413
414 reader page
415 |
416 v
417 +---+
418 | |------+
419 +---+ |
420 |
421 v
422 +---+ +---+ +---+ +---+
423<---| |--->| |-H->| |--->| |--->
424--->| |<---| |<---| |<---| |<---
425 +---+ +---+ +---+ +---+
426
427
428The above pointer "-H->" would have the HEADER flag set. That is
429the next page is the next page to be swapped out by the reader.
430This pointer means the next page is the head page.
431
432When the tail page meets the head pointer, it will use cmpxchg to
433change the pointer to the UPDATE state:
434
435
436 tail page
437 |
438 v
439 +---+ +---+ +---+ +---+
440<---| |--->| |-H->| |--->| |--->
441--->| |<---| |<---| |<---| |<---
442 +---+ +---+ +---+ +---+
443
444 tail page
445 |
446 v
447 +---+ +---+ +---+ +---+
448<---| |--->| |-U->| |--->| |--->
449--->| |<---| |<---| |<---| |<---
450 +---+ +---+ +---+ +---+
451
452"-U->" represents a pointer in the UPDATE state.
453
454Any access to the reader will need to take some sort of lock to serialize
455the readers. But the writers will never take a lock to write to the
456ring buffer. This means we only need to worry about a single reader,
457and writes only preempt in "stack" formation.
458
459When the reader tries to swap the page with the ring buffer, it
460will also use cmpxchg. If the flag bit in the pointer to the
461head page does not have the HEADER flag set, the compare will fail
462and the reader will need to look for the new head page and try again.
463Note, the flag UPDATE and HEADER are never set at the same time.
464
465The reader swaps the reader page as follows:
466
467 +------+
468 |reader| RING BUFFER
469 |page |
470 +------+
471 +---+ +---+ +---+
472 | |--->| |--->| |
473 | |<---| |<---| |
474 +---+ +---+ +---+
475 ^ | ^ |
476 | +---------------+ |
477 +-----H-------------+
478
479The reader sets the reader page next pointer as HEADER to the page after
480the head page.
481
482
483 +------+
484 |reader| RING BUFFER
485 |page |-------H-----------+
486 +------+ v
487 | +---+ +---+ +---+
488 | | |--->| |--->| |
489 | | |<---| |<---| |<-+
490 | +---+ +---+ +---+ |
491 | ^ | ^ | |
492 | | +---------------+ | |
493 | +-----H-------------+ |
494 +--------------------------------------+
495
496It does a cmpxchg with the pointer to the previous head page to make it
497point to the reader page. Note that the new pointer does not have the HEADER
498flag set. This action atomically moves the head page forward.
499
500 +------+
501 |reader| RING BUFFER
502 |page |-------H-----------+
503 +------+ v
504 | ^ +---+ +---+ +---+
505 | | | |-->| |-->| |
506 | | | |<--| |<--| |<-+
507 | | +---+ +---+ +---+ |
508 | | | ^ | |
509 | | +-------------+ | |
510 | +-----------------------------+ |
511 +------------------------------------+
512
513After the new head page is set, the previous pointer of the head page is
514updated to the reader page.
515
516 +------+
517 |reader| RING BUFFER
518 |page |-------H-----------+
519 +------+ <---------------+ v
520 | ^ +---+ +---+ +---+
521 | | | |-->| |-->| |
522 | | | | | |<--| |<-+
523 | | +---+ +---+ +---+ |
524 | | | ^ | |
525 | | +-------------+ | |
526 | +-----------------------------+ |
527 +------------------------------------+
528
529 +------+
530 |buffer| RING BUFFER
531 |page |-------H-----------+ <--- New head page
532 +------+ <---------------+ v
533 | ^ +---+ +---+ +---+
534 | | | | | |-->| |
535 | | New | | | |<--| |<-+
536 | | Reader +---+ +---+ +---+ |
537 | | page ----^ | |
538 | | | |
539 | +-----------------------------+ |
540 +------------------------------------+
541
542Another important point. The page that the reader page points back to
543by its previous pointer (the one that now points to the new head page)
544never points back to the reader page. That is because the reader page is
545not part of the ring buffer. Traversing the ring buffer via the next pointers
546will always stay in the ring buffer. Traversing the ring buffer via the
547prev pointers may not.
548
549Note, the way to determine a reader page is simply by examining the previous
550pointer of the page. If the next pointer of the previous page does not
551point back to the original page, then the original page is a reader page:
552
553
554 +--------+
555 | reader | next +----+
556 | page |-------->| |<====== (buffer page)
557 +--------+ +----+
558 | | ^
559 | v | next
560 prev | +----+
561 +------------->| |
562 +----+
563
564The way the head page moves forward:
565
566When the tail page meets the head page and the buffer is in overwrite mode
567and more writes take place, the head page must be moved forward before the
568writer may move the tail page. The way this is done is that the writer
569performs a cmpxchg to convert the pointer to the head page from the HEADER
570flag to have the UPDATE flag set. Once this is done, the reader will
571not be able to swap the head page from the buffer, nor will it be able to
572move the head page, until the writer is finished with the move.
573
574This eliminates any races that the reader can have on the writer. The reader
575must spin, and this is why the reader can not preempt the writer.
576
577 tail page
578 |
579 v
580 +---+ +---+ +---+ +---+
581<---| |--->| |-H->| |--->| |--->
582--->| |<---| |<---| |<---| |<---
583 +---+ +---+ +---+ +---+
584
585 tail page
586 |
587 v
588 +---+ +---+ +---+ +---+
589<---| |--->| |-U->| |--->| |--->
590--->| |<---| |<---| |<---| |<---
591 +---+ +---+ +---+ +---+
592
593The following page will be made into the new head page.
594
595 tail page
596 |
597 v
598 +---+ +---+ +---+ +---+
599<---| |--->| |-U->| |-H->| |--->
600--->| |<---| |<---| |<---| |<---
601 +---+ +---+ +---+ +---+
602
603After the new head page has been set, we can set the old head page
604pointer back to NORMAL.
605
606 tail page
607 |
608 v
609 +---+ +---+ +---+ +---+
610<---| |--->| |--->| |-H->| |--->
611--->| |<---| |<---| |<---| |<---
612 +---+ +---+ +---+ +---+
613
614After the head page has been moved, the tail page may now move forward.
615
616 tail page
617 |
618 v
619 +---+ +---+ +---+ +---+
620<---| |--->| |--->| |-H->| |--->
621--->| |<---| |<---| |<---| |<---
622 +---+ +---+ +---+ +---+
623
624
625The above are the trivial updates. Now for the more complex scenarios.
626
627
628As stated before, if enough writes preempt the first write, the
629tail page may make it all the way around the buffer and meet the commit
630page. At this time, we must start dropping writes (usually with some kind
631of warning to the user). But what happens if the commit was still on the
632reader page? The commit page is not part of the ring buffer. The tail page
633must account for this.
634
635
636 reader page commit page
637 | |
638 v |
639 +---+ |
640 | |<----------+
641 | |
642 | |------+
643 +---+ |
644 |
645 v
646 +---+ +---+ +---+ +---+
647<---| |--->| |-H->| |--->| |--->
648--->| |<---| |<---| |<---| |<---
649 +---+ +---+ +---+ +---+
650 ^
651 |
652 tail page
653
654If the tail page were to simply push the head page forward, the commit when
655leaving the reader page would not be pointing to the correct page.
656
657The solution to this is to test if the commit page is on the reader page
658before pushing the head page. If it is, then it can be assumed that the
659tail page wrapped the buffer, and we must drop new writes.
660
661This is not a race condition, because the commit page can only be moved
662by the outter most writer (the writer that was preempted).
663This means that the commit will not move while a writer is moving the
664tail page. The reader can not swap the reader page if it is also being
665used as the commit page. The reader can simply check that the commit
666is off the reader page. Once the commit page leaves the reader page
667it will never go back on it unless a reader does another swap with the
668buffer page that is also the commit page.
669
670
671Nested writes
672-------------
673
674In the pushing forward of the tail page we must first push forward
675the head page if the head page is the next page. If the head page
676is not the next page, the tail page is simply updated with a cmpxchg.
677
678Only writers move the tail page. This must be done atomically to protect
679against nested writers.
680
681 temp_page = tail_page
682 next_page = temp_page->next
683 cmpxchg(tail_page, temp_page, next_page)
684
685The above will update the tail page if it is still pointing to the expected
686page. If this fails, a nested write pushed it forward, the the current write
687does not need to push it.
688
689
690 temp page
691 |
692 v
693 tail page
694 |
695 v
696 +---+ +---+ +---+ +---+
697<---| |--->| |--->| |--->| |--->
698--->| |<---| |<---| |<---| |<---
699 +---+ +---+ +---+ +---+
700
701Nested write comes in and moves the tail page forward:
702
703 tail page (moved by nested writer)
704 temp page |
705 | |
706 v v
707 +---+ +---+ +---+ +---+
708<---| |--->| |--->| |--->| |--->
709--->| |<---| |<---| |<---| |<---
710 +---+ +---+ +---+ +---+
711
712The above would fail the cmpxchg, but since the tail page has already
713been moved forward, the writer will just try again to reserve storage
714on the new tail page.
715
716But the moving of the head page is a bit more complex.
717
718 tail page
719 |
720 v
721 +---+ +---+ +---+ +---+
722<---| |--->| |-H->| |--->| |--->
723--->| |<---| |<---| |<---| |<---
724 +---+ +---+ +---+ +---+
725
726The write converts the head page pointer to UPDATE.
727
728 tail page
729 |
730 v
731 +---+ +---+ +---+ +---+
732<---| |--->| |-U->| |--->| |--->
733--->| |<---| |<---| |<---| |<---
734 +---+ +---+ +---+ +---+
735
736But if a nested writer preempts here. It will see that the next
737page is a head page, but it is also nested. It will detect that
738it is nested and will save that information. The detection is the
739fact that it sees the UPDATE flag instead of a HEADER or NORMAL
740pointer.
741
742The nested writer will set the new head page pointer.
743
744 tail page
745 |
746 v
747 +---+ +---+ +---+ +---+
748<---| |--->| |-U->| |-H->| |--->
749--->| |<---| |<---| |<---| |<---
750 +---+ +---+ +---+ +---+
751
752But it will not reset the update back to normal. Only the writer
753that converted a pointer from HEAD to UPDATE will convert it back
754to NORMAL.
755
756 tail page
757 |
758 v
759 +---+ +---+ +---+ +---+
760<---| |--->| |-U->| |-H->| |--->
761--->| |<---| |<---| |<---| |<---
762 +---+ +---+ +---+ +---+
763
764After the nested writer finishes, the outer most writer will convert
765the UPDATE pointer to NORMAL.
766
767
768 tail page
769 |
770 v
771 +---+ +---+ +---+ +---+
772<---| |--->| |--->| |-H->| |--->
773--->| |<---| |<---| |<---| |<---
774 +---+ +---+ +---+ +---+
775
776
777It can be even more complex if several nested writes came in and moved
778the tail page ahead several pages:
779
780
781(first writer)
782
783 tail page
784 |
785 v
786 +---+ +---+ +---+ +---+
787<---| |--->| |-H->| |--->| |--->
788--->| |<---| |<---| |<---| |<---
789 +---+ +---+ +---+ +---+
790
791The write converts the head page pointer to UPDATE.
792
793 tail page
794 |
795 v
796 +---+ +---+ +---+ +---+
797<---| |--->| |-U->| |--->| |--->
798--->| |<---| |<---| |<---| |<---
799 +---+ +---+ +---+ +---+
800
801Next writer comes in, and sees the update and sets up the new
802head page.
803
804(second writer)
805
806 tail page
807 |
808 v
809 +---+ +---+ +---+ +---+
810<---| |--->| |-U->| |-H->| |--->
811--->| |<---| |<---| |<---| |<---
812 +---+ +---+ +---+ +---+
813
814The nested writer moves the tail page forward. But does not set the old
815update page to NORMAL because it is not the outer most writer.
816
817 tail page
818 |
819 v
820 +---+ +---+ +---+ +---+
821<---| |--->| |-U->| |-H->| |--->
822--->| |<---| |<---| |<---| |<---
823 +---+ +---+ +---+ +---+
824
825Another writer preempts and sees the page after the tail page is a head page.
826It changes it from HEAD to UPDATE.
827
828(third writer)
829
830 tail page
831 |
832 v
833 +---+ +---+ +---+ +---+
834<---| |--->| |-U->| |-U->| |--->
835--->| |<---| |<---| |<---| |<---
836 +---+ +---+ +---+ +---+
837
838The writer will move the head page forward:
839
840
841(third writer)
842
843 tail page
844 |
845 v
846 +---+ +---+ +---+ +---+
847<---| |--->| |-U->| |-U->| |-H->
848--->| |<---| |<---| |<---| |<---
849 +---+ +---+ +---+ +---+
850
851But now that the third writer did change the HEAD flag to UPDATE it
852will convert it to normal:
853
854
855(third writer)
856
857 tail page
858 |
859 v
860 +---+ +---+ +---+ +---+
861<---| |--->| |-U->| |--->| |-H->
862--->| |<---| |<---| |<---| |<---
863 +---+ +---+ +---+ +---+
864
865
866Then it will move the tail page, and return back to the second writer.
867
868
869(second writer)
870
871 tail page
872 |
873 v
874 +---+ +---+ +---+ +---+
875<---| |--->| |-U->| |--->| |-H->
876--->| |<---| |<---| |<---| |<---
877 +---+ +---+ +---+ +---+
878
879
880The second writer will fail to move the tail page because it was already
881moved, so it will try again and add its data to the new tail page.
882It will return to the first writer.
883
884
885(first writer)
886
887 tail page
888 |
889 v
890 +---+ +---+ +---+ +---+
891<---| |--->| |-U->| |--->| |-H->
892--->| |<---| |<---| |<---| |<---
893 +---+ +---+ +---+ +---+
894
895The first writer can not know atomically test if the tail page moved
896while it updates the HEAD page. It will then update the head page to
897what it thinks is the new head page.
898
899
900(first writer)
901
902 tail page
903 |
904 v
905 +---+ +---+ +---+ +---+
906<---| |--->| |-U->| |-H->| |-H->
907--->| |<---| |<---| |<---| |<---
908 +---+ +---+ +---+ +---+
909
910Since the cmpxchg returns the old value of the pointer the first writer
911will see it succeeded in updating the pointer from NORMAL to HEAD.
912But as we can see, this is not good enough. It must also check to see
913if the tail page is either where it use to be or on the next page:
914
915
916(first writer)
917
918 A B tail page
919 | | |
920 v v v
921 +---+ +---+ +---+ +---+
922<---| |--->| |-U->| |-H->| |-H->
923--->| |<---| |<---| |<---| |<---
924 +---+ +---+ +---+ +---+
925
926If tail page != A and tail page does not equal B, then it must reset the
927pointer back to NORMAL. The fact that it only needs to worry about
928nested writers, it only needs to check this after setting the HEAD page.
929
930
931(first writer)
932
933 A B tail page
934 | | |
935 v v v
936 +---+ +---+ +---+ +---+
937<---| |--->| |-U->| |--->| |-H->
938--->| |<---| |<---| |<---| |<---
939 +---+ +---+ +---+ +---+
940
941Now the writer can update the head page. This is also why the head page must
942remain in UPDATE and only reset by the outer most writer. This prevents
943the reader from seeing the incorrect head page.
944
945
946(first writer)
947
948 A B tail page
949 | | |
950 v v v
951 +---+ +---+ +---+ +---+
952<---| |--->| |--->| |--->| |-H->
953--->| |<---| |<---| |<---| |<---
954 +---+ +---+ +---+ +---+
955
diff --git a/Documentation/vgaarbiter.txt b/Documentation/vgaarbiter.txt
new file mode 100644
index 000000000000..987f9b0a5ece
--- /dev/null
+++ b/Documentation/vgaarbiter.txt
@@ -0,0 +1,194 @@
1
2VGA Arbiter
3===========
4
5Graphic devices are accessed through ranges in I/O or memory space. While most
6modern devices allow relocation of such ranges, some "Legacy" VGA devices
7implemented on PCI will typically have the same "hard-decoded" addresses as
8they did on ISA. For more details see "PCI Bus Binding to IEEE Std 1275-1994
9Standard for Boot (Initialization Configuration) Firmware Revision 2.1"
10Section 7, Legacy Devices.
11
12The Resource Access Control (RAC) module inside the X server [0] existed for
13the legacy VGA arbitration task (besides other bus management tasks) when more
14than one legacy device co-exists on the same machine. But the problem happens
15when these devices are trying to be accessed by different userspace clients
16(e.g. two server in parallel). Their address assignments conflict. Moreover,
17ideally, being an userspace application, it is not the role of the the X
18server to control bus resources. Therefore an arbitration scheme outside of
19the X server is needed to control the sharing of these resources. This
20document introduces the operation of the VGA arbiter implemented for Linux
21kernel.
22
23----------------------------------------------------------------------------
24
25I. Details and Theory of Operation
26 I.1 vgaarb
27 I.2 libpciaccess
28 I.3 xf86VGAArbiter (X server implementation)
29II. Credits
30III.References
31
32
33I. Details and Theory of Operation
34==================================
35
36I.1 vgaarb
37----------
38
39The vgaarb is a module of the Linux Kernel. When it is initially loaded, it
40scans all PCI devices and adds the VGA ones inside the arbitration. The
41arbiter then enables/disables the decoding on different devices of the VGA
42legacy instructions. Device which do not want/need to use the arbiter may
43explicitly tell it by calling vga_set_legacy_decoding().
44
45The kernel exports a char device interface (/dev/vga_arbiter) to the clients,
46which has the following semantics:
47
48 open : open user instance of the arbiter. By default, it's attached to
49 the default VGA device of the system.
50
51 close : close user instance. Release locks made by the user
52
53 read : return a string indicating the status of the target like:
54
55 "<card_ID>,decodes=<io_state>,owns=<io_state>,locks=<io_state> (ic,mc)"
56
57 An IO state string is of the form {io,mem,io+mem,none}, mc and
58 ic are respectively mem and io lock counts (for debugging/
59 diagnostic only). "decodes" indicate what the card currently
60 decodes, "owns" indicates what is currently enabled on it, and
61 "locks" indicates what is locked by this card. If the card is
62 unplugged, we get "invalid" then for card_ID and an -ENODEV
63 error is returned for any command until a new card is targeted.
64
65
66 write : write a command to the arbiter. List of commands:
67
68 target <card_ID> : switch target to card <card_ID> (see below)
69 lock <io_state> : acquires locks on target ("none" is an invalid io_state)
70 trylock <io_state> : non-blocking acquire locks on target (returns EBUSY if
71 unsuccessful)
72 unlock <io_state> : release locks on target
73 unlock all : release all locks on target held by this user (not
74 implemented yet)
75 decodes <io_state> : set the legacy decoding attributes for the card
76
77 poll : event if something changes on any card (not just the
78 target)
79
80 card_ID is of the form "PCI:domain:bus:dev.fn". It can be set to "default"
81 to go back to the system default card (TODO: not implemented yet). Currently,
82 only PCI is supported as a prefix, but the userland API may support other bus
83 types in the future, even if the current kernel implementation doesn't.
84
85Note about locks:
86
87The driver keeps track of which user has which locks on which card. It
88supports stacking, like the kernel one. This complexifies the implementation
89a bit, but makes the arbiter more tolerant to user space problems and able
90to properly cleanup in all cases when a process dies.
91Currently, a max of 16 cards can have locks simultaneously issued from
92user space for a given user (file descriptor instance) of the arbiter.
93
94In the case of devices hot-{un,}plugged, there is a hook - pci_notify() - to
95notify them being added/removed in the system and automatically added/removed
96in the arbiter.
97
98There's also a in-kernel API of the arbiter in the case of DRM, vgacon and
99others which may use the arbiter.
100
101
102I.2 libpciaccess
103----------------
104
105To use the vga arbiter char device it was implemented an API inside the
106libpciaccess library. One fieldd was added to struct pci_device (each device
107on the system):
108
109 /* the type of resource decoded by the device */
110 int vgaarb_rsrc;
111
112Besides it, in pci_system were added:
113
114 int vgaarb_fd;
115 int vga_count;
116 struct pci_device *vga_target;
117 struct pci_device *vga_default_dev;
118
119
120The vga_count is usually need to keep informed how many cards are being
121arbitrated, so for instance if there's only one then it can totally escape the
122scheme.
123
124
125These functions below acquire VGA resources for the given card and mark those
126resources as locked. If the resources requested are "normal" (and not legacy)
127resources, the arbiter will first check whether the card is doing legacy
128decoding for that type of resource. If yes, the lock is "converted" into a
129legacy resource lock. The arbiter will first look for all VGA cards that
130might conflict and disable their IOs and/or Memory access, including VGA
131forwarding on P2P bridges if necessary, so that the requested resources can
132be used. Then, the card is marked as locking these resources and the IO and/or
133Memory access is enabled on the card (including VGA forwarding on parent
134P2P bridges if any). In the case of vga_arb_lock(), the function will block
135if some conflicting card is already locking one of the required resources (or
136any resource on a different bus segment, since P2P bridges don't differentiate
137VGA memory and IO afaik). If the card already owns the resources, the function
138succeeds. vga_arb_trylock() will return (-EBUSY) instead of blocking. Nested
139calls are supported (a per-resource counter is maintained).
140
141
142Set the target device of this client.
143 int pci_device_vgaarb_set_target (struct pci_device *dev);
144
145
146For instance, in x86 if two devices on the same bus want to lock different
147resources, both will succeed (lock). If devices are in different buses and
148trying to lock different resources, only the first who tried succeeds.
149 int pci_device_vgaarb_lock (void);
150 int pci_device_vgaarb_trylock (void);
151
152Unlock resources of device.
153 int pci_device_vgaarb_unlock (void);
154
155Indicates to the arbiter if the card decodes legacy VGA IOs, legacy VGA
156Memory, both, or none. All cards default to both, the card driver (fbdev for
157example) should tell the arbiter if it has disabled legacy decoding, so the
158card can be left out of the arbitration process (and can be safe to take
159interrupts at any time.
160 int pci_device_vgaarb_decodes (int new_vgaarb_rsrc);
161
162Connects to the arbiter device, allocates the struct
163 int pci_device_vgaarb_init (void);
164
165Close the connection
166 void pci_device_vgaarb_fini (void);
167
168
169I.3 xf86VGAArbiter (X server implementation)
170--------------------------------------------
171
172(TODO)
173
174X server basically wraps all the functions that touch VGA registers somehow.
175
176
177II. Credits
178===========
179
180Benjamin Herrenschmidt (IBM?) started this work when he discussed such design
181with the Xorg community in 2005 [1, 2]. In the end of 2007, Paulo Zanoni and
182Tiago Vignatti (both of C3SL/Federal University of Paraná) proceeded his work
183enhancing the kernel code to adapt as a kernel module and also did the
184implementation of the user space side [3]. Now (2009) Tiago Vignatti and Dave
185Airlie finally put this work in shape and queued to Jesse Barnes' PCI tree.
186
187
188III. References
189==============
190
191[0] http://cgit.freedesktop.org/xorg/xserver/commit/?id=4b42448a2388d40f257774fbffdccaea87bd0347
192[1] http://lists.freedesktop.org/archives/xorg/2005-March/006663.html
193[2] http://lists.freedesktop.org/archives/xorg/2005-March/006745.html
194[3] http://lists.freedesktop.org/archives/xorg/2007-October/029507.html
diff --git a/Documentation/video4linux/CARDLIST.cx23885 b/Documentation/video4linux/CARDLIST.cx23885
index 91aa3c0f0dd2..525edb37c758 100644
--- a/Documentation/video4linux/CARDLIST.cx23885
+++ b/Documentation/video4linux/CARDLIST.cx23885
@@ -16,3 +16,10 @@
16 15 -> TeVii S470 [d470:9022] 16 15 -> TeVii S470 [d470:9022]
17 16 -> DVBWorld DVB-S2 2005 [0001:2005] 17 16 -> DVBWorld DVB-S2 2005 [0001:2005]
18 17 -> NetUP Dual DVB-S2 CI [1b55:2a2c] 18 17 -> NetUP Dual DVB-S2 CI [1b55:2a2c]
19 18 -> Hauppauge WinTV-HVR1270 [0070:2211]
20 19 -> Hauppauge WinTV-HVR1275 [0070:2215]
21 20 -> Hauppauge WinTV-HVR1255 [0070:2251]
22 21 -> Hauppauge WinTV-HVR1210 [0070:2291,0070:2295]
23 22 -> Mygica X8506 DMB-TH [14f1:8651]
24 23 -> Magic-Pro ProHDTV Extreme 2 [14f1:8657]
25 24 -> Hauppauge WinTV-HVR1850 [0070:8541]
diff --git a/Documentation/video4linux/CARDLIST.cx88 b/Documentation/video4linux/CARDLIST.cx88
index 71e9db0b26f7..3385f8b094a5 100644
--- a/Documentation/video4linux/CARDLIST.cx88
+++ b/Documentation/video4linux/CARDLIST.cx88
@@ -6,8 +6,8 @@
6 5 -> Leadtek Winfast 2000XP Expert [107d:6611,107d:6613] 6 5 -> Leadtek Winfast 2000XP Expert [107d:6611,107d:6613]
7 6 -> AverTV Studio 303 (M126) [1461:000b] 7 6 -> AverTV Studio 303 (M126) [1461:000b]
8 7 -> MSI TV-@nywhere Master [1462:8606] 8 7 -> MSI TV-@nywhere Master [1462:8606]
9 8 -> Leadtek Winfast DV2000 [107d:6620] 9 8 -> Leadtek Winfast DV2000 [107d:6620,107d:6621]
10 9 -> Leadtek PVR 2000 [107d:663b,107d:663c,107d:6632] 10 9 -> Leadtek PVR 2000 [107d:663b,107d:663c,107d:6632,107d:6630,107d:6638,107d:6631,107d:6637,107d:663d]
11 10 -> IODATA GV-VCP3/PCI [10fc:d003] 11 10 -> IODATA GV-VCP3/PCI [10fc:d003]
12 11 -> Prolink PlayTV PVR 12 11 -> Prolink PlayTV PVR
13 12 -> ASUS PVR-416 [1043:4823,1461:c111] 13 12 -> ASUS PVR-416 [1043:4823,1461:c111]
@@ -59,7 +59,7 @@
59 58 -> Pinnacle PCTV HD 800i [11bd:0051] 59 58 -> Pinnacle PCTV HD 800i [11bd:0051]
60 59 -> DViCO FusionHDTV 5 PCI nano [18ac:d530] 60 59 -> DViCO FusionHDTV 5 PCI nano [18ac:d530]
61 60 -> Pinnacle Hybrid PCTV [12ab:1788] 61 60 -> Pinnacle Hybrid PCTV [12ab:1788]
62 61 -> Winfast TV2000 XP Global [107d:6f18] 62 61 -> Leadtek TV2000 XP Global [107d:6f18,107d:6618]
63 62 -> PowerColor RA330 [14f1:ea3d] 63 62 -> PowerColor RA330 [14f1:ea3d]
64 63 -> Geniatech X8000-MT DVBT [14f1:8852] 64 63 -> Geniatech X8000-MT DVBT [14f1:8852]
65 64 -> DViCO FusionHDTV DVB-T PRO [18ac:db30] 65 64 -> DViCO FusionHDTV DVB-T PRO [18ac:db30]
@@ -78,3 +78,6 @@
78 77 -> TBS 8910 DVB-S [8910:8888] 78 77 -> TBS 8910 DVB-S [8910:8888]
79 78 -> Prof 6200 DVB-S [b022:3022] 79 78 -> Prof 6200 DVB-S [b022:3022]
80 79 -> Terratec Cinergy HT PCI MKII [153b:1177] 80 79 -> Terratec Cinergy HT PCI MKII [153b:1177]
81 80 -> Hauppauge WinTV-IR Only [0070:9290]
82 81 -> Leadtek WinFast DTV1800 Hybrid [107d:6654]
83 82 -> WinFast DTV2000 H rev. J [107d:6f2b]
diff --git a/Documentation/video4linux/CARDLIST.em28xx b/Documentation/video4linux/CARDLIST.em28xx
index 78d0a6eed571..b13fcbd5d94b 100644
--- a/Documentation/video4linux/CARDLIST.em28xx
+++ b/Documentation/video4linux/CARDLIST.em28xx
@@ -1,5 +1,5 @@
1 0 -> Unknown EM2800 video grabber (em2800) [eb1a:2800] 1 0 -> Unknown EM2800 video grabber (em2800) [eb1a:2800]
2 1 -> Unknown EM2750/28xx video grabber (em2820/em2840) [eb1a:2820,eb1a:2821,eb1a:2860,eb1a:2861,eb1a:2870,eb1a:2881,eb1a:2883] 2 1 -> Unknown EM2750/28xx video grabber (em2820/em2840) [eb1a:2710,eb1a:2820,eb1a:2821,eb1a:2860,eb1a:2861,eb1a:2870,eb1a:2881,eb1a:2883]
3 2 -> Terratec Cinergy 250 USB (em2820/em2840) [0ccd:0036] 3 2 -> Terratec Cinergy 250 USB (em2820/em2840) [0ccd:0036]
4 3 -> Pinnacle PCTV USB 2 (em2820/em2840) [2304:0208] 4 3 -> Pinnacle PCTV USB 2 (em2820/em2840) [2304:0208]
5 4 -> Hauppauge WinTV USB 2 (em2820/em2840) [2040:4200,2040:4201] 5 4 -> Hauppauge WinTV USB 2 (em2820/em2840) [2040:4200,2040:4201]
@@ -7,7 +7,7 @@
7 6 -> Terratec Cinergy 200 USB (em2800) 7 6 -> Terratec Cinergy 200 USB (em2800)
8 7 -> Leadtek Winfast USB II (em2800) [0413:6023] 8 7 -> Leadtek Winfast USB II (em2800) [0413:6023]
9 8 -> Kworld USB2800 (em2800) 9 8 -> Kworld USB2800 (em2800)
10 9 -> Pinnacle Dazzle DVC 90/100/101/107 / Kaiser Baas Video to DVD maker (em2820/em2840) [1b80:e302,2304:0207,2304:021a] 10 9 -> Pinnacle Dazzle DVC 90/100/101/107 / Kaiser Baas Video to DVD maker (em2820/em2840) [1b80:e302,1b80:e304,2304:0207,2304:021a]
11 10 -> Hauppauge WinTV HVR 900 (em2880) [2040:6500] 11 10 -> Hauppauge WinTV HVR 900 (em2880) [2040:6500]
12 11 -> Terratec Hybrid XS (em2880) [0ccd:0042] 12 11 -> Terratec Hybrid XS (em2880) [0ccd:0042]
13 12 -> Kworld PVR TV 2800 RF (em2820/em2840) 13 12 -> Kworld PVR TV 2800 RF (em2820/em2840)
@@ -17,10 +17,10 @@
17 16 -> Hauppauge WinTV HVR 950 (em2883) [2040:6513,2040:6517,2040:651b] 17 16 -> Hauppauge WinTV HVR 950 (em2883) [2040:6513,2040:6517,2040:651b]
18 17 -> Pinnacle PCTV HD Pro Stick (em2880) [2304:0227] 18 17 -> Pinnacle PCTV HD Pro Stick (em2880) [2304:0227]
19 18 -> Hauppauge WinTV HVR 900 (R2) (em2880) [2040:6502] 19 18 -> Hauppauge WinTV HVR 900 (R2) (em2880) [2040:6502]
20 19 -> PointNix Intra-Oral Camera (em2860) 20 19 -> EM2860/SAA711X Reference Design (em2860)
21 20 -> AMD ATI TV Wonder HD 600 (em2880) [0438:b002] 21 20 -> AMD ATI TV Wonder HD 600 (em2880) [0438:b002]
22 21 -> eMPIA Technology, Inc. GrabBeeX+ Video Encoder (em2800) [eb1a:2801] 22 21 -> eMPIA Technology, Inc. GrabBeeX+ Video Encoder (em2800) [eb1a:2801]
23 22 -> Unknown EM2750/EM2751 webcam grabber (em2750) [eb1a:2750,eb1a:2751] 23 22 -> EM2710/EM2750/EM2751 webcam grabber (em2750) [eb1a:2750,eb1a:2751]
24 23 -> Huaqi DLCW-130 (em2750) 24 23 -> Huaqi DLCW-130 (em2750)
25 24 -> D-Link DUB-T210 TV Tuner (em2820/em2840) [2001:f112] 25 24 -> D-Link DUB-T210 TV Tuner (em2820/em2840) [2001:f112]
26 25 -> Gadmei UTV310 (em2820/em2840) 26 25 -> Gadmei UTV310 (em2820/em2840)
@@ -33,7 +33,7 @@
33 34 -> Terratec Cinergy A Hybrid XS (em2860) [0ccd:004f] 33 34 -> Terratec Cinergy A Hybrid XS (em2860) [0ccd:004f]
34 35 -> Typhoon DVD Maker (em2860) 34 35 -> Typhoon DVD Maker (em2860)
35 36 -> NetGMBH Cam (em2860) 35 36 -> NetGMBH Cam (em2860)
36 37 -> Gadmei UTV330 (em2860) 36 37 -> Gadmei UTV330 (em2860) [eb1a:50a6]
37 38 -> Yakumo MovieMixer (em2861) 37 38 -> Yakumo MovieMixer (em2861)
38 39 -> KWorld PVRTV 300U (em2861) [eb1a:e300] 38 39 -> KWorld PVRTV 300U (em2861) [eb1a:e300]
39 40 -> Plextor ConvertX PX-TV100U (em2861) [093b:a005] 39 40 -> Plextor ConvertX PX-TV100U (em2861) [093b:a005]
@@ -61,3 +61,10 @@
61 63 -> Kaiomy TVnPC U2 (em2860) [eb1a:e303] 61 63 -> Kaiomy TVnPC U2 (em2860) [eb1a:e303]
62 64 -> Easy Cap Capture DC-60 (em2860) 62 64 -> Easy Cap Capture DC-60 (em2860)
63 65 -> IO-DATA GV-MVP/SZ (em2820/em2840) [04bb:0515] 63 65 -> IO-DATA GV-MVP/SZ (em2820/em2840) [04bb:0515]
64 66 -> Empire dual TV (em2880)
65 67 -> Terratec Grabby (em2860) [0ccd:0096]
66 68 -> Terratec AV350 (em2860) [0ccd:0084]
67 69 -> KWorld ATSC 315U HDTV TV Box (em2882) [eb1a:a313]
68 70 -> Evga inDtube (em2882)
69 71 -> Silvercrest Webcam 1.3mpix (em2820/em2840)
70 72 -> Gadmei UTV330+ (em2861)
diff --git a/Documentation/video4linux/CARDLIST.saa7134 b/Documentation/video4linux/CARDLIST.saa7134
index 6dacf2825259..0ac4d2544778 100644
--- a/Documentation/video4linux/CARDLIST.saa7134
+++ b/Documentation/video4linux/CARDLIST.saa7134
@@ -124,10 +124,10 @@
124123 -> Beholder BeholdTV 407 [0000:4070] 124123 -> Beholder BeholdTV 407 [0000:4070]
125124 -> Beholder BeholdTV 407 FM [0000:4071] 125124 -> Beholder BeholdTV 407 FM [0000:4071]
126125 -> Beholder BeholdTV 409 [0000:4090] 126125 -> Beholder BeholdTV 409 [0000:4090]
127126 -> Beholder BeholdTV 505 FM/RDS [0000:5051,0000:505B,5ace:5050] 127126 -> Beholder BeholdTV 505 FM [5ace:5050]
128127 -> Beholder BeholdTV 507 FM/RDS / BeholdTV 509 FM [0000:5071,0000:507B,5ace:5070,5ace:5090] 128127 -> Beholder BeholdTV 507 FM / BeholdTV 509 FM [5ace:5070,5ace:5090]
129128 -> Beholder BeholdTV Columbus TVFM [0000:5201] 129128 -> Beholder BeholdTV Columbus TVFM [0000:5201]
130129 -> Beholder BeholdTV 607 / BeholdTV 609 [5ace:6070,5ace:6071,5ace:6072,5ace:6073,5ace:6090,5ace:6091,5ace:6092,5ace:6093] 130129 -> Beholder BeholdTV 607 FM [5ace:6070]
131130 -> Beholder BeholdTV M6 [5ace:6190] 131130 -> Beholder BeholdTV M6 [5ace:6190]
132131 -> Twinhan Hybrid DTV-DVB 3056 PCI [1822:0022] 132131 -> Twinhan Hybrid DTV-DVB 3056 PCI [1822:0022]
133132 -> Genius TVGO AM11MCE 133132 -> Genius TVGO AM11MCE
@@ -143,7 +143,7 @@
143142 -> Beholder BeholdTV H6 [5ace:6290] 143142 -> Beholder BeholdTV H6 [5ace:6290]
144143 -> Beholder BeholdTV M63 [5ace:6191] 144143 -> Beholder BeholdTV M63 [5ace:6191]
145144 -> Beholder BeholdTV M6 Extra [5ace:6193] 145144 -> Beholder BeholdTV M6 Extra [5ace:6193]
146145 -> AVerMedia MiniPCI DVB-T Hybrid M103 [1461:f636] 146145 -> AVerMedia MiniPCI DVB-T Hybrid M103 [1461:f636,1461:f736]
147146 -> ASUSTeK P7131 Analog 147146 -> ASUSTeK P7131 Analog
148147 -> Asus Tiger 3in1 [1043:4878] 148147 -> Asus Tiger 3in1 [1043:4878]
149148 -> Encore ENLTV-FM v5.3 [1a7f:2008] 149148 -> Encore ENLTV-FM v5.3 [1a7f:2008]
@@ -153,5 +153,21 @@
153152 -> Asus Tiger Rev:1.00 [1043:4857] 153152 -> Asus Tiger Rev:1.00 [1043:4857]
154153 -> Kworld Plus TV Analog Lite PCI [17de:7128] 154153 -> Kworld Plus TV Analog Lite PCI [17de:7128]
155154 -> Avermedia AVerTV GO 007 FM Plus [1461:f31d] 155154 -> Avermedia AVerTV GO 007 FM Plus [1461:f31d]
156155 -> Hauppauge WinTV-HVR1120 ATSC/QAM-Hybrid [0070:6706,0070:6708] 156155 -> Hauppauge WinTV-HVR1150 ATSC/QAM-Hybrid [0070:6706,0070:6708]
157156 -> Hauppauge WinTV-HVR1110r3 [0070:6707,0070:6709,0070:670a] 157156 -> Hauppauge WinTV-HVR1120 DVB-T/Hybrid [0070:6707,0070:6709,0070:670a]
158157 -> Avermedia AVerTV Studio 507UA [1461:a11b]
159158 -> AVerMedia Cardbus TV/Radio (E501R) [1461:b7e9]
160159 -> Beholder BeholdTV 505 RDS [0000:505B]
161160 -> Beholder BeholdTV 507 RDS [0000:5071]
162161 -> Beholder BeholdTV 507 RDS [0000:507B]
163162 -> Beholder BeholdTV 607 FM [5ace:6071]
164163 -> Beholder BeholdTV 609 FM [5ace:6090]
165164 -> Beholder BeholdTV 609 FM [5ace:6091]
166165 -> Beholder BeholdTV 607 RDS [5ace:6072]
167166 -> Beholder BeholdTV 607 RDS [5ace:6073]
168167 -> Beholder BeholdTV 609 RDS [5ace:6092]
169168 -> Beholder BeholdTV 609 RDS [5ace:6093]
170169 -> Compro VideoMate S350/S300 [185b:c900]
171170 -> AverMedia AverTV Studio 505 [1461:a115]
172171 -> Beholder BeholdTV X7 [5ace:7595]
173172 -> RoverMedia TV Link Pro FM [19d1:0138]
diff --git a/Documentation/video4linux/CARDLIST.tuner b/Documentation/video4linux/CARDLIST.tuner
index 691d2f37dc57..ba9fa679e2d3 100644
--- a/Documentation/video4linux/CARDLIST.tuner
+++ b/Documentation/video4linux/CARDLIST.tuner
@@ -76,3 +76,6 @@ tuner=75 - Philips TEA5761 FM Radio
76tuner=76 - Xceive 5000 tuner 76tuner=76 - Xceive 5000 tuner
77tuner=77 - TCL tuner MF02GIP-5N-E 77tuner=77 - TCL tuner MF02GIP-5N-E
78tuner=78 - Philips FMD1216MEX MK3 Hybrid Tuner 78tuner=78 - Philips FMD1216MEX MK3 Hybrid Tuner
79tuner=79 - Philips PAL/SECAM multi (FM1216 MK5)
80tuner=80 - Philips FQ1216LME MK3 PAL/SECAM w/active loopthrough
81tuner=81 - Partsnic (Daewoo) PTI-5NF05
diff --git a/Documentation/video4linux/CQcam.txt b/Documentation/video4linux/CQcam.txt
index 04986efb731c..d230878e473e 100644
--- a/Documentation/video4linux/CQcam.txt
+++ b/Documentation/video4linux/CQcam.txt
@@ -18,8 +18,8 @@ Table of Contents
18 18
191.0 Introduction 191.0 Introduction
20 20
21 The file ../drivers/char/c-qcam.c is a device driver for the 21 The file ../../drivers/media/video/c-qcam.c is a device driver for
22Logitech (nee Connectix) parallel port interface color CCD camera. 22the Logitech (nee Connectix) parallel port interface color CCD camera.
23This is a fairly inexpensive device for capturing images. Logitech 23This is a fairly inexpensive device for capturing images. Logitech
24does not currently provide information for developers, but many people 24does not currently provide information for developers, but many people
25have engineered several solutions for non-Microsoft use of the Color 25have engineered several solutions for non-Microsoft use of the Color
diff --git a/Documentation/video4linux/gspca.txt b/Documentation/video4linux/gspca.txt
index 98529e03a46e..4686e84dd800 100644
--- a/Documentation/video4linux/gspca.txt
+++ b/Documentation/video4linux/gspca.txt
@@ -44,7 +44,9 @@ zc3xx 0458:7007 Genius VideoCam V2
44zc3xx 0458:700c Genius VideoCam V3 44zc3xx 0458:700c Genius VideoCam V3
45zc3xx 0458:700f Genius VideoCam Web V2 45zc3xx 0458:700f Genius VideoCam Web V2
46sonixj 0458:7025 Genius Eye 311Q 46sonixj 0458:7025 Genius Eye 311Q
47sn9c20x 0458:7029 Genius Look 320s
47sonixj 0458:702e Genius Slim 310 NB 48sonixj 0458:702e Genius Slim 310 NB
49sn9c20x 045e:00f4 LifeCam VX-6000 (SN9C20x + OV9650)
48sonixj 045e:00f5 MicroSoft VX3000 50sonixj 045e:00f5 MicroSoft VX3000
49sonixj 045e:00f7 MicroSoft VX1000 51sonixj 045e:00f7 MicroSoft VX1000
50ov519 045e:028c Micro$oft xbox cam 52ov519 045e:028c Micro$oft xbox cam
@@ -138,6 +140,7 @@ spca500 04fc:7333 PalmPixDC85
138sunplus 04fc:ffff Pure DigitalDakota 140sunplus 04fc:ffff Pure DigitalDakota
139spca501 0506:00df 3Com HomeConnect Lite 141spca501 0506:00df 3Com HomeConnect Lite
140sunplus 052b:1513 Megapix V4 142sunplus 052b:1513 Megapix V4
143sunplus 052b:1803 MegaImage VI
141tv8532 0545:808b Veo Stingray 144tv8532 0545:808b Veo Stingray
142tv8532 0545:8333 Veo Stingray 145tv8532 0545:8333 Veo Stingray
143sunplus 0546:3155 Polaroid PDC3070 146sunplus 0546:3155 Polaroid PDC3070
@@ -163,10 +166,11 @@ sunplus 055f:c650 Mustek MDC5500Z
163zc3xx 055f:d003 Mustek WCam300A 166zc3xx 055f:d003 Mustek WCam300A
164zc3xx 055f:d004 Mustek WCam300 AN 167zc3xx 055f:d004 Mustek WCam300 AN
165conex 0572:0041 Creative Notebook cx11646 168conex 0572:0041 Creative Notebook cx11646
166ov519 05a9:0519 OmniVision 169ov519 05a9:0519 OV519 Microphone
167ov519 05a9:0530 OmniVision 170ov519 05a9:0530 OmniVision
168ov519 05a9:4519 OmniVision 171ov519 05a9:4519 Webcam Classic
169ov519 05a9:8519 OmniVision 172ov519 05a9:8519 OmniVision
173ov519 05a9:a518 D-Link DSB-C310 Webcam
170sunplus 05da:1018 Digital Dream Enigma 1.3 174sunplus 05da:1018 Digital Dream Enigma 1.3
171stk014 05e1:0893 Syntek DV4000 175stk014 05e1:0893 Syntek DV4000
172spca561 060b:a001 Maxell Compact Pc PM3 176spca561 060b:a001 Maxell Compact Pc PM3
@@ -178,6 +182,8 @@ spca506 06e1:a190 ADS Instant VCD
178ov534 06f8:3002 Hercules Blog Webcam 182ov534 06f8:3002 Hercules Blog Webcam
179ov534 06f8:3003 Hercules Dualpix HD Weblog 183ov534 06f8:3003 Hercules Dualpix HD Weblog
180sonixj 06f8:3004 Hercules Classic Silver 184sonixj 06f8:3004 Hercules Classic Silver
185sonixj 06f8:3008 Hercules Deluxe Optical Glass
186pac7311 06f8:3009 Hercules Classic Link
181spca508 0733:0110 ViewQuest VQ110 187spca508 0733:0110 ViewQuest VQ110
182spca508 0130:0130 Clone Digital Webcam 11043 188spca508 0130:0130 Clone Digital Webcam 11043
183spca501 0733:0401 Intel Create and Share 189spca501 0733:0401 Intel Create and Share
@@ -209,6 +215,7 @@ sunplus 08ca:2050 Medion MD 41437
209sunplus 08ca:2060 Aiptek PocketDV5300 215sunplus 08ca:2060 Aiptek PocketDV5300
210tv8532 0923:010f ICM532 cams 216tv8532 0923:010f ICM532 cams
211mars 093a:050f Mars-Semi Pc-Camera 217mars 093a:050f Mars-Semi Pc-Camera
218mr97310a 093a:010f Sakar Digital no. 77379
212pac207 093a:2460 Qtec Webcam 100 219pac207 093a:2460 Qtec Webcam 100
213pac207 093a:2461 HP Webcam 220pac207 093a:2461 HP Webcam
214pac207 093a:2463 Philips SPC 220 NC 221pac207 093a:2463 Philips SPC 220 NC
@@ -230,8 +237,10 @@ pac7311 093a:2621 PAC731x
230pac7311 093a:2622 Genius Eye 312 237pac7311 093a:2622 Genius Eye 312
231pac7311 093a:2624 PAC7302 238pac7311 093a:2624 PAC7302
232pac7311 093a:2626 Labtec 2200 239pac7311 093a:2626 Labtec 2200
240pac7311 093a:2629 Genious iSlim 300
233pac7311 093a:262a Webcam 300k 241pac7311 093a:262a Webcam 300k
234pac7311 093a:262c Philips SPC 230 NC 242pac7311 093a:262c Philips SPC 230 NC
243jeilinj 0979:0280 Sakar 57379
235zc3xx 0ac8:0302 Z-star Vimicro zc0302 244zc3xx 0ac8:0302 Z-star Vimicro zc0302
236vc032x 0ac8:0321 Vimicro generic vc0321 245vc032x 0ac8:0321 Vimicro generic vc0321
237vc032x 0ac8:0323 Vimicro Vc0323 246vc032x 0ac8:0323 Vimicro Vc0323
@@ -242,6 +251,7 @@ zc3xx 0ac8:305b Z-star Vimicro zc0305b
242zc3xx 0ac8:307b Ldlc VC302+Ov7620 251zc3xx 0ac8:307b Ldlc VC302+Ov7620
243vc032x 0ac8:c001 Sony embedded vimicro 252vc032x 0ac8:c001 Sony embedded vimicro
244vc032x 0ac8:c002 Sony embedded vimicro 253vc032x 0ac8:c002 Sony embedded vimicro
254vc032x 0ac8:c301 Samsung Q1 Ultra Premium
245spca508 0af9:0010 Hama USB Sightcam 100 255spca508 0af9:0010 Hama USB Sightcam 100
246spca508 0af9:0011 Hama USB Sightcam 100 256spca508 0af9:0011 Hama USB Sightcam 100
247sonixb 0c45:6001 Genius VideoCAM NB 257sonixb 0c45:6001 Genius VideoCAM NB
@@ -265,6 +275,11 @@ sonixj 0c45:60ec SN9C105+MO4000
265sonixj 0c45:60fb Surfer NoName 275sonixj 0c45:60fb Surfer NoName
266sonixj 0c45:60fc LG-LIC300 276sonixj 0c45:60fc LG-LIC300
267sonixj 0c45:60fe Microdia Audio 277sonixj 0c45:60fe Microdia Audio
278sonixj 0c45:6100 PC Camera (SN9C128)
279sonixj 0c45:610a PC Camera (SN9C128)
280sonixj 0c45:610b PC Camera (SN9C128)
281sonixj 0c45:610c PC Camera (SN9C128)
282sonixj 0c45:610e PC Camera (SN9C128)
268sonixj 0c45:6128 Microdia/Sonix SNP325 283sonixj 0c45:6128 Microdia/Sonix SNP325
269sonixj 0c45:612a Avant Camera 284sonixj 0c45:612a Avant Camera
270sonixj 0c45:612c Typhoon Rasy Cam 1.3MPix 285sonixj 0c45:612c Typhoon Rasy Cam 1.3MPix
@@ -274,6 +289,29 @@ sonixj 0c45:613a Microdia Sonix PC Camera
274sonixj 0c45:613b Surfer SN-206 289sonixj 0c45:613b Surfer SN-206
275sonixj 0c45:613c Sonix Pccam168 290sonixj 0c45:613c Sonix Pccam168
276sonixj 0c45:6143 Sonix Pccam168 291sonixj 0c45:6143 Sonix Pccam168
292sonixj 0c45:6148 Digitus DA-70811/ZSMC USB PC Camera ZS211/Microdia
293sn9c20x 0c45:6240 PC Camera (SN9C201 + MT9M001)
294sn9c20x 0c45:6242 PC Camera (SN9C201 + MT9M111)
295sn9c20x 0c45:6248 PC Camera (SN9C201 + OV9655)
296sn9c20x 0c45:624e PC Camera (SN9C201 + SOI968)
297sn9c20x 0c45:624f PC Camera (SN9C201 + OV9650)
298sn9c20x 0c45:6251 PC Camera (SN9C201 + OV9650)
299sn9c20x 0c45:6253 PC Camera (SN9C201 + OV9650)
300sn9c20x 0c45:6260 PC Camera (SN9C201 + OV7670)
301sn9c20x 0c45:6270 PC Camera (SN9C201 + MT9V011/MT9V111/MT9V112)
302sn9c20x 0c45:627b PC Camera (SN9C201 + OV7660)
303sn9c20x 0c45:627c PC Camera (SN9C201 + HV7131R)
304sn9c20x 0c45:627f PC Camera (SN9C201 + OV9650)
305sn9c20x 0c45:6280 PC Camera (SN9C202 + MT9M001)
306sn9c20x 0c45:6282 PC Camera (SN9C202 + MT9M111)
307sn9c20x 0c45:6288 PC Camera (SN9C202 + OV9655)
308sn9c20x 0c45:628e PC Camera (SN9C202 + SOI968)
309sn9c20x 0c45:628f PC Camera (SN9C202 + OV9650)
310sn9c20x 0c45:62a0 PC Camera (SN9C202 + OV7670)
311sn9c20x 0c45:62b0 PC Camera (SN9C202 + MT9V011/MT9V111/MT9V112)
312sn9c20x 0c45:62b3 PC Camera (SN9C202 + OV9655)
313sn9c20x 0c45:62bb PC Camera (SN9C202 + OV7660)
314sn9c20x 0c45:62bc PC Camera (SN9C202 + HV7131R)
277sunplus 0d64:0303 Sunplus FashionCam DXG 315sunplus 0d64:0303 Sunplus FashionCam DXG
278etoms 102c:6151 Qcam Sangha CIF 316etoms 102c:6151 Qcam Sangha CIF
279etoms 102c:6251 Qcam xxxxxx VGA 317etoms 102c:6251 Qcam xxxxxx VGA
@@ -282,6 +320,7 @@ spca561 10fd:7e50 FlyCam Usb 100
282zc3xx 10fd:8050 Typhoon Webshot II USB 300k 320zc3xx 10fd:8050 Typhoon Webshot II USB 300k
283ov534 1415:2000 Sony HD Eye for PS3 (SLEH 00201) 321ov534 1415:2000 Sony HD Eye for PS3 (SLEH 00201)
284pac207 145f:013a Trust WB-1300N 322pac207 145f:013a Trust WB-1300N
323sn9c20x 145f:013d Trust WB-3600R
285vc032x 15b8:6001 HP 2.0 Megapixel 324vc032x 15b8:6001 HP 2.0 Megapixel
286vc032x 15b8:6002 HP 2.0 Megapixel rz406aa 325vc032x 15b8:6002 HP 2.0 Megapixel rz406aa
287spca501 1776:501c Arowana 300K CMOS Camera 326spca501 1776:501c Arowana 300K CMOS Camera
@@ -292,4 +331,11 @@ spca500 2899:012c Toptro Industrial
292spca508 8086:0110 Intel Easy PC Camera 331spca508 8086:0110 Intel Easy PC Camera
293spca500 8086:0630 Intel Pocket PC Camera 332spca500 8086:0630 Intel Pocket PC Camera
294spca506 99fa:8988 Grandtec V.cap 333spca506 99fa:8988 Grandtec V.cap
334sn9c20x a168:0610 Dino-Lite Digital Microscope (SN9C201 + HV7131R)
335sn9c20x a168:0611 Dino-Lite Digital Microscope (SN9C201 + HV7131R)
336sn9c20x a168:0613 Dino-Lite Digital Microscope (SN9C201 + HV7131R)
337sn9c20x a168:0618 Dino-Lite Digital Microscope (SN9C201 + HV7131R)
338sn9c20x a168:0614 Dino-Lite Digital Microscope (SN9C201 + MT9M111)
339sn9c20x a168:0615 Dino-Lite Digital Microscope (SN9C201 + MT9M111)
340sn9c20x a168:0617 Dino-Lite Digital Microscope (SN9C201 + MT9M111)
295spca561 abcd:cdee Petcam 341spca561 abcd:cdee Petcam
diff --git a/Documentation/video4linux/pxa_camera.txt b/Documentation/video4linux/pxa_camera.txt
index b1137f9a53eb..4f6d0ca01956 100644
--- a/Documentation/video4linux/pxa_camera.txt
+++ b/Documentation/video4linux/pxa_camera.txt
@@ -26,6 +26,55 @@ Global video workflow
26 26
27 Once the last buffer is filled in, the QCI interface stops. 27 Once the last buffer is filled in, the QCI interface stops.
28 28
29 c) Capture global finite state machine schema
30
31 +----+ +---+ +----+
32 | DQ | | Q | | DQ |
33 | v | v | v
34 +-----------+ +------------------------+
35 | STOP | | Wait for capture start |
36 +-----------+ Q +------------------------+
37+-> | QCI: stop | ------------------> | QCI: run | <------------+
38| | DMA: stop | | DMA: stop | |
39| +-----------+ +-----> +------------------------+ |
40| / | |
41| / +---+ +----+ | |
42|capture list empty / | Q | | DQ | | QCI Irq EOF |
43| / | v | v v |
44| +--------------------+ +----------------------+ |
45| | DMA hotlink missed | | Capture running | |
46| +--------------------+ +----------------------+ |
47| | QCI: run | +-----> | QCI: run | <-+ |
48| | DMA: stop | / | DMA: run | | |
49| +--------------------+ / +----------------------+ | Other |
50| ^ /DMA still | | channels |
51| | capture list / running | DMA Irq End | not |
52| | not empty / | | finished |
53| | / v | yet |
54| +----------------------+ +----------------------+ | |
55| | Videobuf released | | Channel completed | | |
56| +----------------------+ +----------------------+ | |
57+-- | QCI: run | | QCI: run | --+ |
58 | DMA: run | | DMA: run | |
59 +----------------------+ +----------------------+ |
60 ^ / | |
61 | no overrun / | overrun |
62 | / v |
63 +--------------------+ / +----------------------+ |
64 | Frame completed | / | Frame overran | |
65 +--------------------+ <-----+ +----------------------+ restart frame |
66 | QCI: run | | QCI: stop | --------------+
67 | DMA: run | | DMA: stop |
68 +--------------------+ +----------------------+
69
70 Legend: - each box is a FSM state
71 - each arrow is the condition to transition to another state
72 - an arrow with a comment is a mandatory transition (no condition)
73 - arrow "Q" means : a buffer was enqueued
74 - arrow "DQ" means : a buffer was dequeued
75 - "QCI: stop" means the QCI interface is not enabled
76 - "DMA: stop" means all 3 DMA channels are stopped
77 - "DMA: run" means at least 1 DMA channel is still running
29 78
30DMA usage 79DMA usage
31--------- 80---------
diff --git a/Documentation/video4linux/si4713.txt b/Documentation/video4linux/si4713.txt
new file mode 100644
index 000000000000..25abdb78209d
--- /dev/null
+++ b/Documentation/video4linux/si4713.txt
@@ -0,0 +1,176 @@
1Driver for I2C radios for the Silicon Labs Si4713 FM Radio Transmitters
2
3Copyright (c) 2009 Nokia Corporation
4Contact: Eduardo Valentin <eduardo.valentin@nokia.com>
5
6
7Information about the Device
8============================
9This chip is a Silicon Labs product. It is a I2C device, currently on 0x63 address.
10Basically, it has transmission and signal noise level measurement features.
11
12The Si4713 integrates transmit functions for FM broadcast stereo transmission.
13The chip also allows integrated receive power scanning to identify low signal
14power FM channels.
15
16The chip is programmed using commands and responses. There are also several
17properties which can change the behavior of this chip.
18
19Users must comply with local regulations on radio frequency (RF) transmission.
20
21Device driver description
22=========================
23There are two modules to handle this device. One is a I2C device driver
24and the other is a platform driver.
25
26The I2C device driver exports a v4l2-subdev interface to the kernel.
27All properties can also be accessed by v4l2 extended controls interface, by
28using the v4l2-subdev calls (g_ext_ctrls, s_ext_ctrls).
29
30The platform device driver exports a v4l2 radio device interface to user land.
31So, it uses the I2C device driver as a sub device in order to send the user
32commands to the actual device. Basically it is a wrapper to the I2C device driver.
33
34Applications can use v4l2 radio API to specify frequency of operation, mute state,
35etc. But mostly of its properties will be present in the extended controls.
36
37When the v4l2 mute property is set to 1 (true), the driver will turn the chip off.
38
39Properties description
40======================
41
42The properties can be accessed using v4l2 extended controls.
43Here is an output from v4l2-ctl util:
44/ # v4l2-ctl -d /dev/radio0 --all -L
45Driver Info:
46 Driver name : radio-si4713
47 Card type : Silicon Labs Si4713 Modulator
48 Bus info :
49 Driver version: 0
50 Capabilities : 0x00080800
51 RDS Output
52 Modulator
53Audio output: 0 (FM Modulator Audio Out)
54Frequency: 1408000 (88.000000 MHz)
55Video Standard = 0x00000000
56Modulator:
57 Name : FM Modulator
58 Capabilities : 62.5 Hz stereo rds
59 Frequency range : 76.0 MHz - 108.0 MHz
60 Subchannel modulation: stereo+rds
61
62User Controls
63
64 mute (bool) : default=1 value=0
65
66FM Radio Modulator Controls
67
68 rds_signal_deviation (int) : min=0 max=90000 step=10 default=200 value=200 flags=slider
69 rds_program_id (int) : min=0 max=65535 step=1 default=0 value=0
70 rds_program_type (int) : min=0 max=31 step=1 default=0 value=0
71 rds_ps_name (str) : min=0 max=96 step=8 value='si4713 '
72 rds_radio_text (str) : min=0 max=384 step=32 value=''
73 audio_limiter_feature_enabled (bool) : default=1 value=1
74 audio_limiter_release_time (int) : min=250 max=102390 step=50 default=5010 value=5010 flags=slider
75 audio_limiter_deviation (int) : min=0 max=90000 step=10 default=66250 value=66250 flags=slider
76audio_compression_feature_enabl (bool) : default=1 value=1
77 audio_compression_gain (int) : min=0 max=20 step=1 default=15 value=15 flags=slider
78 audio_compression_threshold (int) : min=-40 max=0 step=1 default=-40 value=-40 flags=slider
79 audio_compression_attack_time (int) : min=0 max=5000 step=500 default=0 value=0 flags=slider
80 audio_compression_release_time (int) : min=100000 max=1000000 step=100000 default=1000000 value=1000000 flags=slider
81 pilot_tone_feature_enabled (bool) : default=1 value=1
82 pilot_tone_deviation (int) : min=0 max=90000 step=10 default=6750 value=6750 flags=slider
83 pilot_tone_frequency (int) : min=0 max=19000 step=1 default=19000 value=19000 flags=slider
84 pre_emphasis_settings (menu) : min=0 max=2 default=1 value=1
85 tune_power_level (int) : min=0 max=120 step=1 default=88 value=88 flags=slider
86 tune_antenna_capacitor (int) : min=0 max=191 step=1 default=0 value=110 flags=slider
87/ #
88
89Here is a summary of them:
90
91* Pilot is an audible tone sent by the device.
92
93pilot_frequency - Configures the frequency of the stereo pilot tone.
94pilot_deviation - Configures pilot tone frequency deviation level.
95pilot_enabled - Enables or disables the pilot tone feature.
96
97* The si4713 device is capable of applying audio compression to the transmitted signal.
98
99acomp_enabled - Enables or disables the audio dynamic range control feature.
100acomp_gain - Sets the gain for audio dynamic range control.
101acomp_threshold - Sets the threshold level for audio dynamic range control.
102acomp_attack_time - Sets the attack time for audio dynamic range control.
103acomp_release_time - Sets the release time for audio dynamic range control.
104
105* Limiter setups audio deviation limiter feature. Once a over deviation occurs,
106it is possible to adjust the front-end gain of the audio input and always
107prevent over deviation.
108
109limiter_enabled - Enables or disables the limiter feature.
110limiter_deviation - Configures audio frequency deviation level.
111limiter_release_time - Sets the limiter release time.
112
113* Tuning power
114
115power_level - Sets the output power level for signal transmission.
116antenna_capacitor - This selects the value of antenna tuning capacitor manually
117or automatically if set to zero.
118
119* RDS related
120
121rds_ps_name - Sets the RDS ps name field for transmission.
122rds_radio_text - Sets the RDS radio text for transmission.
123rds_pi - Sets the RDS PI field for transmission.
124rds_pty - Sets the RDS PTY field for transmission.
125
126* Region related
127
128preemphasis - sets the preemphasis to be applied for transmission.
129
130RNL
131===
132
133This device also has an interface to measure received noise level. To do that, you should
134ioctl the device node. Here is an code of example:
135
136int main (int argc, char *argv[])
137{
138 struct si4713_rnl rnl;
139 int fd = open("/dev/radio0", O_RDWR);
140 int rval;
141
142 if (argc < 2)
143 return -EINVAL;
144
145 if (fd < 0)
146 return fd;
147
148 sscanf(argv[1], "%d", &rnl.frequency);
149
150 rval = ioctl(fd, SI4713_IOC_MEASURE_RNL, &rnl);
151 if (rval < 0)
152 return rval;
153
154 printf("received noise level: %d\n", rnl.rnl);
155
156 close(fd);
157}
158
159The struct si4713_rnl and SI4713_IOC_MEASURE_RNL are defined under
160include/media/si4713.h.
161
162Stereo/Mono and RDS subchannels
163===============================
164
165The device can also be configured using the available sub channels for
166transmission. To do that use S/G_MODULATOR ioctl and configure txsubchans properly.
167Refer to v4l2-spec for proper use of this ioctl.
168
169Testing
170=======
171Testing is usually done with v4l2-ctl utility for managing FM tuner cards.
172The tool can be found in v4l-dvb repository under v4l2-apps/util directory.
173
174Example for setting rds ps name:
175# v4l2-ctl -d /dev/radio0 --set-ctrl=rds_ps_name="Dummy"
176
diff --git a/Documentation/video4linux/v4l2-framework.txt b/Documentation/video4linux/v4l2-framework.txt
index 854808b67fae..ba4706afc5fb 100644
--- a/Documentation/video4linux/v4l2-framework.txt
+++ b/Documentation/video4linux/v4l2-framework.txt
@@ -89,6 +89,11 @@ from dev (driver name followed by the bus_id, to be precise). If you set it
89up before calling v4l2_device_register then it will be untouched. If dev is 89up before calling v4l2_device_register then it will be untouched. If dev is
90NULL, then you *must* setup v4l2_dev->name before calling v4l2_device_register. 90NULL, then you *must* setup v4l2_dev->name before calling v4l2_device_register.
91 91
92You can use v4l2_device_set_name() to set the name based on a driver name and
93a driver-global atomic_t instance. This will generate names like ivtv0, ivtv1,
94etc. If the name ends with a digit, then it will insert a dash: cx18-0,
95cx18-1, etc. This function returns the instance number.
96
92The first 'dev' argument is normally the struct device pointer of a pci_dev, 97The first 'dev' argument is normally the struct device pointer of a pci_dev,
93usb_interface or platform_device. It is rare for dev to be NULL, but it happens 98usb_interface or platform_device. It is rare for dev to be NULL, but it happens
94with ISA devices or when one device creates multiple PCI devices, thus making 99with ISA devices or when one device creates multiple PCI devices, thus making
@@ -385,6 +390,30 @@ later date. It differs between i2c drivers and as such can be confusing.
385To see which chip variants are supported you can look in the i2c driver code 390To see which chip variants are supported you can look in the i2c driver code
386for the i2c_device_id table. This lists all the possibilities. 391for the i2c_device_id table. This lists all the possibilities.
387 392
393There are two more helper functions:
394
395v4l2_i2c_new_subdev_cfg: this function adds new irq and platform_data
396arguments and has both 'addr' and 'probed_addrs' arguments: if addr is not
3970 then that will be used (non-probing variant), otherwise the probed_addrs
398are probed.
399
400For example: this will probe for address 0x10:
401
402struct v4l2_subdev *sd = v4l2_i2c_new_subdev_cfg(v4l2_dev, adapter,
403 "module_foo", "chipid", 0, NULL, 0, I2C_ADDRS(0x10));
404
405v4l2_i2c_new_subdev_board uses an i2c_board_info struct which is passed
406to the i2c driver and replaces the irq, platform_data and addr arguments.
407
408If the subdev supports the s_config core ops, then that op is called with
409the irq and platform_data arguments after the subdev was setup. The older
410v4l2_i2c_new_(probed_)subdev functions will call s_config as well, but with
411irq set to 0 and platform_data set to NULL.
412
413Note that in the next kernel release the functions v4l2_i2c_new_subdev,
414v4l2_i2c_new_probed_subdev and v4l2_i2c_new_probed_subdev_addr will all be
415replaced by a single v4l2_i2c_new_subdev that is identical to
416v4l2_i2c_new_subdev_cfg but without the irq and platform_data arguments.
388 417
389struct video_device 418struct video_device
390------------------- 419-------------------
diff --git a/Documentation/vm/Makefile b/Documentation/vm/Makefile
index 6f562f778b28..5bd269b3731a 100644
--- a/Documentation/vm/Makefile
+++ b/Documentation/vm/Makefile
@@ -2,7 +2,7 @@
2obj- := dummy.o 2obj- := dummy.o
3 3
4# List of programs to build 4# List of programs to build
5hostprogs-y := slabinfo 5hostprogs-y := slabinfo page-types
6 6
7# Tell kbuild to always build the programs 7# Tell kbuild to always build the programs
8always := $(hostprogs-y) 8always := $(hostprogs-y)
diff --git a/Documentation/vm/balance b/Documentation/vm/balance
index bd3d31bc4915..c46e68cf9344 100644
--- a/Documentation/vm/balance
+++ b/Documentation/vm/balance
@@ -75,15 +75,15 @@ Page stealing from process memory and shm is done if stealing the page would
75alleviate memory pressure on any zone in the page's node that has fallen below 75alleviate memory pressure on any zone in the page's node that has fallen below
76its watermark. 76its watermark.
77 77
78pages_min/pages_low/pages_high/low_on_memory/zone_wake_kswapd: These are 78watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These
79per-zone fields, used to determine when a zone needs to be balanced. When 79are per-zone fields, used to determine when a zone needs to be balanced. When
80the number of pages falls below pages_min, the hysteric field low_on_memory 80the number of pages falls below watermark[WMARK_MIN], the hysteric field
81gets set. This stays set till the number of free pages becomes pages_high. 81low_on_memory gets set. This stays set till the number of free pages becomes
82When low_on_memory is set, page allocation requests will try to free some 82watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will
83pages in the zone (providing GFP_WAIT is set in the request). Orthogonal 83try to free some pages in the zone (providing GFP_WAIT is set in the request).
84to this, is the decision to poke kswapd to free some zone pages. That 84Orthogonal to this, is the decision to poke kswapd to free some zone pages.
85decision is not hysteresis based, and is done when the number of free 85That decision is not hysteresis based, and is done when the number of free
86pages is below pages_low; in which case zone_wake_kswapd is also set. 86pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set.
87 87
88 88
89(Good) Ideas that I have heard: 89(Good) Ideas that I have heard:
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c
new file mode 100644
index 000000000000..0833f44ba16b
--- /dev/null
+++ b/Documentation/vm/page-types.c
@@ -0,0 +1,698 @@
1/*
2 * page-types: Tool for querying page flags
3 *
4 * Copyright (C) 2009 Intel corporation
5 * Copyright (C) 2009 Wu Fengguang <fengguang.wu@intel.com>
6 */
7
8#include <stdio.h>
9#include <stdlib.h>
10#include <unistd.h>
11#include <stdint.h>
12#include <stdarg.h>
13#include <string.h>
14#include <getopt.h>
15#include <limits.h>
16#include <sys/types.h>
17#include <sys/errno.h>
18#include <sys/fcntl.h>
19
20
21/*
22 * kernel page flags
23 */
24
25#define KPF_BYTES 8
26#define PROC_KPAGEFLAGS "/proc/kpageflags"
27
28/* copied from kpageflags_read() */
29#define KPF_LOCKED 0
30#define KPF_ERROR 1
31#define KPF_REFERENCED 2
32#define KPF_UPTODATE 3
33#define KPF_DIRTY 4
34#define KPF_LRU 5
35#define KPF_ACTIVE 6
36#define KPF_SLAB 7
37#define KPF_WRITEBACK 8
38#define KPF_RECLAIM 9
39#define KPF_BUDDY 10
40
41/* [11-20] new additions in 2.6.31 */
42#define KPF_MMAP 11
43#define KPF_ANON 12
44#define KPF_SWAPCACHE 13
45#define KPF_SWAPBACKED 14
46#define KPF_COMPOUND_HEAD 15
47#define KPF_COMPOUND_TAIL 16
48#define KPF_HUGE 17
49#define KPF_UNEVICTABLE 18
50#define KPF_NOPAGE 20
51
52/* [32-] kernel hacking assistances */
53#define KPF_RESERVED 32
54#define KPF_MLOCKED 33
55#define KPF_MAPPEDTODISK 34
56#define KPF_PRIVATE 35
57#define KPF_PRIVATE_2 36
58#define KPF_OWNER_PRIVATE 37
59#define KPF_ARCH 38
60#define KPF_UNCACHED 39
61
62/* [48-] take some arbitrary free slots for expanding overloaded flags
63 * not part of kernel API
64 */
65#define KPF_READAHEAD 48
66#define KPF_SLOB_FREE 49
67#define KPF_SLUB_FROZEN 50
68#define KPF_SLUB_DEBUG 51
69
70#define KPF_ALL_BITS ((uint64_t)~0ULL)
71#define KPF_HACKERS_BITS (0xffffULL << 32)
72#define KPF_OVERLOADED_BITS (0xffffULL << 48)
73#define BIT(name) (1ULL << KPF_##name)
74#define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL))
75
76static char *page_flag_names[] = {
77 [KPF_LOCKED] = "L:locked",
78 [KPF_ERROR] = "E:error",
79 [KPF_REFERENCED] = "R:referenced",
80 [KPF_UPTODATE] = "U:uptodate",
81 [KPF_DIRTY] = "D:dirty",
82 [KPF_LRU] = "l:lru",
83 [KPF_ACTIVE] = "A:active",
84 [KPF_SLAB] = "S:slab",
85 [KPF_WRITEBACK] = "W:writeback",
86 [KPF_RECLAIM] = "I:reclaim",
87 [KPF_BUDDY] = "B:buddy",
88
89 [KPF_MMAP] = "M:mmap",
90 [KPF_ANON] = "a:anonymous",
91 [KPF_SWAPCACHE] = "s:swapcache",
92 [KPF_SWAPBACKED] = "b:swapbacked",
93 [KPF_COMPOUND_HEAD] = "H:compound_head",
94 [KPF_COMPOUND_TAIL] = "T:compound_tail",
95 [KPF_HUGE] = "G:huge",
96 [KPF_UNEVICTABLE] = "u:unevictable",
97 [KPF_NOPAGE] = "n:nopage",
98
99 [KPF_RESERVED] = "r:reserved",
100 [KPF_MLOCKED] = "m:mlocked",
101 [KPF_MAPPEDTODISK] = "d:mappedtodisk",
102 [KPF_PRIVATE] = "P:private",
103 [KPF_PRIVATE_2] = "p:private_2",
104 [KPF_OWNER_PRIVATE] = "O:owner_private",
105 [KPF_ARCH] = "h:arch",
106 [KPF_UNCACHED] = "c:uncached",
107
108 [KPF_READAHEAD] = "I:readahead",
109 [KPF_SLOB_FREE] = "P:slob_free",
110 [KPF_SLUB_FROZEN] = "A:slub_frozen",
111 [KPF_SLUB_DEBUG] = "E:slub_debug",
112};
113
114
115/*
116 * data structures
117 */
118
119static int opt_raw; /* for kernel developers */
120static int opt_list; /* list pages (in ranges) */
121static int opt_no_summary; /* don't show summary */
122static pid_t opt_pid; /* process to walk */
123
124#define MAX_ADDR_RANGES 1024
125static int nr_addr_ranges;
126static unsigned long opt_offset[MAX_ADDR_RANGES];
127static unsigned long opt_size[MAX_ADDR_RANGES];
128
129#define MAX_BIT_FILTERS 64
130static int nr_bit_filters;
131static uint64_t opt_mask[MAX_BIT_FILTERS];
132static uint64_t opt_bits[MAX_BIT_FILTERS];
133
134static int page_size;
135
136#define PAGES_BATCH (64 << 10) /* 64k pages */
137static int kpageflags_fd;
138static uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH];
139
140#define HASH_SHIFT 13
141#define HASH_SIZE (1 << HASH_SHIFT)
142#define HASH_MASK (HASH_SIZE - 1)
143#define HASH_KEY(flags) (flags & HASH_MASK)
144
145static unsigned long total_pages;
146static unsigned long nr_pages[HASH_SIZE];
147static uint64_t page_flags[HASH_SIZE];
148
149
150/*
151 * helper functions
152 */
153
154#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
155
156#define min_t(type, x, y) ({ \
157 type __min1 = (x); \
158 type __min2 = (y); \
159 __min1 < __min2 ? __min1 : __min2; })
160
161unsigned long pages2mb(unsigned long pages)
162{
163 return (pages * page_size) >> 20;
164}
165
166void fatal(const char *x, ...)
167{
168 va_list ap;
169
170 va_start(ap, x);
171 vfprintf(stderr, x, ap);
172 va_end(ap);
173 exit(EXIT_FAILURE);
174}
175
176
177/*
178 * page flag names
179 */
180
181char *page_flag_name(uint64_t flags)
182{
183 static char buf[65];
184 int present;
185 int i, j;
186
187 for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
188 present = (flags >> i) & 1;
189 if (!page_flag_names[i]) {
190 if (present)
191 fatal("unkown flag bit %d\n", i);
192 continue;
193 }
194 buf[j++] = present ? page_flag_names[i][0] : '_';
195 }
196
197 return buf;
198}
199
200char *page_flag_longname(uint64_t flags)
201{
202 static char buf[1024];
203 int i, n;
204
205 for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) {
206 if (!page_flag_names[i])
207 continue;
208 if ((flags >> i) & 1)
209 n += snprintf(buf + n, sizeof(buf) - n, "%s,",
210 page_flag_names[i] + 2);
211 }
212 if (n)
213 n--;
214 buf[n] = '\0';
215
216 return buf;
217}
218
219
220/*
221 * page list and summary
222 */
223
224void show_page_range(unsigned long offset, uint64_t flags)
225{
226 static uint64_t flags0;
227 static unsigned long index;
228 static unsigned long count;
229
230 if (flags == flags0 && offset == index + count) {
231 count++;
232 return;
233 }
234
235 if (count)
236 printf("%lu\t%lu\t%s\n",
237 index, count, page_flag_name(flags0));
238
239 flags0 = flags;
240 index = offset;
241 count = 1;
242}
243
244void show_page(unsigned long offset, uint64_t flags)
245{
246 printf("%lu\t%s\n", offset, page_flag_name(flags));
247}
248
249void show_summary(void)
250{
251 int i;
252
253 printf(" flags\tpage-count MB"
254 " symbolic-flags\t\t\tlong-symbolic-flags\n");
255
256 for (i = 0; i < ARRAY_SIZE(nr_pages); i++) {
257 if (nr_pages[i])
258 printf("0x%016llx\t%10lu %8lu %s\t%s\n",
259 (unsigned long long)page_flags[i],
260 nr_pages[i],
261 pages2mb(nr_pages[i]),
262 page_flag_name(page_flags[i]),
263 page_flag_longname(page_flags[i]));
264 }
265
266 printf(" total\t%10lu %8lu\n",
267 total_pages, pages2mb(total_pages));
268}
269
270
271/*
272 * page flag filters
273 */
274
275int bit_mask_ok(uint64_t flags)
276{
277 int i;
278
279 for (i = 0; i < nr_bit_filters; i++) {
280 if (opt_bits[i] == KPF_ALL_BITS) {
281 if ((flags & opt_mask[i]) == 0)
282 return 0;
283 } else {
284 if ((flags & opt_mask[i]) != opt_bits[i])
285 return 0;
286 }
287 }
288
289 return 1;
290}
291
292uint64_t expand_overloaded_flags(uint64_t flags)
293{
294 /* SLOB/SLUB overload several page flags */
295 if (flags & BIT(SLAB)) {
296 if (flags & BIT(PRIVATE))
297 flags ^= BIT(PRIVATE) | BIT(SLOB_FREE);
298 if (flags & BIT(ACTIVE))
299 flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN);
300 if (flags & BIT(ERROR))
301 flags ^= BIT(ERROR) | BIT(SLUB_DEBUG);
302 }
303
304 /* PG_reclaim is overloaded as PG_readahead in the read path */
305 if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM))
306 flags ^= BIT(RECLAIM) | BIT(READAHEAD);
307
308 return flags;
309}
310
311uint64_t well_known_flags(uint64_t flags)
312{
313 /* hide flags intended only for kernel hacker */
314 flags &= ~KPF_HACKERS_BITS;
315
316 /* hide non-hugeTLB compound pages */
317 if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE)))
318 flags &= ~BITS_COMPOUND;
319
320 return flags;
321}
322
323
324/*
325 * page frame walker
326 */
327
328int hash_slot(uint64_t flags)
329{
330 int k = HASH_KEY(flags);
331 int i;
332
333 /* Explicitly reserve slot 0 for flags 0: the following logic
334 * cannot distinguish an unoccupied slot from slot (flags==0).
335 */
336 if (flags == 0)
337 return 0;
338
339 /* search through the remaining (HASH_SIZE-1) slots */
340 for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) {
341 if (!k || k >= ARRAY_SIZE(page_flags))
342 k = 1;
343 if (page_flags[k] == 0) {
344 page_flags[k] = flags;
345 return k;
346 }
347 if (page_flags[k] == flags)
348 return k;
349 }
350
351 fatal("hash table full: bump up HASH_SHIFT?\n");
352 exit(EXIT_FAILURE);
353}
354
355void add_page(unsigned long offset, uint64_t flags)
356{
357 flags = expand_overloaded_flags(flags);
358
359 if (!opt_raw)
360 flags = well_known_flags(flags);
361
362 if (!bit_mask_ok(flags))
363 return;
364
365 if (opt_list == 1)
366 show_page_range(offset, flags);
367 else if (opt_list == 2)
368 show_page(offset, flags);
369
370 nr_pages[hash_slot(flags)]++;
371 total_pages++;
372}
373
374void walk_pfn(unsigned long index, unsigned long count)
375{
376 unsigned long batch;
377 unsigned long n;
378 unsigned long i;
379
380 if (index > ULONG_MAX / KPF_BYTES)
381 fatal("index overflow: %lu\n", index);
382
383 lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET);
384
385 while (count) {
386 batch = min_t(unsigned long, count, PAGES_BATCH);
387 n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES);
388 if (n == 0)
389 break;
390 if (n < 0) {
391 perror(PROC_KPAGEFLAGS);
392 exit(EXIT_FAILURE);
393 }
394
395 if (n % KPF_BYTES != 0)
396 fatal("partial read: %lu bytes\n", n);
397 n = n / KPF_BYTES;
398
399 for (i = 0; i < n; i++)
400 add_page(index + i, kpageflags_buf[i]);
401
402 index += batch;
403 count -= batch;
404 }
405}
406
407void walk_addr_ranges(void)
408{
409 int i;
410
411 kpageflags_fd = open(PROC_KPAGEFLAGS, O_RDONLY);
412 if (kpageflags_fd < 0) {
413 perror(PROC_KPAGEFLAGS);
414 exit(EXIT_FAILURE);
415 }
416
417 if (!nr_addr_ranges)
418 walk_pfn(0, ULONG_MAX);
419
420 for (i = 0; i < nr_addr_ranges; i++)
421 walk_pfn(opt_offset[i], opt_size[i]);
422
423 close(kpageflags_fd);
424}
425
426
427/*
428 * user interface
429 */
430
431const char *page_flag_type(uint64_t flag)
432{
433 if (flag & KPF_HACKERS_BITS)
434 return "(r)";
435 if (flag & KPF_OVERLOADED_BITS)
436 return "(o)";
437 return " ";
438}
439
440void usage(void)
441{
442 int i, j;
443
444 printf(
445"page-types [options]\n"
446" -r|--raw Raw mode, for kernel developers\n"
447" -a|--addr addr-spec Walk a range of pages\n"
448" -b|--bits bits-spec Walk pages with specified bits\n"
449#if 0 /* planned features */
450" -p|--pid pid Walk process address space\n"
451" -f|--file filename Walk file address space\n"
452#endif
453" -l|--list Show page details in ranges\n"
454" -L|--list-each Show page details one by one\n"
455" -N|--no-summary Don't show summay info\n"
456" -h|--help Show this usage message\n"
457"addr-spec:\n"
458" N one page at offset N (unit: pages)\n"
459" N+M pages range from N to N+M-1\n"
460" N,M pages range from N to M-1\n"
461" N, pages range from N to end\n"
462" ,M pages range from 0 to M\n"
463"bits-spec:\n"
464" bit1,bit2 (flags & (bit1|bit2)) != 0\n"
465" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n"
466" bit1,~bit2 (flags & (bit1|bit2)) == bit1\n"
467" =bit1,bit2 flags == (bit1|bit2)\n"
468"bit-names:\n"
469 );
470
471 for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
472 if (!page_flag_names[i])
473 continue;
474 printf("%16s%s", page_flag_names[i] + 2,
475 page_flag_type(1ULL << i));
476 if (++j > 3) {
477 j = 0;
478 putchar('\n');
479 }
480 }
481 printf("\n "
482 "(r) raw mode bits (o) overloaded bits\n");
483}
484
485unsigned long long parse_number(const char *str)
486{
487 unsigned long long n;
488
489 n = strtoll(str, NULL, 0);
490
491 if (n == 0 && str[0] != '0')
492 fatal("invalid name or number: %s\n", str);
493
494 return n;
495}
496
497void parse_pid(const char *str)
498{
499 opt_pid = parse_number(str);
500}
501
502void parse_file(const char *name)
503{
504}
505
506void add_addr_range(unsigned long offset, unsigned long size)
507{
508 if (nr_addr_ranges >= MAX_ADDR_RANGES)
509 fatal("too much addr ranges\n");
510
511 opt_offset[nr_addr_ranges] = offset;
512 opt_size[nr_addr_ranges] = size;
513 nr_addr_ranges++;
514}
515
516void parse_addr_range(const char *optarg)
517{
518 unsigned long offset;
519 unsigned long size;
520 char *p;
521
522 p = strchr(optarg, ',');
523 if (!p)
524 p = strchr(optarg, '+');
525
526 if (p == optarg) {
527 offset = 0;
528 size = parse_number(p + 1);
529 } else if (p) {
530 offset = parse_number(optarg);
531 if (p[1] == '\0')
532 size = ULONG_MAX;
533 else {
534 size = parse_number(p + 1);
535 if (*p == ',') {
536 if (size < offset)
537 fatal("invalid range: %lu,%lu\n",
538 offset, size);
539 size -= offset;
540 }
541 }
542 } else {
543 offset = parse_number(optarg);
544 size = 1;
545 }
546
547 add_addr_range(offset, size);
548}
549
550void add_bits_filter(uint64_t mask, uint64_t bits)
551{
552 if (nr_bit_filters >= MAX_BIT_FILTERS)
553 fatal("too much bit filters\n");
554
555 opt_mask[nr_bit_filters] = mask;
556 opt_bits[nr_bit_filters] = bits;
557 nr_bit_filters++;
558}
559
560uint64_t parse_flag_name(const char *str, int len)
561{
562 int i;
563
564 if (!*str || !len)
565 return 0;
566
567 if (len <= 8 && !strncmp(str, "compound", len))
568 return BITS_COMPOUND;
569
570 for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) {
571 if (!page_flag_names[i])
572 continue;
573 if (!strncmp(str, page_flag_names[i] + 2, len))
574 return 1ULL << i;
575 }
576
577 return parse_number(str);
578}
579
580uint64_t parse_flag_names(const char *str, int all)
581{
582 const char *p = str;
583 uint64_t flags = 0;
584
585 while (1) {
586 if (*p == ',' || *p == '=' || *p == '\0') {
587 if ((*str != '~') || (*str == '~' && all && *++str))
588 flags |= parse_flag_name(str, p - str);
589 if (*p != ',')
590 break;
591 str = p + 1;
592 }
593 p++;
594 }
595
596 return flags;
597}
598
599void parse_bits_mask(const char *optarg)
600{
601 uint64_t mask;
602 uint64_t bits;
603 const char *p;
604
605 p = strchr(optarg, '=');
606 if (p == optarg) {
607 mask = KPF_ALL_BITS;
608 bits = parse_flag_names(p + 1, 0);
609 } else if (p) {
610 mask = parse_flag_names(optarg, 0);
611 bits = parse_flag_names(p + 1, 0);
612 } else if (strchr(optarg, '~')) {
613 mask = parse_flag_names(optarg, 1);
614 bits = parse_flag_names(optarg, 0);
615 } else {
616 mask = parse_flag_names(optarg, 0);
617 bits = KPF_ALL_BITS;
618 }
619
620 add_bits_filter(mask, bits);
621}
622
623
624struct option opts[] = {
625 { "raw" , 0, NULL, 'r' },
626 { "pid" , 1, NULL, 'p' },
627 { "file" , 1, NULL, 'f' },
628 { "addr" , 1, NULL, 'a' },
629 { "bits" , 1, NULL, 'b' },
630 { "list" , 0, NULL, 'l' },
631 { "list-each" , 0, NULL, 'L' },
632 { "no-summary", 0, NULL, 'N' },
633 { "help" , 0, NULL, 'h' },
634 { NULL , 0, NULL, 0 }
635};
636
637int main(int argc, char *argv[])
638{
639 int c;
640
641 page_size = getpagesize();
642
643 while ((c = getopt_long(argc, argv,
644 "rp:f:a:b:lLNh", opts, NULL)) != -1) {
645 switch (c) {
646 case 'r':
647 opt_raw = 1;
648 break;
649 case 'p':
650 parse_pid(optarg);
651 break;
652 case 'f':
653 parse_file(optarg);
654 break;
655 case 'a':
656 parse_addr_range(optarg);
657 break;
658 case 'b':
659 parse_bits_mask(optarg);
660 break;
661 case 'l':
662 opt_list = 1;
663 break;
664 case 'L':
665 opt_list = 2;
666 break;
667 case 'N':
668 opt_no_summary = 1;
669 break;
670 case 'h':
671 usage();
672 exit(0);
673 default:
674 usage();
675 exit(1);
676 }
677 }
678
679 if (opt_list == 1)
680 printf("offset\tcount\tflags\n");
681 if (opt_list == 2)
682 printf("offset\tflags\n");
683
684 walk_addr_ranges();
685
686 if (opt_list == 1)
687 show_page_range(0, 0); /* drain the buffer */
688
689 if (opt_no_summary)
690 return 0;
691
692 if (opt_list)
693 printf("\n\n");
694
695 show_summary();
696
697 return 0;
698}
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index ce72c0fe6177..600a304a828c 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -12,9 +12,9 @@ There are three components to pagemap:
12 value for each virtual page, containing the following data (from 12 value for each virtual page, containing the following data (from
13 fs/proc/task_mmu.c, above pagemap_read): 13 fs/proc/task_mmu.c, above pagemap_read):
14 14
15 * Bits 0-55 page frame number (PFN) if present 15 * Bits 0-54 page frame number (PFN) if present
16 * Bits 0-4 swap type if swapped 16 * Bits 0-4 swap type if swapped
17 * Bits 5-55 swap offset if swapped 17 * Bits 5-54 swap offset if swapped
18 * Bits 55-60 page shift (page size = 1<<page shift) 18 * Bits 55-60 page shift (page size = 1<<page shift)
19 * Bit 61 reserved for future use 19 * Bit 61 reserved for future use
20 * Bit 62 page swapped 20 * Bit 62 page swapped
@@ -36,7 +36,7 @@ There are three components to pagemap:
36 * /proc/kpageflags. This file contains a 64-bit set of flags for each 36 * /proc/kpageflags. This file contains a 64-bit set of flags for each
37 page, indexed by PFN. 37 page, indexed by PFN.
38 38
39 The flags are (from fs/proc/proc_misc, above kpageflags_read): 39 The flags are (from fs/proc/page.c, above kpageflags_read):
40 40
41 0. LOCKED 41 0. LOCKED
42 1. ERROR 42 1. ERROR
@@ -49,6 +49,68 @@ There are three components to pagemap:
49 8. WRITEBACK 49 8. WRITEBACK
50 9. RECLAIM 50 9. RECLAIM
51 10. BUDDY 51 10. BUDDY
52 11. MMAP
53 12. ANON
54 13. SWAPCACHE
55 14. SWAPBACKED
56 15. COMPOUND_HEAD
57 16. COMPOUND_TAIL
58 16. HUGE
59 18. UNEVICTABLE
60 20. NOPAGE
61
62Short descriptions to the page flags:
63
64 0. LOCKED
65 page is being locked for exclusive access, eg. by undergoing read/write IO
66
67 7. SLAB
68 page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator
69 When compound page is used, SLUB/SLQB will only set this flag on the head
70 page; SLOB will not flag it at all.
71
7210. BUDDY
73 a free memory block managed by the buddy system allocator
74 The buddy system organizes free memory in blocks of various orders.
75 An order N block has 2^N physically contiguous pages, with the BUDDY flag
76 set for and _only_ for the first page.
77
7815. COMPOUND_HEAD
7916. COMPOUND_TAIL
80 A compound page with order N consists of 2^N physically contiguous pages.
81 A compound page with order 2 takes the form of "HTTT", where H donates its
82 head page and T donates its tail page(s). The major consumers of compound
83 pages are hugeTLB pages (Documentation/vm/hugetlbpage.txt), the SLUB etc.
84 memory allocators and various device drivers. However in this interface,
85 only huge/giga pages are made visible to end users.
8617. HUGE
87 this is an integral part of a HugeTLB page
88
8920. NOPAGE
90 no page frame exists at the requested address
91
92 [IO related page flags]
93 1. ERROR IO error occurred
94 3. UPTODATE page has up-to-date data
95 ie. for file backed page: (in-memory data revision >= on-disk one)
96 4. DIRTY page has been written to, hence contains new data
97 ie. for file backed page: (in-memory data revision > on-disk one)
98 8. WRITEBACK page is being synced to disk
99
100 [LRU related page flags]
101 5. LRU page is in one of the LRU lists
102 6. ACTIVE page is in the active LRU list
10318. UNEVICTABLE page is in the unevictable (non-)LRU list
104 It is somehow pinned and not a candidate for LRU page reclaims,
105 eg. ramfs pages, shmctl(SHM_LOCK) and mlock() memory segments
106 2. REFERENCED page has been referenced since last LRU list enqueue/requeue
107 9. RECLAIM page will be reclaimed soon after its pageout IO completed
10811. MMAP a memory mapped page
10912. ANON a memory mapped page that is not part of a file
11013. SWAPCACHE page is mapped to swap space, ie. has an associated swap entry
11114. SWAPBACKED page is backed by swap/RAM
112
113The page-types tool in this directory can be used to query the above flags.
52 114
53Using pagemap to do something useful: 115Using pagemap to do something useful:
54 116
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
index bb1f5c6e28b3..510917ff59ed 100644
--- a/Documentation/vm/slub.txt
+++ b/Documentation/vm/slub.txt
@@ -41,6 +41,8 @@ Possible debug options are
41 P Poisoning (object and padding) 41 P Poisoning (object and padding)
42 U User tracking (free and alloc) 42 U User tracking (free and alloc)
43 T Trace (please only use on single slabs) 43 T Trace (please only use on single slabs)
44 O Switch debugging off for caches that would have
45 caused higher minimum slab orders
44 - Switch all debugging off (useful if the kernel is 46 - Switch all debugging off (useful if the kernel is
45 configured with CONFIG_SLUB_DEBUG_ON) 47 configured with CONFIG_SLUB_DEBUG_ON)
46 48
@@ -59,6 +61,14 @@ to the dentry cache with
59 61
60 slub_debug=F,dentry 62 slub_debug=F,dentry
61 63
64Debugging options may require the minimum possible slab order to increase as
65a result of storing the metadata (for example, caches with PAGE_SIZE object
66sizes). This has a higher liklihood of resulting in slab allocation errors
67in low memory situations or if there's high fragmentation of memory. To
68switch off debugging for such caches by default, use
69
70 slub_debug=O
71
62In case you forgot to enable debugging on the kernel command line: It is 72In case you forgot to enable debugging on the kernel command line: It is
63possible to enable debugging manually when the kernel is up. Look at the 73possible to enable debugging manually when the kernel is up. Look at the
64contents of: 74contents of:
diff --git a/Documentation/watchdog/hpwdt.txt b/Documentation/watchdog/hpwdt.txt
new file mode 100644
index 000000000000..9c24d5ffbb06
--- /dev/null
+++ b/Documentation/watchdog/hpwdt.txt
@@ -0,0 +1,95 @@
1Last reviewed: 06/02/2009
2
3 HP iLO2 NMI Watchdog Driver
4 NMI sourcing for iLO2 based ProLiant Servers
5 Documentation and Driver by
6 Thomas Mingarelli <thomas.mingarelli@hp.com>
7
8 The HP iLO2 NMI Watchdog driver is a kernel module that provides basic
9 watchdog functionality and the added benefit of NMI sourcing. Both the
10 watchdog functionality and the NMI sourcing capability need to be enabled
11 by the user. Remember that the two modes are not dependant on one another.
12 A user can have the NMI sourcing without the watchdog timer and vice-versa.
13
14 Watchdog functionality is enabled like any other common watchdog driver. That
15 is, an application needs to be started that kicks off the watchdog timer. A
16 basic application exists in the Documentation/watchdog/src directory called
17 watchdog-test.c. Simply compile the C file and kick it off. If the system
18 gets into a bad state and hangs, the HP ProLiant iLO 2 timer register will
19 not be updated in a timely fashion and a hardware system reset (also known as
20 an Automatic Server Recovery (ASR)) event will occur.
21
22 The hpwdt driver also has four (4) module parameters. They are the following:
23
24 soft_margin - allows the user to set the watchdog timer value
25 allow_kdump - allows the user to save off a kernel dump image after an NMI
26 nowayout - basic watchdog parameter that does not allow the timer to
27 be restarted or an impending ASR to be escaped.
28 priority - determines whether or not the hpwdt driver is first on the
29 die_notify list to handle NMIs or last. The default value
30 for this module parameter is 0 or LAST. If the user wants to
31 enable NMI sourcing then reload the hpwdt driver with
32 priority=1 (and boot with nmi_watchdog=0).
33
34 NOTE: More information about watchdog drivers in general, including the ioctl
35 interface to /dev/watchdog can be found in
36 Documentation/watchdog/watchdog-api.txt and Documentation/IPMI.txt.
37
38 The priority parameter was introduced due to other kernel software that relied
39 on handling NMIs (like oprofile). Keeping hpwdt's priority at 0 (or LAST)
40 enables the users of NMIs for non critical events to be work as expected.
41
42 The NMI sourcing capability is disabled by default due to the inability to
43 distinguish between "NMI Watchdog Ticks" and "HW generated NMI events" in the
44 Linux kernel. What this means is that the hpwdt nmi handler code is called
45 each time the NMI signal fires off. This could amount to several thousands of
46 NMIs in a matter of seconds. If a user sees the Linux kernel's "dazed and
47 confused" message in the logs or if the system gets into a hung state, then
48 the hpwdt driver can be reloaded with the "priority" module parameter set
49 (priority=1).
50
51 1. If the kernel has not been booted with nmi_watchdog turned off then
52 edit /boot/grub/menu.lst and place the nmi_watchdog=0 at the end of the
53 currently booting kernel line.
54 2. reboot the sever
55 3. Once the system comes up perform a rmmod hpwdt
56 4. insmod /lib/modules/`uname -r`/kernel/drivers/char/watchdog/hpwdt.ko priority=1
57
58 Now, the hpwdt can successfully receive and source the NMI and provide a log
59 message that details the reason for the NMI (as determined by the HP BIOS).
60
61 Below is a list of NMIs the HP BIOS understands along with the associated
62 code (reason):
63
64 No source found 00h
65
66 Uncorrectable Memory Error 01h
67
68 ASR NMI 1Bh
69
70 PCI Parity Error 20h
71
72 NMI Button Press 27h
73
74 SB_BUS_NMI 28h
75
76 ILO Doorbell NMI 29h
77
78 ILO IOP NMI 2Ah
79
80 ILO Watchdog NMI 2Bh
81
82 Proc Throt NMI 2Ch
83
84 Front Side Bus NMI 2Dh
85
86 PCI Express Error 2Fh
87
88 DMA controller NMI 30h
89
90 Hypertransport/CSI Error 31h
91
92
93
94 -- Tom Mingarelli
95 (thomas.mingarelli@hp.com)
diff --git a/Documentation/x86/00-INDEX b/Documentation/x86/00-INDEX
index dbe3377754af..f37b46d34861 100644
--- a/Documentation/x86/00-INDEX
+++ b/Documentation/x86/00-INDEX
@@ -2,3 +2,5 @@
2 - this file 2 - this file
3mtrr.txt 3mtrr.txt
4 - how to use x86 Memory Type Range Registers to increase performance 4 - how to use x86 Memory Type Range Registers to increase performance
5exception-tables.txt
6 - why and how Linux kernel uses exception tables on x86
diff --git a/Documentation/exception.txt b/Documentation/x86/exception-tables.txt
index 2d5aded64247..32901aa36f0a 100644
--- a/Documentation/exception.txt
+++ b/Documentation/x86/exception-tables.txt
@@ -1,123 +1,123 @@
1 Kernel level exception handling in Linux 2.1.8 1 Kernel level exception handling in Linux
2 Commentary by Joerg Pommnitz <joerg@raleigh.ibm.com> 2 Commentary by Joerg Pommnitz <joerg@raleigh.ibm.com>
3 3
4When a process runs in kernel mode, it often has to access user 4When a process runs in kernel mode, it often has to access user
5mode memory whose address has been passed by an untrusted program. 5mode memory whose address has been passed by an untrusted program.
6To protect itself the kernel has to verify this address. 6To protect itself the kernel has to verify this address.
7 7
8In older versions of Linux this was done with the 8In older versions of Linux this was done with the
9int verify_area(int type, const void * addr, unsigned long size) 9int verify_area(int type, const void * addr, unsigned long size)
10function (which has since been replaced by access_ok()). 10function (which has since been replaced by access_ok()).
11 11
12This function verified that the memory area starting at address 12This function verified that the memory area starting at address
13'addr' and of size 'size' was accessible for the operation specified 13'addr' and of size 'size' was accessible for the operation specified
14in type (read or write). To do this, verify_read had to look up the 14in type (read or write). To do this, verify_read had to look up the
15virtual memory area (vma) that contained the address addr. In the 15virtual memory area (vma) that contained the address addr. In the
16normal case (correctly working program), this test was successful. 16normal case (correctly working program), this test was successful.
17It only failed for a few buggy programs. In some kernel profiling 17It only failed for a few buggy programs. In some kernel profiling
18tests, this normally unneeded verification used up a considerable 18tests, this normally unneeded verification used up a considerable
19amount of time. 19amount of time.
20 20
21To overcome this situation, Linus decided to let the virtual memory 21To overcome this situation, Linus decided to let the virtual memory
22hardware present in every Linux-capable CPU handle this test. 22hardware present in every Linux-capable CPU handle this test.
23 23
24How does this work? 24How does this work?
25 25
26Whenever the kernel tries to access an address that is currently not 26Whenever the kernel tries to access an address that is currently not
27accessible, the CPU generates a page fault exception and calls the 27accessible, the CPU generates a page fault exception and calls the
28page fault handler 28page fault handler
29 29
30void do_page_fault(struct pt_regs *regs, unsigned long error_code) 30void do_page_fault(struct pt_regs *regs, unsigned long error_code)
31 31
32in arch/i386/mm/fault.c. The parameters on the stack are set up by 32in arch/x86/mm/fault.c. The parameters on the stack are set up by
33the low level assembly glue in arch/i386/kernel/entry.S. The parameter 33the low level assembly glue in arch/x86/kernel/entry_32.S. The parameter
34regs is a pointer to the saved registers on the stack, error_code 34regs is a pointer to the saved registers on the stack, error_code
35contains a reason code for the exception. 35contains a reason code for the exception.
36 36
37do_page_fault first obtains the unaccessible address from the CPU 37do_page_fault first obtains the unaccessible address from the CPU
38control register CR2. If the address is within the virtual address 38control register CR2. If the address is within the virtual address
39space of the process, the fault probably occurred, because the page 39space of the process, the fault probably occurred, because the page
40was not swapped in, write protected or something similar. However, 40was not swapped in, write protected or something similar. However,
41we are interested in the other case: the address is not valid, there 41we are interested in the other case: the address is not valid, there
42is no vma that contains this address. In this case, the kernel jumps 42is no vma that contains this address. In this case, the kernel jumps
43to the bad_area label. 43to the bad_area label.
44 44
45There it uses the address of the instruction that caused the exception 45There it uses the address of the instruction that caused the exception
46(i.e. regs->eip) to find an address where the execution can continue 46(i.e. regs->eip) to find an address where the execution can continue
47(fixup). If this search is successful, the fault handler modifies the 47(fixup). If this search is successful, the fault handler modifies the
48return address (again regs->eip) and returns. The execution will 48return address (again regs->eip) and returns. The execution will
49continue at the address in fixup. 49continue at the address in fixup.
50 50
51Where does fixup point to? 51Where does fixup point to?
52 52
53Since we jump to the contents of fixup, fixup obviously points 53Since we jump to the contents of fixup, fixup obviously points
54to executable code. This code is hidden inside the user access macros. 54to executable code. This code is hidden inside the user access macros.
55I have picked the get_user macro defined in include/asm/uaccess.h as an 55I have picked the get_user macro defined in arch/x86/include/asm/uaccess.h
56example. The definition is somewhat hard to follow, so let's peek at 56as an example. The definition is somewhat hard to follow, so let's peek at
57the code generated by the preprocessor and the compiler. I selected 57the code generated by the preprocessor and the compiler. I selected
58the get_user call in drivers/char/console.c for a detailed examination. 58the get_user call in drivers/char/sysrq.c for a detailed examination.
59 59
60The original code in console.c line 1405: 60The original code in sysrq.c line 587:
61 get_user(c, buf); 61 get_user(c, buf);
62 62
63The preprocessor output (edited to become somewhat readable): 63The preprocessor output (edited to become somewhat readable):
64 64
65( 65(
66 { 66 {
67 long __gu_err = - 14 , __gu_val = 0; 67 long __gu_err = - 14 , __gu_val = 0;
68 const __typeof__(*( ( buf ) )) *__gu_addr = ((buf)); 68 const __typeof__(*( ( buf ) )) *__gu_addr = ((buf));
69 if (((((0 + current_set[0])->tss.segment) == 0x18 ) || 69 if (((((0 + current_set[0])->tss.segment) == 0x18 ) ||
70 (((sizeof(*(buf))) <= 0xC0000000UL) && 70 (((sizeof(*(buf))) <= 0xC0000000UL) &&
71 ((unsigned long)(__gu_addr ) <= 0xC0000000UL - (sizeof(*(buf))))))) 71 ((unsigned long)(__gu_addr ) <= 0xC0000000UL - (sizeof(*(buf)))))))
72 do { 72 do {
73 __gu_err = 0; 73 __gu_err = 0;
74 switch ((sizeof(*(buf)))) { 74 switch ((sizeof(*(buf)))) {
75 case 1: 75 case 1:
76 __asm__ __volatile__( 76 __asm__ __volatile__(
77 "1: mov" "b" " %2,%" "b" "1\n" 77 "1: mov" "b" " %2,%" "b" "1\n"
78 "2:\n" 78 "2:\n"
79 ".section .fixup,\"ax\"\n" 79 ".section .fixup,\"ax\"\n"
80 "3: movl %3,%0\n" 80 "3: movl %3,%0\n"
81 " xor" "b" " %" "b" "1,%" "b" "1\n" 81 " xor" "b" " %" "b" "1,%" "b" "1\n"
82 " jmp 2b\n" 82 " jmp 2b\n"
83 ".section __ex_table,\"a\"\n" 83 ".section __ex_table,\"a\"\n"
84 " .align 4\n" 84 " .align 4\n"
85 " .long 1b,3b\n" 85 " .long 1b,3b\n"
86 ".text" : "=r"(__gu_err), "=q" (__gu_val): "m"((*(struct __large_struct *) 86 ".text" : "=r"(__gu_err), "=q" (__gu_val): "m"((*(struct __large_struct *)
87 ( __gu_addr )) ), "i"(- 14 ), "0"( __gu_err )) ; 87 ( __gu_addr )) ), "i"(- 14 ), "0"( __gu_err )) ;
88 break; 88 break;
89 case 2: 89 case 2:
90 __asm__ __volatile__( 90 __asm__ __volatile__(
91 "1: mov" "w" " %2,%" "w" "1\n" 91 "1: mov" "w" " %2,%" "w" "1\n"
92 "2:\n" 92 "2:\n"
93 ".section .fixup,\"ax\"\n" 93 ".section .fixup,\"ax\"\n"
94 "3: movl %3,%0\n" 94 "3: movl %3,%0\n"
95 " xor" "w" " %" "w" "1,%" "w" "1\n" 95 " xor" "w" " %" "w" "1,%" "w" "1\n"
96 " jmp 2b\n" 96 " jmp 2b\n"
97 ".section __ex_table,\"a\"\n" 97 ".section __ex_table,\"a\"\n"
98 " .align 4\n" 98 " .align 4\n"
99 " .long 1b,3b\n" 99 " .long 1b,3b\n"
100 ".text" : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *) 100 ".text" : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *)
101 ( __gu_addr )) ), "i"(- 14 ), "0"( __gu_err )); 101 ( __gu_addr )) ), "i"(- 14 ), "0"( __gu_err ));
102 break; 102 break;
103 case 4: 103 case 4:
104 __asm__ __volatile__( 104 __asm__ __volatile__(
105 "1: mov" "l" " %2,%" "" "1\n" 105 "1: mov" "l" " %2,%" "" "1\n"
106 "2:\n" 106 "2:\n"
107 ".section .fixup,\"ax\"\n" 107 ".section .fixup,\"ax\"\n"
108 "3: movl %3,%0\n" 108 "3: movl %3,%0\n"
109 " xor" "l" " %" "" "1,%" "" "1\n" 109 " xor" "l" " %" "" "1,%" "" "1\n"
110 " jmp 2b\n" 110 " jmp 2b\n"
111 ".section __ex_table,\"a\"\n" 111 ".section __ex_table,\"a\"\n"
112 " .align 4\n" " .long 1b,3b\n" 112 " .align 4\n" " .long 1b,3b\n"
113 ".text" : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *) 113 ".text" : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *)
114 ( __gu_addr )) ), "i"(- 14 ), "0"(__gu_err)); 114 ( __gu_addr )) ), "i"(- 14 ), "0"(__gu_err));
115 break; 115 break;
116 default: 116 default:
117 (__gu_val) = __get_user_bad(); 117 (__gu_val) = __get_user_bad();
118 } 118 }
119 } while (0) ; 119 } while (0) ;
120 ((c)) = (__typeof__(*((buf))))__gu_val; 120 ((c)) = (__typeof__(*((buf))))__gu_val;
121 __gu_err; 121 __gu_err;
122 } 122 }
123); 123);
@@ -127,12 +127,12 @@ see what code gcc generates:
127 127
128 > xorl %edx,%edx 128 > xorl %edx,%edx
129 > movl current_set,%eax 129 > movl current_set,%eax
130 > cmpl $24,788(%eax) 130 > cmpl $24,788(%eax)
131 > je .L1424 131 > je .L1424
132 > cmpl $-1073741825,64(%esp) 132 > cmpl $-1073741825,64(%esp)
133 > ja .L1423 133 > ja .L1423
134 > .L1424: 134 > .L1424:
135 > movl %edx,%eax 135 > movl %edx,%eax
136 > movl 64(%esp),%ebx 136 > movl 64(%esp),%ebx
137 > #APP 137 > #APP
138 > 1: movb (%ebx),%dl /* this is the actual user access */ 138 > 1: movb (%ebx),%dl /* this is the actual user access */
@@ -149,17 +149,17 @@ see what code gcc generates:
149 > .L1423: 149 > .L1423:
150 > movzbl %dl,%esi 150 > movzbl %dl,%esi
151 151
152The optimizer does a good job and gives us something we can actually 152The optimizer does a good job and gives us something we can actually
153understand. Can we? The actual user access is quite obvious. Thanks 153understand. Can we? The actual user access is quite obvious. Thanks
154to the unified address space we can just access the address in user 154to the unified address space we can just access the address in user
155memory. But what does the .section stuff do????? 155memory. But what does the .section stuff do?????
156 156
157To understand this we have to look at the final kernel: 157To understand this we have to look at the final kernel:
158 158
159 > objdump --section-headers vmlinux 159 > objdump --section-headers vmlinux
160 > 160 >
161 > vmlinux: file format elf32-i386 161 > vmlinux: file format elf32-i386
162 > 162 >
163 > Sections: 163 > Sections:
164 > Idx Name Size VMA LMA File off Algn 164 > Idx Name Size VMA LMA File off Algn
165 > 0 .text 00098f40 c0100000 c0100000 00001000 2**4 165 > 0 .text 00098f40 c0100000 c0100000 00001000 2**4
@@ -198,18 +198,18 @@ final kernel executable:
198 198
199The whole user memory access is reduced to 10 x86 machine instructions. 199The whole user memory access is reduced to 10 x86 machine instructions.
200The instructions bracketed in the .section directives are no longer 200The instructions bracketed in the .section directives are no longer
201in the normal execution path. They are located in a different section 201in the normal execution path. They are located in a different section
202of the executable file: 202of the executable file:
203 203
204 > objdump --disassemble --section=.fixup vmlinux 204 > objdump --disassemble --section=.fixup vmlinux
205 > 205 >
206 > c0199ff5 <.fixup+10b5> movl $0xfffffff2,%eax 206 > c0199ff5 <.fixup+10b5> movl $0xfffffff2,%eax
207 > c0199ffa <.fixup+10ba> xorb %dl,%dl 207 > c0199ffa <.fixup+10ba> xorb %dl,%dl
208 > c0199ffc <.fixup+10bc> jmp c017e7a7 <do_con_write+e3> 208 > c0199ffc <.fixup+10bc> jmp c017e7a7 <do_con_write+e3>
209 209
210And finally: 210And finally:
211 > objdump --full-contents --section=__ex_table vmlinux 211 > objdump --full-contents --section=__ex_table vmlinux
212 > 212 >
213 > c01aa7c4 93c017c0 e09f19c0 97c017c0 99c017c0 ................ 213 > c01aa7c4 93c017c0 e09f19c0 97c017c0 99c017c0 ................
214 > c01aa7d4 f6c217c0 e99f19c0 a5e717c0 f59f19c0 ................ 214 > c01aa7d4 f6c217c0 e99f19c0 a5e717c0 f59f19c0 ................
215 > c01aa7e4 080a18c0 01a019c0 0a0a18c0 04a019c0 ................ 215 > c01aa7e4 080a18c0 01a019c0 0a0a18c0 04a019c0 ................
@@ -235,8 +235,8 @@ sections in the ELF object file. So the instructions
235ended up in the .fixup section of the object file and the addresses 235ended up in the .fixup section of the object file and the addresses
236 .long 1b,3b 236 .long 1b,3b
237ended up in the __ex_table section of the object file. 1b and 3b 237ended up in the __ex_table section of the object file. 1b and 3b
238are local labels. The local label 1b (1b stands for next label 1 238are local labels. The local label 1b (1b stands for next label 1
239backward) is the address of the instruction that might fault, i.e. 239backward) is the address of the instruction that might fault, i.e.
240in our case the address of the label 1 is c017e7a5: 240in our case the address of the label 1 is c017e7a5:
241the original assembly code: > 1: movb (%ebx),%dl 241the original assembly code: > 1: movb (%ebx),%dl
242and linked in vmlinux : > c017e7a5 <do_con_write+e1> movb (%ebx),%dl 242and linked in vmlinux : > c017e7a5 <do_con_write+e1> movb (%ebx),%dl
@@ -254,7 +254,7 @@ The assembly code
254becomes the value pair 254becomes the value pair
255 > c01aa7d4 c017c2f6 c0199fe9 c017e7a5 c0199ff5 ................ 255 > c01aa7d4 c017c2f6 c0199fe9 c017e7a5 c0199ff5 ................
256 ^this is ^this is 256 ^this is ^this is
257 1b 3b 257 1b 3b
258c017e7a5,c0199ff5 in the exception table of the kernel. 258c017e7a5,c0199ff5 in the exception table of the kernel.
259 259
260So, what actually happens if a fault from kernel mode with no suitable 260So, what actually happens if a fault from kernel mode with no suitable
@@ -266,9 +266,9 @@ vma occurs?
2663.) CPU calls do_page_fault 2663.) CPU calls do_page_fault
2674.) do page fault calls search_exception_table (regs->eip == c017e7a5); 2674.) do page fault calls search_exception_table (regs->eip == c017e7a5);
2685.) search_exception_table looks up the address c017e7a5 in the 2685.) search_exception_table looks up the address c017e7a5 in the
269 exception table (i.e. the contents of the ELF section __ex_table) 269 exception table (i.e. the contents of the ELF section __ex_table)
270 and returns the address of the associated fault handle code c0199ff5. 270 and returns the address of the associated fault handle code c0199ff5.
2716.) do_page_fault modifies its own return address to point to the fault 2716.) do_page_fault modifies its own return address to point to the fault
272 handle code and returns. 272 handle code and returns.
2737.) execution continues in the fault handling code. 2737.) execution continues in the fault handling code.
2748.) 8a) EAX becomes -EFAULT (== -14) 2748.) 8a) EAX becomes -EFAULT (== -14)
diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt
index 4f913857b8a2..feb37e177010 100644
--- a/Documentation/x86/zero-page.txt
+++ b/Documentation/x86/zero-page.txt
@@ -12,6 +12,7 @@ Offset Proto Name Meaning
12000/040 ALL screen_info Text mode or frame buffer information 12000/040 ALL screen_info Text mode or frame buffer information
13 (struct screen_info) 13 (struct screen_info)
14040/014 ALL apm_bios_info APM BIOS information (struct apm_bios_info) 14040/014 ALL apm_bios_info APM BIOS information (struct apm_bios_info)
15058/008 ALL tboot_addr Physical address of tboot shared page
15060/010 ALL ist_info Intel SpeedStep (IST) BIOS support information 16060/010 ALL ist_info Intel SpeedStep (IST) BIOS support information
16 (struct ist_info) 17 (struct ist_info)
17080/010 ALL hd0_info hd0 disk parameter, OBSOLETE!! 18080/010 ALL hd0_info hd0 disk parameter, OBSOLETE!!